## Data Cleaning Strategies Comparison

In [1]:
import re
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

# Download required NLTK data
nltk.download(['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'])

class PreprocessingExperiments:
    """Side-by-side experiments with text-cleaning choices on a DataFrame.

    The text column is auto-detected: the first column whose name contains
    ``"text"`` or ``"message"`` (case-insensitive) is used for every
    experiment.
    """

    # Compiled once at class level because the cleaners run per document.
    _WHITESPACE_RE = re.compile(r'\s+')
    _SPECIAL_CHARS_RE = re.compile(r'[^a-zA-Z0-9\s.,!?]')
    _DIGITS_RE = re.compile(r'\d+')
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')

    def __init__(self, df):
        """Store ``df`` and set up the stopword list, stemmer and lemmatizer.

        Args:
            df: DataFrame holding the raw texts.

        Raises:
            IndexError: if no column name contains "text" or "message".
        """
        self.df = df
        # str(col) keeps detection working for non-string column labels.
        text_columns = [
            col for col in df.columns
            if any(key in str(col).lower() for key in ('text', 'message'))
        ]
        if not text_columns:
            # IndexError is kept (the original `[0]` lookup raised it) so
            # existing handlers still work, but now it explains itself.
            raise IndexError(
                "No text column found: expected a column whose name "
                "contains 'text' or 'message'"
            )
        self.text_col = text_columns[0]
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

    def experiment_cleaning_levels(self):
        """Print and return a comparison of the three cleaning intensities.

        Up to five random non-null texts are cleaned with each strategy.

        Returns:
            dict mapping strategy name to the list of cleaned sample texts.
        """
        texts = self.df[self.text_col].dropna()
        # Cap the sample size so small frames do not make `sample` raise.
        sample_texts = texts.sample(min(5, len(texts))).tolist()

        cleaning_strategies = {
            'light_clean': self.light_cleaning,
            'medium_clean': self.medium_cleaning,
            'heavy_clean': self.heavy_cleaning,
        }

        results = {}
        for strategy_name, strategy_func in cleaning_strategies.items():
            cleaned_texts = [strategy_func(text) for text in sample_texts]
            results[strategy_name] = cleaned_texts

            print(f"\n=== {strategy_name.upper()} ===")
            for orig, clean in zip(sample_texts, cleaned_texts):
                # str() because the column may hold non-string values.
                print(f"Original: {str(orig)[:100]}...")
                print(f"Cleaned:  {clean[:100]}...")
                print("-" * 50)

        return results

    def light_cleaning(self, text):
        """Basic cleaning that keeps letters, digits and ``. , ! ?``.

        Args:
            text: any value; it is converted with ``str()`` first.

        Returns:
            The cleaned string with single spaces and no leading/trailing
            whitespace.
        """
        text = str(text)
        # Strip disallowed characters first, then collapse whitespace, so a
        # removed symbol between two spaces does not leave a double space.
        text = self._SPECIAL_CHARS_RE.sub('', text)
        text = self._WHITESPACE_RE.sub(' ', text)
        return text.strip()

    def medium_cleaning(self, text):
        """Light cleaning plus lower-casing and digit removal.

        Args:
            text: any value; it is converted with ``str()`` first.

        Returns:
            The cleaned, lower-cased string without digits.
        """
        text = self.light_cleaning(text).lower()
        text = self._DIGITS_RE.sub('', text)
        # Removing digits can leave gaps or a trailing space; normalise again.
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def heavy_cleaning(self, text):
        """Medium cleaning plus punctuation and stopword removal.

        Args:
            text: any value; it is converted with ``str()`` first.

        Returns:
            Space-joined tokens that are not in ``self.stop_words``.
        """
        text = self._PUNCTUATION_RE.sub('', self.medium_cleaning(text))
        # NOTE(review): apostrophes are already stripped by light_cleaning, so
        # stopwords spelled with an apostrophe cannot match here - confirm
        # whether contractions should be filtered too.
        return ' '.join(
            token for token in text.split() if token not in self.stop_words
        )

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Tokenization Methods Comparison

In [2]:
def tokenization_experiment(self):
    """Print the tokens produced by three tokenizers for one sample text.

    Written in method style: ``self`` must expose ``df`` (a DataFrame) and
    ``text_col`` (the name of its text column). The first non-null value of
    that column is tokenized with NLTK's ``word_tokenize``, plain
    ``str.split`` and a ``\\b\\w+\\b`` regex; tokens, token count and unique
    token count are printed for each. Nothing is returned.
    """
    texts = self.df[self.text_col].dropna()
    if texts.empty:
        # Without this guard `iloc[0]` fails with an opaque IndexError.
        print("No non-null text available for the tokenization experiment.")
        return

    # Coerce to str, as the other experiments do, so a non-string cell does
    # not break `.split()` or the regex.
    sample_text = str(texts.iloc[0])

    tokenization_methods = {
        'word_tokenize': nltk.word_tokenize,
        'split': lambda x: x.split(),
        'regex_tokenize': lambda x: re.findall(r'\b\w+\b', x)
    }

    print("Original Text:", sample_text)
    print("\n" + "="*50)

    for method_name, method_func in tokenization_methods.items():
        tokens = method_func(sample_text)
        print(f"\n{method_name}:")
        print(f"Tokens: {tokens}")
        print(f"Token Count: {len(tokens)}")
        print(f"Unique Tokens: {len(set(tokens))}")

## Stemming vs Lemmatization Impact

In [3]:
def stemming_vs_lemmatization(self):
    """Print how stemming and lemmatization change words and vocabulary size.

    Written in method style: ``self`` must expose ``stemmer`` (with
    ``stem``), ``lemmatizer`` (with ``lemmatize``), ``df`` and ``text_col``.
    First a fixed word list is shown next to its stemmed and lemmatized
    forms; then the distinct-token counts of the first 100 non-null texts
    are printed for the raw, stemmed and lemmatized variants.
    """
    stem = self.stemmer.stem
    lemmatize = self.lemmatizer.lemmatize

    sample_words = ['running', 'ran', 'runs', 'better', 'best', 'books', 'booking']
    comparison_df = pd.DataFrame({
        'Original': sample_words,
        'Stemmed': list(map(stem, sample_words)),
        'Lemmatized': list(map(lemmatize, sample_words)),
    })

    print("Stemming vs Lemmatization Comparison:")
    print(comparison_df)

    # Impact on vocabulary size: one set of distinct tokens per variant.
    vocabularies = {'Original': set(), 'Stemmed': set(), 'Lemmatized': set()}
    for document in self.df[self.text_col].dropna().head(100):
        tokens = word_tokenize(str(document).lower())
        vocabularies['Original'].update(tokens)
        vocabularies['Stemmed'].update(map(stem, tokens))
        vocabularies['Lemmatized'].update(map(lemmatize, tokens))

    print(f"\nVocabulary Size Impact:")
    for label, vocabulary in vocabularies.items():
        print(f"{label}: {len(vocabulary)}")

## Stopword Removal Analysis

In [4]:
def stopword_analysis(self):
    """Measure how stopword removal changes word counts.

    Written in method style: ``self`` must expose ``df``, ``text_col`` and
    ``stop_words`` (a set). The first 50 non-null texts are lower-cased and
    tokenized, then filtered with no stopwords, ``self.stop_words``, and
    ``self.stop_words`` plus a few personal pronouns.

    Returns:
        DataFrame indexed by strategy name with columns ``total_words``,
        ``unique_words`` and ``avg_words_per_text`` (0.0 when there are no
        texts). The same table is printed.
    """
    sample_texts = self.df[self.text_col].dropna().head(50)

    # Custom stopwords for personality analysis.
    # NOTE(review): these pronouns appear to be in NLTK's English list
    # already, which would make the extended set equal the standard one -
    # confirm.
    personality_stopwords = {'i', 'you', 'we', 'they', 'my', 'your', 'our', 'their'}
    extended_stopwords = self.stop_words.union(personality_stopwords)

    strategies = {
        'no_removal': set(),
        'standard_stopwords': self.stop_words,
        'extended_stopwords': extended_stopwords
    }

    # Tokenize once; the token lists are identical for every strategy.
    tokenized_texts = [word_tokenize(str(text).lower()) for text in sample_texts]
    text_count = len(tokenized_texts)

    results = {}
    for strategy_name, stopword_set in strategies.items():
        # An empty stopword set filters nothing, so no special case is needed.
        processed_texts = [
            ' '.join(token for token in tokens if token not in stopword_set)
            for tokens in tokenized_texts
        ]

        total_words = sum(len(text.split()) for text in processed_texts)
        unique_words = len(set(' '.join(processed_texts).split()))

        results[strategy_name] = {
            'total_words': total_words,
            'unique_words': unique_words,
            # Guard against ZeroDivisionError when there are no texts.
            'avg_words_per_text': total_words / text_count if text_count else 0.0
        }

    results_df = pd.DataFrame(results).T
    print("Stopword Removal Impact:")
    print(results_df)

    return results_df

## Personality-Specific Preprocessing

In [5]:
def personality_specific_preprocessing(self):
    """Print, per personality label, the words no other label uses.

    Written in method style: ``self`` must expose ``df`` and ``text_col``.
    Does nothing when ``df`` has no ``'personality'`` column. Texts are
    lower-cased and split on whitespace; for every label the ten most
    frequent words that occur only in that label's texts are printed as
    ``(word, count)`` pairs. Nothing is returned.
    """
    if 'personality' not in self.df.columns:
        return

    # Missing labels never equal themselves, so they would only produce
    # empty sections; skip them.
    personalities = self.df['personality'].dropna().unique()

    personality_vocab = {}
    for personality in personalities:
        belongs = self.df['personality'] == personality
        # Drop missing texts so they do not become the literal word "nan".
        personality_texts = self.df.loc[belongs, self.text_col].dropna()
        all_words = ' '.join(personality_texts.astype(str)).lower().split()
        personality_vocab[personality] = Counter(all_words)

    # How many personalities use each word. A word is specific to one
    # personality exactly when this is 1, which avoids rebuilding the union
    # of all other vocabularies for every label.
    usage = Counter()
    for vocab in personality_vocab.values():
        usage.update(vocab.keys())

    print("Personality-Specific Vocabulary Analysis:")
    for personality, vocab in personality_vocab.items():
        # Iterating the Counter (first-seen order) with a stable sort makes
        # the order of equally frequent words deterministic.
        unique_word_freq = [
            (word, count) for word, count in vocab.items() if usage[word] == 1
        ]
        unique_word_freq.sort(key=lambda item: item[1], reverse=True)
        print(f"\n{personality} unique words (top 10):")
        print(unique_word_freq[:10])

## Visualization Experiments

In [6]:

def preprocessing_visualizations(self):
    """Show word clouds of the texts before and after medium cleaning.

    Written in method style: ``self`` must expose ``df``, ``text_col`` and
    ``medium_cleaning``. The first 100 non-null texts are joined into one
    raw and one cleaned string, and the two word clouds are drawn side by
    side. Nothing is returned.

    NOTE(review): ``plt`` is not imported in the visible part of this file -
    confirm ``matplotlib.pyplot`` is available as ``plt`` before calling.
    """
    sample_texts = self.df[self.text_col].dropna().head(100)

    # One (title, text) pair per panel, in left-to-right order.
    panels = [
        ('Original Text Word Cloud', ' '.join(sample_texts.astype(str))),
        ('Cleaned Text Word Cloud',
         ' '.join(self.medium_cleaning(text) for text in sample_texts)),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    for axis, (title, text) in zip(axes, panels):
        cloud = WordCloud(width=600, height=400, background_color='white').generate(text)
        axis.imshow(cloud, interpolation='bilinear')
        axis.set_title(title)
        axis.axis('off')

    plt.tight_layout()
    plt.show()