In [1]:

import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Model training
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Evaluation
from sklearn.metrics import (classification_report, confusion_matrix, 
                             accuracy_score, precision_recall_fscore_support)

import warnings
warnings.filterwarnings('ignore')

In [8]:
# Download required NLTK data
print("Downloading NLTK data...")
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Set random seed
np.random.seed(42)

Downloading NLTK data...


In [4]:
# ============================================================================
# STEP 1: CREATE SAMPLE TEXT DATA
# ============================================================================

print("=" * 80)
print("STEP 1: CREATING SAMPLE TEXT DATA")
print("=" * 80)

# Sample product reviews dataset
data = {
    'text': [
        "This product is absolutely amazing! Best purchase ever.",
        "Terrible quality. Waste of money. Do not buy.",
        "Good value for the price. Works as expected.",
        "Horrible experience. Product broke after one week.",
        "Love it! Exceeded my expectations. Highly recommend.",
        "Not satisfied. Poor customer service and low quality.",
        "Decent product. Nothing special but does the job.",
        "Fantastic! Worth every penny. Will buy again.",
        "Disappointing. Expected much better for this price.",
        "Excellent quality and fast shipping. Very happy!",
        "Complete garbage. Returning it immediately.",
        "Pretty good. Minor issues but overall satisfied.",
        "Outstanding! Best in its category. Five stars.",
        "Mediocre at best. Wouldn't recommend to others.",
        "Incredible product. Changed my life for the better.",
        "Worst purchase ever. Total waste of time and money.",
        "Satisfied with my purchase. Good quality product.",
        "Awful. Cheaply made and doesn't work properly.",
        "Great buy! Exactly what I needed. Thank you!",
        "Regret buying this. Save your money.",
        "Amazing quality and design. Love everything about it!",
        "Not worth it. Better alternatives available elsewhere.",
        "Perfect! No complaints whatsoever. Highly satisfied.",
        "Broken on arrival. Very disappointed with this.",
        "Superb product. Elegant design and great functionality.",
        "Barely works. Customer support was unhelpful too.",
        "Really happy with this purchase. Good investment.",
        "Defective item received. Poor quality control.",
        "Brilliant! Solves all my problems. Couldn't be happier.",
        "Absolute trash. Would give zero stars if possible.",
        "Nice product overall. Some room for improvement.",
        "Nightmare experience. Never ordering from here again.",
        "Wonderful quality. Beautifully packaged and delivered.",
        "Junk. Falls apart easily. Very frustrating.",
        "Impressive! Better than advertised. Great deal.",
        "Unacceptable quality. Demanding a full refund.",
        "Solid purchase. Reliable and durable product.",
        "Pathetic. Doesn't match the description at all.",
        "Delighted with this! Perfect for my needs.",
        "Horrible material. Feels cheap and flimsy."
    ],
    'category': [
        'positive', 'negative', 'positive', 'negative', 'positive',
        'negative', 'neutral', 'positive', 'negative', 'positive',
        'negative', 'neutral', 'positive', 'negative', 'positive',
        'negative', 'positive', 'negative', 'positive', 'negative',
        'positive', 'negative', 'positive', 'negative', 'positive',
        'negative', 'positive', 'negative', 'positive', 'negative',
        'neutral', 'negative', 'positive', 'negative', 'positive',
        'negative', 'positive', 'negative', 'positive', 'negative'
    ]
}


df = pd.DataFrame(data)

print(f"\nDataset shape: {df.shape}")
print(f"\nFirst 5 rows:")
print(df.head())
print(f"\nCategory distribution:")
print(df['category'].value_counts())
print(f"\nCategory percentages:")
print(df['category'].value_counts(normalize=True) * 100)

STEP 1: CREATING SAMPLE TEXT DATA

Dataset shape: (40, 2)

First 5 rows:
                                                text  category
0  This product is absolutely amazing! Best purch...  positive
1      Terrible quality. Waste of money. Do not buy.  negative
2       Good value for the price. Works as expected.  positive
3  Horrible experience. Product broke after one w...  negative
4  Love it! Exceeded my expectations. Highly reco...  positive

Category distribution:
category
negative    19
positive    18
neutral      3
Name: count, dtype: int64

Category percentages:
category
negative    47.5
positive    45.0
neutral      7.5
Name: proportion, dtype: float64


In [5]:
# ============================================================================
# STEP 2: EXPLORATORY DATA ANALYSIS
# ============================================================================

print("\n" + "=" * 80)
print("STEP 2: EXPLORATORY DATA ANALYSIS")
print("=" * 80)

# Text length analysis
df['text_length'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda x: len(x.split()))

print("\nText Statistics:")
print(df[['text_length', 'word_count']].describe())

print("\nAverage text length by category:")
print(df.groupby('category')['text_length'].mean())

print("\nAverage word count by category:")
print(df.groupby('category')['word_count'].mean())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Check for duplicates
print(f"\nDuplicate rows: {df.duplicated().sum()}")




STEP 2: EXPLORATORY DATA ANALYSIS

Text Statistics:
       text_length  word_count
count    40.000000   40.000000
mean     48.250000    7.025000
std       4.259228    0.831665
min      36.000000    5.000000
25%      45.750000    6.750000
50%      48.000000    7.000000
75%      51.250000    8.000000
max      55.000000    9.000000

Average text length by category:
category
negative    47.315789
neutral     48.333333
positive    49.222222
Name: text_length, dtype: float64

Average word count by category:
category
negative    6.894737
neutral     7.333333
positive    7.111111
Name: word_count, dtype: float64

Missing values:
text           0
category       0
text_length    0
word_count     0
dtype: int64

Duplicate rows: 0


In [6]:
# ============================================================================
# STEP 3: TEXT PREPROCESSING - CLEANING
# ============================================================================

print("\n" + "=" * 80)
print("STEP 3: TEXT PREPROCESSING - CLEANING")
print("=" * 80)

def clean_text(text):
    """Clean text data"""
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

# Apply cleaning
df['cleaned_text'] = df['text'].apply(clean_text)

print("\nExample of text cleaning:")
print(f"Original: {df['text'].iloc[0]}")
print(f"Cleaned:  {df['cleaned_text'].iloc[0]}")



STEP 3: TEXT PREPROCESSING - CLEANING

Example of text cleaning:
Original: This product is absolutely amazing! Best purchase ever.
Cleaned:  this product is absolutely amazing best purchase ever


In [9]:
# ============================================================================
# STEP 4: TEXT PREPROCESSING - TOKENIZATION
# ============================================================================

print("\n" + "=" * 80)
print("STEP 4: TEXT PREPROCESSING - TOKENIZATION")
print("=" * 80)

def tokenize_text(text):
    """Tokenize text into words"""
    tokens = word_tokenize(text)
    return tokens

# Apply tokenization
df['tokens'] = df['cleaned_text'].apply(tokenize_text)

print("\nExample of tokenization:")
print(f"Text: {df['cleaned_text'].iloc[0]}")
print(f"Tokens: {df['tokens'].iloc[0]}")




STEP 4: TEXT PREPROCESSING - TOKENIZATION

Example of tokenization:
Text: this product is absolutely amazing best purchase ever
Tokens: ['this', 'product', 'is', 'absolutely', 'amazing', 'best', 'purchase', 'ever']


In [10]:
# ============================================================================
# STEP 5: TEXT PREPROCESSING - REMOVE STOPWORDS
# ============================================================================

print("\n" + "=" * 80)
print("STEP 5: TEXT PREPROCESSING - REMOVE STOPWORDS")
print("=" * 80)

# Get English stopwords
stop_words = set(stopwords.words('english'))

print(f"\nNumber of stopwords: {len(stop_words)}")
print(f"Sample stopwords: {list(stop_words)[:10]}")

def remove_stopwords(tokens):
    """Remove stopwords from tokens"""
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

# Apply stopword removal
df['tokens_no_stop'] = df['tokens'].apply(remove_stopwords)

print("\nExample of stopword removal:")
print(f"Before: {df['tokens'].iloc[0]}")
print(f"After:  {df['tokens_no_stop'].iloc[0]}")



STEP 5: TEXT PREPROCESSING - REMOVE STOPWORDS

Number of stopwords: 198
Sample stopwords: ['before', 'through', 'isn', 'me', 'ma', "should've", 'their', "you'd", 'off', "shouldn't"]

Example of stopword removal:
Before: ['this', 'product', 'is', 'absolutely', 'amazing', 'best', 'purchase', 'ever']
After:  ['product', 'absolutely', 'amazing', 'best', 'purchase', 'ever']


In [11]:
# ============================================================================
# STEP 6: TEXT PREPROCESSING - LEMMATIZATION
# ============================================================================

print("\n" + "=" * 80)
print("STEP 6: TEXT PREPROCESSING - LEMMATIZATION")
print("=" * 80)

lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Lemmatize tokens to their base form"""
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized

# Apply lemmatization
df['tokens_lemmatized'] = df['tokens_no_stop'].apply(lemmatize_tokens)

print("\nExample of lemmatization:")
print(f"Before: {df['tokens_no_stop'].iloc[0]}")
print(f"After:  {df['tokens_lemmatized'].iloc[0]}")

# Alternative: Stemming
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Stem tokens"""
    stemmed = [stemmer.stem(word) for word in tokens]
    return stemmed

df['tokens_stemmed'] = df['tokens_no_stop'].apply(stem_tokens)

print("\nComparison - Lemmatization vs Stemming:")
print(f"Lemmatized: {df['tokens_lemmatized'].iloc[0]}")
print(f"Stemmed:    {df['tokens_stemmed'].iloc[0]}")



STEP 6: TEXT PREPROCESSING - LEMMATIZATION

Example of lemmatization:
Before: ['product', 'absolutely', 'amazing', 'best', 'purchase', 'ever']
After:  ['product', 'absolutely', 'amazing', 'best', 'purchase', 'ever']

Comparison - Lemmatization vs Stemming:
Lemmatized: ['product', 'absolutely', 'amazing', 'best', 'purchase', 'ever']
Stemmed:    ['product', 'absolut', 'amaz', 'best', 'purchas', 'ever']


In [12]:

# ============================================================================
# STEP 7: CONVERT BACK TO TEXT
# ============================================================================

print("\n" + "=" * 80)
print("STEP 7: FINAL PREPROCESSED TEXT")
print("=" * 80)

# Join tokens back to text
df['processed_text'] = df['tokens_lemmatized'].apply(lambda x: ' '.join(x))

print("\nPreprocessing pipeline complete!")
print("\nExample comparison:")
print(f"Original:   {df['text'].iloc[0]}")
print(f"Processed:  {df['processed_text'].iloc[0]}")

# ===


STEP 7: FINAL PREPROCESSED TEXT

Preprocessing pipeline complete!

Example comparison:
Original:   This product is absolutely amazing! Best purchase ever.
Processed:  product absolutely amazing best purchase ever


In [13]:
# ============================================================================
# STEP 8: FEATURE EXTRACTION - TF-IDF
# ============================================================================

print("\n" + "=" * 80)
print("STEP 8: FEATURE EXTRACTION - TF-IDF")
print("=" * 80)

# Prepare data
X = df['processed_text']
y = df['category']

# Method 1: TF-IDF Vectorizer (recommended)
tfidf_vectorizer = TfidfVectorizer(
    max_features=100,  # Top 100 features
    min_df=1,          # Minimum document frequency
    max_df=0.8,        # Maximum document frequency
    ngram_range=(1, 2) # Unigrams and bigrams
)

X_tfidf = tfidf_vectorizer.fit_transform(X)

print(f"\nTF-IDF Feature Matrix Shape: {X_tfidf.shape}")
print(f"Number of features: {len(tfidf_vectorizer.get_feature_names_out())}")
print(f"\nTop 20 features:")
print(tfidf_vectorizer.get_feature_names_out()[:20])



STEP 8: FEATURE EXTRACTION - TF-IDF

TF-IDF Feature Matrix Shape: (40, 100)
Number of features: 100

Top 20 features:
['absolute' 'amazing' 'best' 'better' 'buy' 'customer' 'design' 'doesnt'
 'ever' 'expected' 'experience' 'good' 'great' 'happy' 'highly' 'horrible'
 'love' 'match' 'match description' 'material']


In [14]:
# Method 2: Count Vectorizer (Bag of Words)
count_vectorizer = CountVectorizer(
    max_features=100,
    ngram_range=(1, 2)
)

X_count = count_vectorizer.fit_transform(X)

print(f"\nCount Vectorizer Shape: {X_count.shape}")

# We'll use TF-IDF for training
X_features = X_tfidf


Count Vectorizer Shape: (40, 100)


In [15]:
# ============================================================================
# STEP 9: TRAIN-TEST SPLIT
# ============================================================================

print("\n" + "=" * 80)
print("STEP 9: TRAIN-TEST SPLIT")
print("=" * 80)

X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

print(f"\nTraining set category distribution:")
print(pd.Series(y_train).value_counts())



STEP 9: TRAIN-TEST SPLIT

Training set size: 32 samples
Testing set size: 8 samples

Training set category distribution:
category
positive    15
negative    15
neutral      2
Name: count, dtype: int64
