In [1]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import re
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# NLP libraries
from textblob import TextBlob


In [2]:
df = pd.read_csv('sentiment-analysis.csv')
print(df)

   Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score
0   "I love this product!", Positive, Twitter, 202...                     
1   "The service was terrible.", Negative, Yelp Re...                     
2   "This movie is amazing!", Positive, IMDb, 2023...                     
3   "I'm so disappointed with their customer suppo...                     
4   "Just had the best meal of my life!", Positive...                     
..                                                ...                     
93  "I can't stop listening to this song. It's my ...                     
94  "Their website is so confusing and poorly desi...                     
95  "I had an incredible experience at the theme p...                     
96                                                NaN                     
97                                                NaN                     

[98 rows x 1 columns]


In [3]:
def fix_dataframe_format(df):
    """Fix the dataframe where all columns are merged into one"""
    if len(df.columns) == 1:
        col_name = df.columns[0]
        split_data = []
        for index, row in df.iterrows():
            if pd.notna(row[col_name]):
                parts = re.split(r',\s*(?=(?:[^"]*"[^"]*")*[^"]*$)', str(row[col_name]))
                split_data.append(parts)
            else:
                split_data.append([None] * 8)  # Now 8 columns including Rating
        
        fixed_df = pd.DataFrame(split_data, columns=[
            'Text', 'Sentiment', 'Source', 'Date/Time', 'User ID', 'Location', 'Rating', 'Confidence Score'
        ])
    else:
        fixed_df = df.copy()
    
    if 'Text' in fixed_df.columns:
        fixed_df['Text'] = fixed_df['Text'].apply(
            lambda x: re.sub(r'^"|"$', '', str(x)) if pd.notna(x) else x
        )
    
    if 'Confidence Score' in fixed_df.columns:
        fixed_df['Confidence Score'] = pd.to_numeric(
            fixed_df['Confidence Score'], errors='coerce'
        )
    
    if 'Rating' in fixed_df.columns:
        fixed_df['Rating'] = pd.to_numeric(fixed_df['Rating'], errors='coerce')
    
    if 'Date/Time' in fixed_df.columns:
        fixed_df['Date/Time'] = pd.to_datetime(
            fixed_df['Date/Time'], errors='coerce'
        )
    
    return fixed_df

In [4]:
# Fix the dataframe
fixed_df = fix_dataframe_format(df)
df=fixed_df

In [5]:
# Create a clean copy of the dataframe
df_clean = df.copy()

print("Before cleaning:", df_clean.shape)

# Handle missing values
df_clean = df_clean.dropna(subset=['Text', 'Sentiment', 'Date/Time'])

# Convert Date/Time to datetime
df_clean['Date/Time'] = pd.to_datetime(df_clean['Date/Time'], errors='coerce')

# Remove invalid dates
df_clean = df_clean.dropna(subset=['Date/Time'])

# Ensure Confidence Score is numeric
if 'Confidence Score' in df_clean.columns:
    df_clean['Confidence Score'] = pd.to_numeric(df_clean['Confidence Score'], errors='coerce')
    df_clean['Confidence Score'] = df_clean['Confidence Score'].fillna(0.5)

print("After cleaning:", df_clean.shape)
print(f"Removed {len(df) - len(df_clean)} rows with missing/invalid data")

Before cleaning: (98, 8)
After cleaning: (96, 8)
Removed 2 rows with missing/invalid data


In [6]:
# Text preprocessing function
def preprocess_text(text):
    if pd.isna(text):
        return ""
    
    text = str(text).lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply text preprocessing
df_clean['cleaned_text'] = df_clean['Text'].apply(preprocess_text)

# Extract star ratings from text
def extract_star_rating(text):
    if pd.isna(text):
        return None
        
    text = str(text).lower()
    patterns = [
        r'(\d+)\s*stars?',
        r'rating\s*[:\-]?\s*(\d+)',
        r'(\d+)/\d+\s*(?:stars?|rating)',
        r'\b(\d+)\s*out of\s*\d+\s*stars?'
    ]
    
    for pattern in patterns:
        matches = re.findall(pattern, text)
        if matches:
            return int(matches[0])
    return None

df_clean['extracted_stars'] = df_clean['Text'].apply(extract_star_rating)

print("Text preprocessing completed!")
display(df_clean[['Text', 'cleaned_text', 'extracted_stars']].head())

Text preprocessing completed!


Unnamed: 0,Text,cleaned_text,extracted_stars
0,I love this product!,i love this product,
1,The service was terrible.,the service was terrible,
2,This movie is amazing!,this movie is amazing,
3,I'm so disappointed with their customer support.,im so disappointed with their customer support,
4,Just had the best meal of my life!,just had the best meal of my life,


In [7]:
# Create additional features
df_clean['text_length'] = df_clean['cleaned_text'].apply(len)
df_clean['word_count'] = df_clean['cleaned_text'].apply(lambda x: len(x.split()))

# TextBlob sentiment features
df_clean['textblob_polarity'] = df_clean['Text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
df_clean['textblob_subjectivity'] = df_clean['Text'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)

# Time-based features
df_clean['date'] = pd.to_datetime(df_clean['Date/Time'])
df_clean['day_of_week'] = df_clean['date'].dt.dayofweek
df_clean['month'] = df_clean['date'].dt.month

print("Feature engineering completed!")
display(df_clean[['text_length', 'word_count', 'textblob_polarity', 'extracted_stars']].head())

Feature engineering completed!


Unnamed: 0,text_length,word_count,textblob_polarity,extracted_stars
0,19,4,0.625,
1,24,4,-1.0,
2,21,4,0.75,
3,46,7,-0.75,
4,33,8,1.0,


In [8]:
# Prepare features and target
X_text = df_clean['cleaned_text'].fillna('')
y = df_clean['Sentiment']

# Encode target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("Classes:", label_encoder.classes_)
print("Class distribution:", np.bincount(y_encoded))

# Create TF-IDF features
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2)
)

X_tfidf = vectorizer.fit_transform(X_text)

# Combine with other features
additional_features = df_clean[[
    'text_length', 'word_count', 'textblob_polarity', 
    'textblob_subjectivity', 'day_of_week', 'month'
]].fillna(0)

# Handle star ratings
star_ratings = df_clean['extracted_stars'].fillna(0)
additional_features['star_rating'] = star_ratings

# Combine all features
from scipy.sparse import hstack
X_combined = hstack([X_tfidf, additional_features.values])

print(f"Final feature matrix shape: {X_combined.shape}")

Classes: ['Negative' 'Positive']
Class distribution: [43 53]
Final feature matrix shape: (96, 481)


In [9]:
# Cell 7: Fixed preprocessing pipeline with proper NaN handling
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Create column transformer with imputation
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(
            max_features=3000,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.8
        ), 'cleaned_text'),
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),  # Handle missing numeric values
            ('scaler', StandardScaler())
        ]), ['text_length', 'word_count', 'textblob_polarity', 
             'textblob_subjectivity', 'extracted_stars', 
             'day_of_week', 'month'])
    ]
)


In [10]:
# Cell 8: Fixed - Define and train multiple classifiers with error handling
from sklearn.model_selection import cross_val_score
import traceback

# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', random_state=42, probability=True),
    'Naive Bayes': MultinomialNB()
}

# Split the data - ensure we're using the cleaned dataframe
X_train, X_test, y_train, y_test = train_test_split(
    df_clean, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Train and evaluate each classifier
results = {}
best_score = 0
best_model = None
best_model_name = None

for name, classifier in classifiers.items():

    
    try:
        # Create pipeline with preprocessor and classifier
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', classifier)
        ])
        
        # Train the model
        pipeline.fit(X_train, y_train)
        
        # Make predictions
        y_pred = pipeline.predict(X_test)
        y_pred_proba = pipeline.predict_proba(X_test) if hasattr(classifier, 'predict_proba') else None
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        
        # Store results
        results[name] = {
            'pipeline': pipeline,
            'accuracy': accuracy,
            'predictions': y_pred,
            'probabilities': y_pred_proba
        }
        
        print(f" {name} Accuracy: {accuracy:.4f}")
        
        # Cross-validation
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
        print(f" {name} CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
        
        if accuracy > best_score:
            best_score = accuracy
            best_model = pipeline
            best_model_name = name
            
    except Exception as e:
        print(f" Error training {name}: {str(e)}")
        print("Skipping this classifier...")
        continue

if best_model:
    print(f"\n BEST MODEL: {best_model_name} with accuracy: {best_score:.4f}")
else:
    print("\n No models were successfully trained. Checking data issues...")
    

    numeric_features = ['text_length', 'word_count', 'textblob_polarity', 
                       'textblob_subjectivity', 'extracted_stars', 'day_of_week', 'month']
    
    for feature in numeric_features:
        if feature in df_clean.columns:
            nan_count = df_clean[feature].isna().sum()
            print(f"  {feature}: {nan_count} NaN values")

Training set: 76 samples
Test set: 20 samples
 Logistic Regression Accuracy: 0.9500
 Logistic Regression CV Accuracy: 0.9600 (+/- 0.0653)
 Random Forest Accuracy: 0.9500
 Random Forest CV Accuracy: 0.9333 (+/- 0.0843)
 SVM Accuracy: 0.9500
 SVM CV Accuracy: 0.9350 (+/- 0.0792)
 Error training Naive Bayes: Negative values in data passed to MultinomialNB (input X).
Skipping this classifier...

 BEST MODEL: Logistic Regression with accuracy: 0.9500


In [11]:
# Cell 11: Fixed sentiment filtering class that works with both model types
class CustomerSentimentFilter:
    def __init__(self, model, label_encoder, vectorizer=None, model_type='pipeline'):
        self.model = model
        self.label_encoder = label_encoder
        self.vectorizer = vectorizer
        self.model_type = model_type  # 'pipeline' or 'simple'
        
    def predict_single(self, text, date_time=None):
        """Predict sentiment for a single text"""
        try:
            if self.model_type == 'pipeline':
                # For pipeline models
                single_data = pd.DataFrame([{
                    'cleaned_text': preprocess_text(text),
                    'text_length': len(preprocess_text(text)),
                    'word_count': len(preprocess_text(text).split()),
                    'textblob_polarity': TextBlob(str(text)).sentiment.polarity,
                    'textblob_subjectivity': TextBlob(str(text)).sentiment.subjectivity,
                    'extracted_stars': extract_star_rating(text),
                    'day_of_week': datetime.now().weekday() if date_time is None else pd.to_datetime(date_time).weekday(),
                    'month': datetime.now().month if date_time is None else pd.to_datetime(date_time).month
                }])
                
                prediction = self.model.predict(single_data)[0]
                probability = self.model.predict_proba(single_data)[0]
                
            else:
                # For simple models (text only)
                cleaned_text = preprocess_text(text)
                text_features = self.vectorizer.transform([cleaned_text])
                prediction = self.model.predict(text_features)[0]
                probability = self.model.predict_proba(text_features)[0]
            
            sentiment = self.label_encoder.inverse_transform([prediction])[0]
            confidence = probability[prediction]
            stars = extract_star_rating(text)
            
            return sentiment, confidence, stars
            
        except Exception as e:
            print(f"Prediction error: {e}")
            return "Unknown", 0.0, None
    
    def filter_positive_feedback(self, df, min_confidence=0.7, min_stars=3):
        """Filter positive feedback from last 1 year"""

        
        # Filter data from last 1 year
        if 'Date/Time' in df.columns:
            one_year_ago = datetime.now() - timedelta(days=365)
            recent_data = df[df['Date/Time'] >= one_year_ago].copy()
        else:
            recent_data = df.copy()
            print("  No Date/Time column - using all data")
        
        print(f" Reviews from last 1 year: {len(recent_data)}")
        
        # Predict sentiments
        predictions = []
        confidences = []
        stars_list = []
        
        for idx, row in recent_data.iterrows():
            sentiment, confidence, stars = self.predict_single(
                row['Text'], 
                row['Date/Time'] if 'Date/Time' in row else None
            )
            predictions.append(sentiment)
            confidences.append(confidence)
            stars_list.append(stars)
        
        recent_data['predicted_sentiment'] = predictions
        recent_data['prediction_confidence'] = confidences
        recent_data['extracted_stars'] = stars_list
        
        # Apply filters
        positive_mask = (
            (recent_data['predicted_sentiment'] == 'Positive') &
            (recent_data['prediction_confidence'] >= min_confidence) &
            (
                (recent_data['extracted_stars'] >= min_stars) |
                (recent_data['extracted_stars'].isna()) |
                (recent_data['prediction_confidence'] > 0.8)
            )
        )
        
        positive_feedback = recent_data[positive_mask].copy()
        
        # Sort by date and confidence
        if 'Date/Time' in positive_feedback.columns:
            positive_feedback = positive_feedback.sort_values(
                ['Date/Time', 'prediction_confidence'], 
                ascending=[False, False]
            )
        else:
            positive_feedback = positive_feedback.sort_values('prediction_confidence', ascending=False)
        
        return positive_feedback

# Initialize the filter
if results:
    best_model_name = max(results.keys(), key=lambda x: results[x]['accuracy'])
    best_result = results[best_model_name]
    
    if 'pipeline' in best_result:
        # Pipeline model
        sentiment_filter = CustomerSentimentFilter(
            best_result['pipeline'], 
            label_encoder,
            model_type='pipeline'
        )
    else:
        # Simple model
        sentiment_filter = CustomerSentimentFilter(
            best_result['model'],
            label_encoder,
            vectorizer=best_result['vectorizer'],
            model_type='simple'
        )
    print(" Sentiment filter initialized with best model")
else:
    print(" Cannot initialize filter - no models trained successfully.")

 Sentiment filter initialized with best model


In [12]:
# Quick save cell - run this if you just want to save without all the extras
import joblib
from datetime import datetime

# Quick save function
def quick_save_model():
    if results:
        best_model_name = max(results.keys(), key=lambda x: results[x]['accuracy'])
        best_result = results[best_model_name]
        
        model_assets = {
            'model': best_result['pipeline'] if 'pipeline' in best_result else best_result['model'],
            'label_encoder': label_encoder,
            'best_model_name': best_model_name,
            'accuracy': best_result['accuracy'],
            'timestamp': datetime.now().isoformat()
        }
        
        if 'vectorizer' in best_result:
            model_assets['vectorizer'] = best_result['vectorizer']
        
        filename = f"feedback_review_model{datetime.now().strftime('%Y%m%d_%H%M')}.pkl"
        joblib.dump(model_assets, filename)
        print(f" Model saved as: {filename}")
        return filename
    else:
        print(" No models to save")
        return None

# Quick save
saved_model_path = quick_save_model()

 Model saved as: feedback_review_model20251031_2355.pkl
