In [None]:
# Import necessary libraries
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE  # Import SMOTE
from tabulate import tabulate

# Fetch the NLTK corpora this notebook relies on: the stop-word list read by
# clean_text and the WordNet data backing WordNetLemmatizer.
# quiet=True suppresses the downloader's progress output.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name, quiet=True)

# Text cleaning function
# Compiled once: matches every character that is neither a word character
# (letter, digit or underscore) nor whitespace, i.e. punctuation and symbols.
# Digits and underscores are therefore kept.
_NON_WORD_CHARS = re.compile(r'[^\w\s]')

# NLTK resources shared by every clean_text call; created lazily on first use
# so the stop-word corpus is read once instead of once per review.
_STOP_WORDS = None
_LEMMATIZER = None


def _load_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def clean_text(text):
    """Normalise one review string for vectorisation and sentiment scoring.

    Steps, in order: strip punctuation/symbols, lowercase, drop English
    stop words, lemmatize each remaining token, and re-join with spaces.

    Args:
        text: The raw review. Anything that is not a ``str`` is treated as
            "no text".

    Returns:
        The cleaned, space-separated tokens, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _load_text_resources()
    # Punctuation is removed before lowercasing and stop-word filtering, so a
    # contraction such as "don't" becomes "dont" before it is compared
    # against the stop-word list.
    normalised = _NON_WORD_CHARS.sub('', text).lower()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in normalised.split()
        if word not in stop_words
    )

# Function to load and preprocess data
def load_and_preprocess_data(file_path):
    """Load the reviews CSV and add a cleaned-text column.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of a CSV
            containing at least the columns ``review_description`` and
            ``rating``.

    Returns:
        A new DataFrame with columns ``review_description`` (as ``str``),
        ``rating`` and ``cleaned_review`` (the output of ``clean_text``).

    Raises:
        KeyError: If either required column is missing from the CSV.
    """
    df = pd.read_csv(file_path)
    # .copy() makes the two-column selection an independent frame, so the
    # column assignments below cannot trigger SettingWithCopyWarning.
    df = df[['review_description', 'rating']].copy()
    # Missing reviews are read as NaN; fill them before astype(str), which
    # would otherwise turn them into the literal word 'nan' and have it
    # analysed as if it were review text.
    df['review_description'] = df['review_description'].fillna('').astype(str)
    df['cleaned_review'] = df['review_description'].apply(clean_text)
    return df

# Function to calculate sentiment score using TextBlob
def calculate_textblob_sentiment(df):
    """Score each cleaned review with TextBlob and derive a sentiment label.

    Adds two columns to ``df`` in place and returns the same object:
    ``sentiment_score`` (TextBlob polarity of ``cleaned_review``) and
    ``sentiment_label`` ('positive' for a score above 0, 'negative' for a
    score below 0, 'neutral' otherwise).
    """
    def polarity_of(review):
        return TextBlob(review).sentiment.polarity

    def label_for(score):
        if score > 0:
            return 'positive'
        if score < 0:
            return 'negative'
        return 'neutral'

    df['sentiment_score'] = df['cleaned_review'].apply(polarity_of)
    df['sentiment_label'] = df['sentiment_score'].apply(label_for)
    return df

# Function to train the model using SMOTE on the newly created sentiment labels
def train_model_with_smote(df):
    """Train a random-forest sentiment classifier, oversampling with SMOTE.

    The data is split into train/test sets *before* any fitting. TF-IDF is
    fitted on the training text only and SMOTE is applied to the training
    fold only, so the test set consists solely of real, unseen reviews.
    Resampling before the split would place synthetic points interpolated
    from training rows into the test set and inflate the reported metrics.

    Args:
        df: DataFrame with a ``cleaned_review`` text column and a
            ``sentiment_label`` column holding 'positive', 'neutral' or
            'negative'.

    Returns:
        A ``(model, vectorizer, label_mapping)`` tuple: the fitted
        ``RandomForestClassifier``, the fitted ``TfidfVectorizer`` and the
        dict mapping label names to the integer codes the model predicts.

    Side effects:
        Prints accuracy and a classification report for the held-out set.
    """
    # Encode the sentiment labels
    label_mapping = {'positive': 1, 'neutral': 0, 'negative': -1}
    y = df['sentiment_label'].map(label_mapping)

    # Split the raw text first; stratify so both folds keep the original
    # class proportions.
    text_train, text_test, y_train, y_test = train_test_split(
        df['cleaned_review'], y, test_size=0.2, random_state=42, stratify=y
    )

    # Learn the vocabulary and IDF weights from the training text only, then
    # reuse them unchanged for the test text.
    vectorizer = TfidfVectorizer(max_features=10000)
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    # Balance the classes in the training fold only.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Train a RandomForest model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_resampled, y_train_resampled)

    # Evaluate on the untouched test fold.
    y_pred = model.predict(X_test)
    print("\n--- Model Performance on Sentiment Labels (After Applying SMOTE) ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    # Pass the label codes explicitly so each name is guaranteed to line up
    # with its code (-1, 0, 1 -> negative, neutral, positive).
    ordered_labels = sorted(label_mapping.items(), key=lambda item: item[1])
    print(classification_report(
        y_test,
        y_pred,
        labels=[code for _, code in ordered_labels],
        target_names=[name for name, _ in ordered_labels],
    ))

    return model, vectorizer, label_mapping

# Function to compare sentiment scores with ratings
def compare_sentiment_with_ratings(df, rating_min=1, rating_max=5,
                                   output_path='rating_sentiment_comparison_textblob_smote.csv'):
    """Compare the mean star rating with the mean TextBlob sentiment.

    TextBlob documents polarity as lying in [-1.0, 1.0], while ratings live on
    a different scale, so subtracting one from the other directly is not
    meaningful. The mean polarity is therefore mapped linearly onto
    ``[rating_min, rating_max]`` before the discrepancy is computed.

    Args:
        df: DataFrame with columns ``review_description``, ``rating``,
            ``sentiment_score`` and ``sentiment_label``.
        rating_min: Lowest possible rating. The default of 1 assumes a
            1-5 star scale -- confirm against the dataset.
        rating_max: Highest possible rating (default 5, same assumption).
        output_path: Destination of the per-review comparison CSV.

    Returns:
        None. Prints the comparison and writes the CSV to ``output_path``.
    """
    # Calculate overall numeric rating
    overall_rating = df['rating'].mean()

    # Calculate aggregate sentiment score
    aggregate_sentiment_score = df['sentiment_score'].mean()

    # Map polarity -1 -> rating_min, 0 -> scale midpoint, +1 -> rating_max.
    rescaled_sentiment = rating_min + (aggregate_sentiment_score + 1) / 2 * (rating_max - rating_min)

    # Print the comparison
    print("\n--- App Rating vs. Sentiment Score ---")
    print(f"Overall Numeric Rating: {overall_rating:.2f}")
    print(f"Aggregate Sentiment Score (TextBlob): {aggregate_sentiment_score:.2f}")
    print(f"Sentiment Score Rescaled to Rating Scale ({rating_min}-{rating_max}): {rescaled_sentiment:.2f}")

    # Identify discrepancies, with both quantities on the rating scale
    discrepancy = overall_rating - rescaled_sentiment
    print(f"\nDiscrepancy between rating and sentiment score: {discrepancy:.2f}")
    if discrepancy > 0:
        print("The numeric rating is higher than the sentiment score.")
    elif discrepancy < 0:
        print("The sentiment score is higher than the numeric rating.")
    else:
        print("The numeric rating and sentiment score are in agreement.")

    # Save insights to a CSV
    df[['review_description', 'rating', 'sentiment_score', 'sentiment_label']].to_csv(output_path, index=False)
    print(f"\nThe comparison results have been saved to '{output_path}'.")

# Main function
def main_textblob_sentiment_analysis_with_smote(file_path):
    """Run the full pipeline on the CSV at ``file_path``.

    Loads and cleans the reviews, scores and labels them with TextBlob,
    trains the SMOTE-balanced classifier on those labels (its return value
    is not used here), and finally prints and saves the rating-versus-
    sentiment comparison.
    """
    reviews = calculate_textblob_sentiment(load_and_preprocess_data(file_path))
    train_model_with_smote(reviews)
    compare_sentiment_with_ratings(reviews)

# Example usage: run the whole pipeline on the CSV at the path below
fitbit_reviews_csv = '/content/fitbit.csv'
main_textblob_sentiment_analysis_with_smote(fitbit_reviews_csv)


--- Model Performance on Sentiment Labels (After Applying SMOTE) ---
Accuracy: 0.9582

Classification Report:
              precision    recall  f1-score   support

    negative       0.93      0.98      0.95     41136
     neutral       0.96      0.98      0.97     41482
    positive       0.99      0.91      0.95     41209

    accuracy                           0.96    123827
   macro avg       0.96      0.96      0.96    123827
weighted avg       0.96      0.96      0.96    123827


--- App Rating vs. Sentiment Score ---
Overall Numeric Rating: 3.29
Aggregate Sentiment Score (TextBlob): 0.22

Discrepancy between rating and sentiment score: 3.07
The numeric rating is higher than the sentiment score.

The comparison results have been saved to 'rating_sentiment_comparison_textblob_smote.csv'.
