In [2]:
# Import necessary libraries
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob  # Import TextBlob for sentiment analysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tabulate import tabulate

# Download NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Text cleaning function
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
        text = text.lower()
        stop_words = set(stopwords.words('english'))
        words = [word for word in text.split() if word not in stop_words]
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
        return ' '.join(words)
    else:
        return ''

# Function to load and preprocess data
def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    df = df[['review_description', 'rating']]
    df['review_description'] = df['review_description'].astype(str)
    df['cleaned_review'] = df['review_description'].apply(clean_text)
    return df

# Function to calculate sentiment score using TextBlob and convert it to a rating
def calculate_sentiment_and_convert_to_rating(df):
    # Apply TextBlob to calculate sentiment polarity for each review
    df['sentiment_score'] = df['cleaned_review'].apply(lambda x: TextBlob(x).sentiment.polarity)

    # Convert sentiment polarity to a 1-5 rating
    def polarity_to_rating(polarity):
        if polarity > 0.5:
            return 5
        elif polarity > 0.1:
            return 4
        elif polarity >= -0.1:
            return 3
        elif polarity >= -0.5:
            return 2
        else:
            return 1

    df['calculated_rating'] = df['sentiment_score'].apply(polarity_to_rating)
    return df

# Function to train a model using the calculated sentiment ratings
def train_model_on_calculated_ratings(df):
    # Vectorize the text data
    vectorizer = TfidfVectorizer(max_features=10000)
    X = vectorizer.fit_transform(df['cleaned_review'])

    # Use the calculated ratings as the target variable
    y = df['calculated_rating']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a RandomForest model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions and evaluate the model
    y_pred = model.predict(X_test)
    print("\n--- Model Performance on Calculated Sentiment Ratings ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Rating 1', 'Rating 2', 'Rating 3', 'Rating 4', 'Rating 5']))

    return model, vectorizer

# Function to compare the model's predicted sentiment ratings with the actual ratings
def compare_predicted_with_actual_ratings(df, model, vectorizer):
    # Transform the text using the trained vectorizer
    X = vectorizer.transform(df['cleaned_review'])

    # Predict the sentiment ratings using the trained model
    df['predicted_rating'] = model.predict(X)

    # Calculate the overall average of the actual and predicted ratings
    overall_actual_rating = df['rating'].mean()
    overall_predicted_rating = df['predicted_rating'].mean()

    # Print the comparison
    print("\n--- Comparison of Predicted Sentiment Rating vs. Actual Rating ---")
    print(f"Overall Actual Rating: {overall_actual_rating:.2f}")
    print(f"Overall Predicted Sentiment Rating: {overall_predicted_rating:.2f}")

    # Identify discrepancies
    discrepancy = overall_actual_rating - overall_predicted_rating
    print(f"\nDiscrepancy between actual rating and predicted sentiment rating: {discrepancy:.2f}")
    if discrepancy > 0:
        print("The actual rating is higher than the predicted sentiment rating.")
    elif discrepancy < 0:
        print("The predicted sentiment rating is higher than the actual rating.")
    else:
        print("The actual rating and predicted sentiment rating are in agreement.")

    # Save insights to a CSV
    df[['review_description', 'rating', 'sentiment_score', 'calculated_rating', 'predicted_rating']].to_csv('comparison_sentiment_ratings.csv', index=False)
    print("\nThe comparison results have been saved to 'comparison_sentiment_ratings.csv'.")

# Main function
def main_sentiment_analysis_pipeline(file_path):
    # Load and preprocess data
    df = load_and_preprocess_data(file_path)

    # Calculate sentiment scores using TextBlob and convert them to ratings
    df = calculate_sentiment_and_convert_to_rating(df)

    # Train a model on the calculated sentiment ratings
    model, vectorizer = train_model_on_calculated_ratings(df)

    # Compare the model's predicted sentiment ratings with the actual ratings
    compare_predicted_with_actual_ratings(df, model, vectorizer)

# Example usage
main_sentiment_analysis_pipeline('/content/fitbit.csv')



--- Model Performance on Calculated Sentiment Ratings ---
Accuracy: 0.8203

Classification Report:
              precision    recall  f1-score   support

    Rating 1       0.81      0.48      0.61      1323
    Rating 2       0.79      0.54      0.64      6943
    Rating 3       0.78      0.82      0.80     21242
    Rating 4       0.81      0.90      0.86     25600
    Rating 5       0.94      0.84      0.89     11591

    accuracy                           0.82     66699
   macro avg       0.83      0.72      0.76     66699
weighted avg       0.82      0.82      0.82     66699


--- Comparison of Predicted Sentiment Rating vs. Actual Rating ---
Overall Actual Rating: 3.29
Overall Predicted Sentiment Rating: 3.60

Discrepancy between actual rating and predicted sentiment rating: -0.30
The predicted sentiment rating is higher than the actual rating.

The comparison results have been saved to 'comparison_sentiment_ratings.csv'.
