In [1]:
# Import necessary libraries
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tabulate import tabulate

# Download NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Text cleaning function with option to remove emoticons
def clean_text(text, remove_emoticons=True):
    if isinstance(text, str):
        if remove_emoticons:
            text = re.sub(r'[^\w\s]', '', text)  # Remove emoticons
        text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove non-alphabetical characters
        text = text.lower()
        stop_words = set(stopwords.words('english'))
        words = [word for word in text.split() if word not in stop_words]
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
        return ' '.join(words)
    else:
        return ''

# Function to load and preprocess data
def load_and_preprocess_data(file_path, remove_emoticons=True):
    df = pd.read_csv(file_path)
    df = df[['review_description', 'rating']]
    df['review_description'] = df['review_description'].astype(str)
    df['cleaned_review'] = df['review_description'].apply(lambda x: clean_text(x, remove_emoticons))
    df['sentiment'] = df['rating'].apply(label_sentiment)
    return df

# Function to label sentiment
def label_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'

# Function to create additional features
def add_text_features(df):
    # Adding length of review as a feature
    df['review_length'] = df['cleaned_review'].apply(len)

    # Adding word count as a feature
    df['word_count'] = df['cleaned_review'].apply(lambda x: len(x.split()))

    return df

# Function to prepare data for sentiment analysis
def prepare_data_for_sentiment(df):
    vectorizer = TfidfVectorizer(max_features=10000)
    X = vectorizer.fit_transform(df['cleaned_review'])
    le = LabelEncoder()
    y = le.fit_transform(df['sentiment'])
    return train_test_split(X, y, test_size=0.2, random_state=42), le, vectorizer

# Function to train the sentiment analysis model
def train_sentiment_model(X_train, y_train):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model

# Function to calculate aggregate sentiment score
def calculate_sentiment_score(df, model, vectorizer, label_encoder):
    X = vectorizer.transform(df['cleaned_review'])
    y_pred = model.predict(X)
    sentiment_scores = label_encoder.inverse_transform(y_pred)

    # Convert sentiments to numeric scores
    sentiment_numeric = {'positive': 1, 'neutral': 0, 'negative': -1}
    df['sentiment_score'] = [sentiment_numeric[sent] for sent in sentiment_scores]

    # Calculate aggregate sentiment score
    aggregate_sentiment_score = df['sentiment_score'].mean()
    return aggregate_sentiment_score

# Function to compare app rating with sentiment score
def compare_rating_with_sentiment(df):
    # Calculate the overall app rating
    overall_rating = df['rating'].mean()

    # Prepare data and train sentiment model
    (X_train, X_test, y_train, y_test), label_encoder, vectorizer = prepare_data_for_sentiment(df)
    sentiment_model = train_sentiment_model(X_train, y_train)

    # Calculate aggregate sentiment score
    aggregate_sentiment_score = calculate_sentiment_score(df, sentiment_model, vectorizer, label_encoder)

    # Print the comparison
    print("\n--- App Rating vs. Sentiment Score ---")
    print(f"Overall Numeric Rating: {overall_rating:.2f}")
    print(f"Aggregate Sentiment Score: {aggregate_sentiment_score:.2f}")

    # Identify discrepancies
    discrepancy = overall_rating - aggregate_sentiment_score
    print(f"\nDiscrepancy between rating and sentiment score: {discrepancy:.2f}")
    if discrepancy > 0:
        print("The numeric rating is higher than the sentiment score.")
    elif discrepancy < 0:
        print("The sentiment score is higher than the numeric rating.")
    else:
        print("The numeric rating and sentiment score are in agreement.")

    # Save insights to a CSV
    df[['review_description', 'rating', 'sentiment_score']].to_csv('rating_sentiment_comparison.csv', index=False)
    print("\nThe comparison results have been saved to 'rating_sentiment_comparison.csv'.")

# Main function
def main_comprehensive_insight(file_path):
    # Load and preprocess data
    df = load_and_preprocess_data(file_path, remove_emoticons=False)

    # Add text features (optional, for future use)
    df = add_text_features(df)

    # Compare overall rating with sentiment score
    compare_rating_with_sentiment(df)

# Example usage
main_comprehensive_insight('/content/fitbit (1).csv')



--- App Rating vs. Sentiment Score ---
Overall Numeric Rating: 3.29
Aggregate Sentiment Score: 0.15

Discrepancy between rating and sentiment score: 3.15
The numeric rating is higher than the sentiment score.

The comparison results have been saved to 'rating_sentiment_comparison.csv'.
