In [None]:
# Import necessary libraries
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE  # Import SMOTE
from tabulate import tabulate

# Fetch the NLTK corpora the cleaning step relies on: the stop-word lists and
# the WordNet data used by the lemmatizer. quiet=True is passed to each call.
for _corpus_name in ('stopwords', 'wordnet'):
    nltk.download(_corpus_name, quiet=True)

# Text cleaning function
def clean_text(text):
    """Return a normalized copy of ``text`` for sentiment analysis.

    The steps, in order: remove every character that is neither a word
    character nor whitespace (letters, digits and underscores are kept),
    lowercase the result, drop English stop words, and lemmatize each
    remaining token with the lemmatizer's default settings.

    Args:
        text: The raw review text. Any value that is not a ``str`` is treated
            as a missing review.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _get_text_resources()
    # Punctuation is stripped before the stop-word test, so a contraction is
    # looked up without its apostrophe (e.g. "don't" is compared as "dont").
    tokens = re.sub(r'[^\w\s]', '', text).lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


# Lazily filled cache: this function is applied once per review, so the
# stop-word list and the lemmatizer are built on first use and then reused
# instead of being rebuilt for every row.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it once."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']

# Function to load and preprocess data
def load_and_preprocess_data(file_path):
    """Load the review CSV and add a cleaned version of each review's text.

    Args:
        file_path: Path (or other source accepted by ``pd.read_csv``) of a CSV
            that contains ``review_description`` and ``rating`` columns.

    Returns:
        A DataFrame with the columns ``review_description``, ``rating`` and
        ``cleaned_review``. Missing review text is represented as an empty
        string in both text columns.

    Raises:
        KeyError: If either required column is absent from the CSV.
    """
    # Copy the two-column selection so the assignments below write to an
    # independent frame instead of a slice of the full CSV; without the copy
    # pandas emits SettingWithCopyWarning.
    df = pd.read_csv(file_path)[['review_description', 'rating']].copy()
    # Blank out missing reviews first; otherwise astype(str) converts NaN to
    # the literal word "nan", which would be cleaned and scored as real text.
    df['review_description'] = df['review_description'].fillna('').astype(str)
    df['cleaned_review'] = df['review_description'].apply(clean_text)
    return df

# Function to calculate sentiment score using TextBlob and convert to rating scale
def calculate_textblob_sentiment_and_convert_to_rating(df):
    """Score each cleaned review with TextBlob and express it as a rating.

    Two columns are added to ``df`` in place:

    * ``sentiment_score`` - the TextBlob polarity of ``cleaned_review``.
    * ``converted_sentiment_rating`` - that polarity mapped linearly so that
      -1 becomes 1 and +1 becomes 5.

    Args:
        df: DataFrame holding a ``cleaned_review`` text column.

    Returns:
        The same DataFrame object that was passed in, with both columns set.
    """
    def polarity_of(review):
        return TextBlob(review).sentiment.polarity

    def as_rating(polarity):
        # Shift the polarity up by one, double it, then offset by one.
        shifted = polarity + 1
        return (shifted * 2) + 1

    scores = df['cleaned_review'].apply(polarity_of)
    df['sentiment_score'] = scores
    df['converted_sentiment_rating'] = df['sentiment_score'].apply(as_rating)
    return df

# Function to compare sentiment scores (converted) with app ratings
def compare_sentiment_with_ratings(
    df, output_path='rating_sentiment_comparison_converted.csv'
):
    """Print how the mean numeric rating compares with the mean sentiment rating.

    The per-review columns are also written to a CSV so individual
    disagreements can be inspected afterwards.

    Args:
        df: DataFrame with the columns ``review_description``, ``rating``,
            ``sentiment_score`` and ``converted_sentiment_rating``.
        output_path: Destination of the per-review CSV. Defaults to
            ``rating_sentiment_comparison_converted.csv`` in the current
            working directory.

    Returns:
        None. The results are printed and saved to ``output_path``.
    """
    overall_rating = df['rating'].mean()
    average_sentiment_rating = df['converted_sentiment_rating'].mean()

    print("\n--- App Rating vs. Converted Sentiment Rating ---")
    print(f"Overall Numeric Rating: {overall_rating:.2f}")
    print(f"Average Sentiment Rating (Converted): {average_sentiment_rating:.2f}")

    discrepancy = overall_rating - average_sentiment_rating
    print(f"\nDiscrepancy between numeric rating and sentiment-based rating: {discrepancy:.2f}")
    if pd.isna(discrepancy):
        # A NaN mean (empty frame or all-missing column) compares false both
        # ways, so without this branch it would be reported as "in agreement".
        print("The ratings could not be compared because at least one average is undefined.")
    elif discrepancy > 0:
        print("The numeric rating is higher than the sentiment-based rating.")
    elif discrepancy < 0:
        print("The sentiment-based rating is higher than the numeric rating.")
    else:
        print("The numeric rating and sentiment-based rating are in agreement.")

    exported_columns = [
        'review_description',
        'rating',
        'sentiment_score',
        'converted_sentiment_rating',
    ]
    df[exported_columns].to_csv(output_path, index=False)
    print(f"\nThe comparison results have been saved to '{output_path}'.")

# Main function
def main_textblob_sentiment_analysis_with_conversion(file_path):
    """Run the whole pipeline for one review CSV: load, score, then compare.

    Args:
        file_path: Location of the review CSV; handed unchanged to
            ``load_and_preprocess_data``.
    """
    # The stages are composed inside-out: the loaded frame is scored with
    # TextBlob, and the scored frame is then compared against the app ratings.
    compare_sentiment_with_ratings(
        calculate_textblob_sentiment_and_convert_to_rating(
            load_and_preprocess_data(file_path)
        )
    )

# Example usage. Guarded so the analysis runs when this code is executed
# directly, but not as a side effect of importing it from another module.
if __name__ == "__main__":
    main_textblob_sentiment_analysis_with_conversion('/content/fitbit.csv')



--- App Rating vs. Converted Sentiment Rating ---
Overall Numeric Rating: 3.29
Average Sentiment Rating (Converted): 3.44

Discrepancy between numeric rating and sentiment-based rating: -0.15
The sentiment-based rating is higher than the numeric rating.

The comparison results have been saved to 'rating_sentiment_comparison_converted.csv'.
