# In [None]:
# Import necessary libraries
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from tabulate import tabulate

# Fetch the NLTK corpora this script relies on (the stop-word list and
# WordNet); quiet=True suppresses the downloader's progress output.
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus, quiet=True)

# Lazily-filled cache for the NLTK objects shared by every clean_text call.
_TEXT_CLEANING_CACHE = {}


def _get_text_cleaning_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not _TEXT_CLEANING_CACHE:
        # Build both objects before touching the cache so a failure in either
        # one never leaves a half-initialised cache behind.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _TEXT_CLEANING_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _TEXT_CLEANING_CACHE['stop_words'], _TEXT_CLEANING_CACHE['lemmatizer']


# Text cleaning function
def clean_text(text):
    """Normalise a single review string for vectorisation.

    Steps, in order: strip every character that is neither a word character
    nor whitespace, lowercase, drop English stop words, lemmatize each
    remaining token, and join the tokens with single spaces.

    Args:
        text: The raw review. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned, space-separated tokens, or ``''`` when ``text`` is not a
        string or contains no tokens.
    """
    if not isinstance(text, str):
        return ''

    # [^\w\s] removes punctuation and symbols (so emoticons such as ":)" go
    # away); letters, digits and underscores are kept.
    tokens = re.sub(r'[^\w\s]', '', text).lower().split()
    if not tokens:
        # Nothing to filter or lemmatize, so skip loading the NLTK resources.
        return ''

    # The stop-word set and lemmatizer are built once and reused; this function
    # is applied to every row, so rebuilding them per call is wasted work.
    stop_words, lemmatizer = _get_text_cleaning_resources()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Function to load data and preprocess it
def load_and_preprocess_data_for_models(file_path):
    """Load the reviews CSV and add a cleaned-text column.

    Args:
        file_path: Location of the CSV, passed straight to ``pandas.read_csv``.
            The file must contain ``review_description`` and ``rating``
            columns.

    Returns:
        A new DataFrame with the columns ``review_description``, ``rating``
        and ``cleaned_review`` (the output of ``clean_text``). Rows whose
        rating is missing are dropped.

    Raises:
        KeyError: If either required column is absent from the CSV.
    """
    df = pd.read_csv(file_path)

    # Keep only the two columns used downstream and drop rows that have no
    # rating, since they cannot serve as a training target. The explicit
    # copy makes the column assignments below operate on an independent frame.
    df = df[['review_description', 'rating']].dropna(subset=['rating']).copy()

    # Fill before casting: astype(str) on its own turns a missing review into
    # the literal word "nan", which would then be treated as review text.
    df['review_description'] = df['review_description'].fillna('').astype(str)
    df['cleaned_review'] = df['review_description'].apply(clean_text)
    return df

# Function to train and evaluate different models
def train_and_evaluate_models(df):
    """Train several ensemble regressors on TF-IDF features and average their predictions.

    The data is split 80/20 with a fixed seed. Each model is fitted on the
    training part and asked to predict ratings for the held-out part; the
    mean of those predictions is what gets recorded (no error metric is
    computed here).

    Args:
        df: DataFrame with a ``cleaned_review`` text column and a ``rating``
            target column.

    Returns:
        dict mapping each model's short name (``'XGB'``, ``'RF'``, ``'GBM'``,
        ``'AB'``, ``'ET'``) to the mean of its predicted ratings on the
        held-out split, plus ``'Aggregate rating'``: the mean of those
        per-model values.
    """
    seed = 42

    # Split the raw text first so the vectorizer below only ever sees
    # training rows; fitting TF-IDF on the full data would leak the held-out
    # reviews' vocabulary and document frequencies into the features.
    reviews_train, reviews_test, y_train, _y_test = train_test_split(
        df['cleaned_review'], df['rating'], test_size=0.2, random_state=seed
    )

    # Feature extraction using TF-IDF (vocabulary learned from the training split)
    tfidf = TfidfVectorizer(max_features=1000)
    # Dense arrays are kept as in the original pipeline.
    # NOTE(review): sparse input would use less memory, but confirm that every
    # model treats implicit zeros the same way before switching.
    X_train = tfidf.fit_transform(reviews_train).toarray()
    X_test = tfidf.transform(reviews_test).toarray()

    # Define models; the same seed as the split makes repeated runs comparable.
    models = {
        'XGB': XGBRegressor(random_state=seed),
        'RF': RandomForestRegressor(random_state=seed),
        'GBM': GradientBoostingRegressor(random_state=seed),
        'AB': AdaBoostRegressor(random_state=seed),
        'ET': ExtraTreesRegressor(random_state=seed),
    }

    # Train each model and store the average rating it predicts
    results = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        results[model_name] = np.mean(model.predict(X_test))

    # Aggregate results: computed before the key is added, so it averages the
    # per-model values only.
    results['Aggregate rating'] = np.mean(list(results.values()))

    return results

# Function to create a comparison table
def create_comparison_table(results, df):
    """Build, print and return a one-row table of the per-model results.

    Args:
        results: Mapping of column label to value, e.g. the per-model average
            ratings plus ``'Aggregate rating'``. Columns appear in the
            mapping's iteration order.
        df: The DataFrame the models were run on; only its row count is used.

    Returns:
        A single-row DataFrame whose first column, ``'App reviews'``, holds
        the number of rows in ``df`` and whose remaining columns are the
        entries of ``results``.
    """
    # Get the total number of reviews (just for context)
    total_reviews = df.shape[0]

    # Derive the columns from ``results`` rather than repeating the model
    # names here, so adding or renaming a model needs no change in this
    # function.
    columns = {'App reviews': [total_reviews]}
    columns.update({label: [value] for label, value in results.items()})
    comparison_df = pd.DataFrame(columns)

    # Display the results in a tabulated form
    print(tabulate(comparison_df, headers='keys', tablefmt='pipe', floatfmt=".6f"))

    return comparison_df

# Main function to load data, train models, and compare ratings
def main_model_comparison(file_path, output_path='model_comparison_results.csv'):
    """Run the full pipeline: load, train, tabulate, and save the comparison.

    Args:
        file_path: Path of the input reviews CSV.
        output_path: Where the comparison table is written as CSV. Defaults to
            ``'model_comparison_results.csv'`` in the current working
            directory.

    Returns:
        None. The table is printed and written to ``output_path``.
    """
    # Load and preprocess data
    df = load_and_preprocess_data_for_models(file_path)

    # Train models and get results
    results = train_and_evaluate_models(df)

    # Create (and print) the comparison table
    comparison_table = create_comparison_table(results, df)

    # Save the comparison table as CSV, without the DataFrame index column
    comparison_table.to_csv(output_path, index=False)
    print(f"\nThe comparison table has been saved to '{output_path}'.")

# Example usage
# Guarded so that importing this file does not start the whole training run;
# it still executes when run as a script or inside a notebook cell, where
# __name__ is '__main__'.
if __name__ == '__main__':
    main_model_comparison('/content/fitbit.csv')
