In [None]:
# Import necessary libraries
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from tabulate import tabulate

# Fetch the NLTK corpora needed for stop-word filtering and lemmatization
for _nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(_nltk_resource, quiet=True)

# Text cleaning function with option to remove emoticons
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text, remove_emoticons=True):
    """Normalize one review into a space-separated string of lemmatized words.

    Each whitespace-separated token is reduced to its ASCII letters,
    lower-cased, dropped if it is an English stop word, and lemmatized.

    Args:
        text: Raw review text. Anything that is not a ``str`` yields ``''``.
        remove_emoticons: When True (default) every token without letters is
            discarded. When False, tokens made up purely of symbols (for
            example ``:)`` or an emoji standing alone) are kept verbatim.
            Symbols attached to a word, and emoticons containing letters such
            as ``:D``, are still reduced to their letters.

    Returns:
        The cleaned text, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # Loading the stop-word list and lemmatizer per call is wasteful when this
    # function is applied to every row of a DataFrame, so they are cached.
    stop_words, lemmatizer = _get_clean_text_resources()

    cleaned_tokens = []
    for token in text.split():
        # Strip before lower-casing so that non-ASCII characters whose
        # lower-case form is ASCII are still removed.
        letters = re.sub(r'[^A-Za-z]', '', token).lower()
        if letters:
            if letters not in stop_words:
                cleaned_tokens.append(lemmatizer.lemmatize(letters))
        elif not remove_emoticons and re.fullmatch(r'[^\w\s]+', token):
            # Previously a later regex stripped these unconditionally, which
            # made the remove_emoticons flag have no effect at all.
            cleaned_tokens.append(token)
    return ' '.join(cleaned_tokens)

# Function to load and preprocess data
def load_and_preprocess_data(file_path, remove_emoticons=True):
    """Load a review CSV and derive the cleaned text and sentiment columns.

    Args:
        file_path: Path of a CSV file containing at least the columns
            ``review_description`` and ``rating``.
        remove_emoticons: Forwarded to ``clean_text``.

    Returns:
        A DataFrame with the columns ``review_description``, ``rating``,
        ``cleaned_review`` and ``sentiment``. Rows whose rating is missing are
        dropped because they cannot be labelled; the index is renumbered.
    """
    df = pd.read_csv(file_path)

    # Work on an explicit copy: assigning into a column slice of the loaded
    # frame triggers pandas' chained-assignment warning.
    df = df[['review_description', 'rating']].copy()

    # A missing rating would otherwise fall through to the 'negative' label.
    df = df.dropna(subset=['rating']).reset_index(drop=True)

    # Replace missing reviews with '' first; astype(str) alone would turn NaN
    # into the literal word "nan", which then survives cleaning as a token.
    df['review_description'] = df['review_description'].fillna('').astype(str)

    df['cleaned_review'] = df['review_description'].apply(lambda x: clean_text(x, remove_emoticons))
    df['sentiment'] = df['rating'].apply(label_sentiment)
    return df

# Function to label sentiment
def label_sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Args:
        rating: Numeric rating (integer or fractional).

    Returns:
        ``'positive'`` for ratings of 4 and above, ``'neutral'`` for ratings
        from 3 up to (but excluding) 4, and ``'negative'`` otherwise. NaN
        compares false against both thresholds and therefore also yields
        ``'negative'``; filter missing ratings out before calling.
    """
    if rating >= 4:
        return 'positive'
    # Use a range rather than `== 3`: with an exact match a fractional rating
    # such as 3.5 ranked below a plain 3 and was labelled negative.
    if rating >= 3:
        return 'neutral'
    return 'negative'

# Function to create additional features
def add_text_features(df):
    """Attach character-length and word-count columns for the cleaned reviews.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame with a ``cleaned_review`` column of strings.

    Returns:
        ``df`` with the added columns ``review_length`` (number of characters)
        and ``word_count`` (number of whitespace-separated tokens).
    """
    cleaned_texts = df['cleaned_review'].tolist()

    # Character count of each cleaned review
    df['review_length'] = [len(review) for review in cleaned_texts]

    # Number of whitespace-separated tokens in each cleaned review
    df['word_count'] = [len(review.split()) for review in cleaned_texts]

    return df

# Function to prepare data for modeling
def prepare_data_for_modeling(df):
    """Build TF-IDF features and encoded sentiment labels, split 80/20.

    Args:
        df: DataFrame with the columns ``cleaned_review`` and ``sentiment``.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` where the X values are sparse
        TF-IDF matrices (at most 10000 features) and the y values are the
        integer codes produced by ``LabelEncoder`` for ``sentiment``.
    """
    # Encode the sentiment labels
    le = LabelEncoder()
    y = le.fit_transform(df['sentiment'])

    # Split the raw text first. The row partition depends only on the number
    # of rows and random_state, so it is the same one the previous
    # fit-then-split version produced.
    text_train, text_test, y_train, y_test = train_test_split(
        df['cleaned_review'], y, test_size=0.2, random_state=42
    )

    # Fit the vocabulary and IDF weights on the training rows only, so no
    # information from the held-out rows leaks into the features.
    vectorizer = TfidfVectorizer(max_features=10000)
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    return [X_train, X_test, y_train, y_test]

# Train Random Forest using different VSM techniques
def _rf_accuracy_for_vectorizer(vectorizer, texts, labels):
    """Return the hold-out accuracy of a Random Forest on `vectorizer` features."""
    text_train, text_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )
    # Fit the vectorizer on the training rows only to avoid leaking the
    # held-out rows into the vocabulary / IDF weights.
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    return accuracy_score(y_test, rf_model.predict(X_test))


def train_rf_with_vsm_techniques(df):
    """Compare Random Forest accuracy across several text vectorizations.

    Args:
        df: DataFrame with the columns ``cleaned_review`` and ``sentiment``.

    Returns:
        A DataFrame with one row per technique and the columns
        ``VSM Technique`` and ``Accuracy``, in the order TF/IDF,
        TF/IDF(Bigram), TF/IDF(Trigram), TF.
    """
    # NOTE: ngram_range=(1, 2) / (1, 3) means the "Bigram" and "Trigram"
    # variants also contain the shorter n-grams, not only bi-/trigrams.
    techniques = [
        ('TF/IDF', TfidfVectorizer(max_features=10000)),
        ('TF/IDF(Bigram)', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
        ('TF/IDF(Trigram)', TfidfVectorizer(max_features=10000, ngram_range=(1, 3))),
        ('TF', CountVectorizer(max_features=10000)),
    ]

    results = [
        {
            'VSM Technique': technique,
            'Accuracy': _rf_accuracy_for_vectorizer(vectorizer, df['cleaned_review'], df['sentiment']),
        }
        for technique, vectorizer in techniques
    ]
    return pd.DataFrame(results)

# Function to train Random Forest with and without emoticons
# Default word tokens (two or more word characters) plus runs of symbols.
# The stock TfidfVectorizer pattern drops symbol-only tokens, which would
# erase any emoticons kept in the text before the model ever sees them.
_EMOTICON_AWARE_TOKEN_PATTERN = r'(?u)\b\w\w+\b|[^\w\s]+'


def _rf_tfidf_accuracy_on_frame(frame):
    """Return the hold-out accuracy of a TF-IDF + Random Forest model on `frame`."""
    text_train, text_test, y_train, y_test = train_test_split(
        frame['cleaned_review'], frame['sentiment'], test_size=0.2, random_state=42
    )
    # Fit on the training rows only so the held-out rows do not influence
    # the vocabulary or the IDF weights.
    vectorizer = TfidfVectorizer(max_features=10000, token_pattern=_EMOTICON_AWARE_TOKEN_PATTERN)
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    return accuracy_score(y_test, rf_model.predict(X_test))


def train_rf_with_emoticons(df_with_emoticons, df_without_emoticons):
    """Compare Random Forest accuracy on text with and without emoticons.

    Both frames go through the same tokenizer; on text that contains no
    symbol tokens it yields the same tokens as the default pattern.

    Args:
        df_with_emoticons: DataFrame (``cleaned_review``, ``sentiment``)
            cleaned with emoticons kept.
        df_without_emoticons: Same columns, cleaned with emoticons removed.

    Returns:
        A two-row DataFrame with the columns ``VSM Technique`` and
        ``Accuracy``.
    """
    variants = [
        ('TF/IDF with Emoticons', df_with_emoticons),
        ('TF/IDF without Emoticons', df_without_emoticons),
    ]
    results = [
        {'VSM Technique': label, 'Accuracy': _rf_tfidf_accuracy_on_frame(frame)}
        for label, frame in variants
    ]
    return pd.DataFrame(results)

# Train Gradient Boosting models with different learning rates
def train_gbm_with_different_learning_rates(X_train, X_test, y_train, y_test):
    """Fit one GradientBoostingClassifier per learning rate and score each.

    Args:
        X_train, X_test: Feature matrices for fitting and evaluation.
        y_train, y_test: Matching target labels.

    Returns:
        A DataFrame with the columns ``Learning Rate`` and ``Accuracy``, one
        row per rate in the order 0.05, 0.1, 0.25, 0.5, 0.75, 1.0.
    """

    def _accuracy_at(rate):
        # 100 boosting stages, fixed seed; only the learning rate varies.
        gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=rate, random_state=42)
        gbm.fit(X_train, y_train)
        predictions = gbm.predict(X_test)
        return accuracy_score(y_test, predictions)

    rows = [
        {'Learning Rate': rate, 'Accuracy': _accuracy_at(rate)}
        for rate in (0.05, 0.1, 0.25, 0.5, 0.75, 1.0)
    ]
    return pd.DataFrame(rows)

# Function to train ensemble classifiers and predict numeric ratings
def analyze_and_predict(df):
    """Predict ratings with several ensembles and flag sentiment conflicts.

    A Random Forest predicts sentiment and five ensemble classifiers predict
    the numeric rating on the same 80/20 split. A discrepancy is recorded
    whenever predicted sentiment is positive but the predicted rating is
    below 3, or predicted sentiment is negative but the rating is 4 or more.

    Args:
        df: DataFrame with the columns ``review_description``,
            ``cleaned_review``, ``rating``, ``sentiment``, ``review_length``
            and ``word_count``.

    Returns:
        A tuple ``(numeric_predictions, y_test_numeric, y_pred_sentiment,
        discrepancies)``:
        - ``numeric_predictions``: dict of model name -> predicted ratings
          for the test rows (original rating values).
        - ``y_test_numeric``: the true ratings of the test rows.
        - ``y_pred_sentiment``: LabelEncoder codes of the predicted sentiment.
        - ``discrepancies``: list of dicts with the keys ``Model``,
          ``Review``, ``Predicted Sentiment`` and ``Predicted Rating``.
    """
    from scipy import sparse  # local import: scipy is not imported at file level

    # Split row positions once so that the features, both targets and the raw
    # review text all refer to exactly the same train/test rows.
    row_positions = np.arange(len(df))
    train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

    # Vectorize the text; fit on the training rows only to avoid leakage
    vectorizer = TfidfVectorizer(max_features=10000)
    text_train = vectorizer.fit_transform(df['cleaned_review'].iloc[train_pos])
    text_test = vectorizer.transform(df['cleaned_review'].iloc[test_pos])

    # Combine text features and additional features. Stay sparse: a dense
    # rows x 10002 float array grows very quickly with the number of reviews.
    extra_features = df[['review_length', 'word_count']].to_numpy(dtype=float)
    X_train = sparse.hstack([text_train, sparse.csr_matrix(extra_features[train_pos])], format='csr')
    X_test = sparse.hstack([text_test, sparse.csr_matrix(extra_features[test_pos])], format='csr')

    # Prepare numeric rating target. Ratings are mapped to consecutive codes
    # starting at 0 because XGBClassifier expects class labels of that form
    # (ratings such as 1..5 are rejected by newer xgboost releases).
    rating_encoder = LabelEncoder()
    y_train_numeric = rating_encoder.fit_transform(df['rating'].iloc[train_pos])
    y_test_numeric = df['rating'].iloc[test_pos]

    # Prepare sentiment target
    le = LabelEncoder()
    y_sentiment = le.fit_transform(df['sentiment'])
    y_train_sentiment = y_sentiment[train_pos]

    # Models for numeric rating prediction
    numeric_models = {
        'XGB': XGBClassifier(n_estimators=100, random_state=42),
        'RF': RandomForestClassifier(n_estimators=100, random_state=42),
        'GBM': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'AB': AdaBoostClassifier(n_estimators=100, random_state=42),
        'ET': ExtraTreesClassifier(n_estimators=100, random_state=42)
    }

    # Model for sentiment analysis (Random Forest)
    sentiment_model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Store numeric predictions and discrepancies
    numeric_predictions = {}
    discrepancies = []

    # Train sentiment model
    sentiment_model.fit(X_train, y_train_sentiment)
    y_pred_sentiment = sentiment_model.predict(X_test)

    # Compare on decoded labels instead of hard-coded codes 0 / 2, which are
    # only correct when all three sentiment classes occur in the data.
    predicted_labels = le.inverse_transform(y_pred_sentiment)

    # Review text of the test rows, in prediction order. Indexing the full
    # frame with the prediction index would return unrelated reviews.
    test_reviews = df['review_description'].iloc[test_pos].to_numpy()

    # Train and predict numeric ratings
    for name, model in numeric_models.items():
        model.fit(X_train, y_train_numeric)
        # Translate predicted class codes back to the original rating values
        y_pred_numeric = rating_encoder.inverse_transform(model.predict(X_test))
        numeric_predictions[name] = y_pred_numeric

        # Check for discrepancies
        for review, sent_label, num_pred in zip(test_reviews, predicted_labels, y_pred_numeric):
            if (sent_label == 'positive' and num_pred < 3) or (sent_label == 'negative' and num_pred >= 4):
                discrepancies.append({
                    'Model': name,
                    'Review': review,
                    'Predicted Sentiment': sent_label,
                    'Predicted Rating': num_pred
                })

    return numeric_predictions, y_test_numeric, y_pred_sentiment, discrepancies





# Main function to handle comprehensive analysis and print all tables
def main_full_analysis(file_path):
    """Run every experiment on one review CSV, print the tables, save CSVs.

    Prints Tables 10, 11, 12 and 9 (in that order) plus the list of
    sentiment/rating discrepancies, then writes ``rf_vsm_results.csv``,
    ``rf_emoticon_results.csv``, ``gbm_learning_rate_results.csv`` and
    ``discrepancies.csv`` to the current working directory.

    Args:
        file_path: Path of the review CSV passed to
            ``load_and_preprocess_data``.
    """
    # Load and preprocess data without and with emoticons for Table 10 & 11
    df_with_emoticons = load_and_preprocess_data(file_path, remove_emoticons=False)
    df_without_emoticons = load_and_preprocess_data(file_path, remove_emoticons=True)

    # Add additional features for analysis (Table 9).
    # add_text_features works in place, so this is the same frame object.
    df_with_features = add_text_features(df_with_emoticons)

    # Table 10: Random Forest with Different VSM Techniques
    print("\n--- Table 10: Random Forest with Different VSM Techniques ---")
    vsm_results = train_rf_with_vsm_techniques(df_without_emoticons)
    print(tabulate(vsm_results, headers='keys', tablefmt='pipe', floatfmt='.4f'))

    # Table 11: Random Forest with and without Emoticons
    print("\n--- Table 11: Random Forest with and without Emoticons ---")
    emoticon_results = train_rf_with_emoticons(df_with_emoticons, df_without_emoticons)
    print(tabulate(emoticon_results, headers='keys', tablefmt='pipe', floatfmt='.4f'))

    # Table 12: Gradient Boosting with Different Learning Rates
    print("\n--- Table 12: Gradient Boosting with Different Learning Rates ---")
    X_train, X_test, y_train, y_test = prepare_data_for_modeling(df_without_emoticons)
    gbm_results = train_gbm_with_different_learning_rates(X_train, X_test, y_train, y_test)
    print(tabulate(gbm_results, headers='keys', tablefmt='pipe', floatfmt='.4f'))

    # Table 9: Numeric Rating Prediction and Aggregation
    print("\n--- Table 9: Numeric Rating Prediction using Ensemble Classifiers ---")
    numeric_predictions, y_test_numeric, y_pred_sentiment, discrepancies = analyze_and_predict(df_with_features)

    # Aggregate numeric predictions for Table 9.
    # The app name is kept apart from the numeric means: formatting the
    # string with ':.2f' in the loop below raised ValueError.
    app_name = 'Example App'  # Placeholder, use actual app names in a loop if processing multiple files
    aggregate_results = {
        model_name: float(np.mean(preds))
        for model_name, preds in numeric_predictions.items()
    }

    # Print aggregate results
    print("\nAggregate Numeric Predictions (Table 9):")
    print(f"App Name: {app_name}")
    for model_name, agg_pred in aggregate_results.items():
        print(f"{model_name}: {agg_pred:.2f}")

    # Print discrepancies identified for validation and insights
    print("\nDiscrepancies Identified (Additional Insights):")
    # Explicit columns keep the table and CSV header intact when the list is empty
    discrepancies_df = pd.DataFrame(
        discrepancies,
        columns=['Model', 'Review', 'Predicted Sentiment', 'Predicted Rating'],
    )
    print(tabulate(discrepancies_df, headers='keys', tablefmt='pipe'))

    # Save all results to CSV files
    vsm_results.to_csv('rf_vsm_results.csv', index=False)
    emoticon_results.to_csv('rf_emoticon_results.csv', index=False)
    gbm_results.to_csv('gbm_learning_rate_results.csv', index=False)
    discrepancies_df.to_csv('discrepancies.csv', index=False)
    print("\nAll results have been saved to CSV files.")

# Example usage. Guarded so that importing this file does not start the full
# analysis; in a notebook or when run as a script the guard is true.
if __name__ == "__main__":
    main_full_analysis('/content/fitbit.csv')




--- Table 10: Random Forest with Different VSM Techniques ---
|    | VSM Technique   |   Accuracy |
|---:|:----------------|-----------:|
|  0 | TF/IDF          |     0.8171 |
|  1 | TF/IDF(Bigram)  |     0.8166 |
|  2 | TF/IDF(Trigram) |     0.8168 |
|  3 | TF              |     0.8174 |

--- Table 11: Random Forest with and without Emoticons ---
|    | VSM Technique            |   Accuracy |
|---:|:-------------------------|-----------:|
|  0 | TF/IDF with Emoticons    |     0.8171 |
|  1 | TF/IDF without Emoticons |     0.8171 |

--- Table 12: Gradient Boosting with Different Learning Rates ---
|    |   Learning Rate |   Accuracy |
|---:|----------------:|-----------:|
|  0 |          0.0500 |     0.7536 |
|  1 |          0.1000 |     0.7752 |
|  2 |          0.2500 |     0.7974 |
|  3 |          0.5000 |     0.8077 |
|  4 |          0.7500 |     0.8099 |
|  5 |          1.0000 |     0.8114 |

--- Table 9: Numeric Rating Prediction using Ensemble Classifiers ---
