In [15]:
# Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
from tabulate import tabulate

# Download NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Function to load and preprocess data
def load_and_preprocess_data(file_path, remove_emoticons=True):
    df = pd.read_csv(file_path)
    df = df[['review_description', 'rating']]
    df['review_description'] = df['review_description'].astype(str)
    df['cleaned_review'] = df['review_description'].apply(lambda x: clean_text(x, remove_emoticons))
    df['sentiment'] = df['rating'].apply(label_sentiment)
    return df

# Text cleaning function with option to remove emoticons
def clean_text(text, remove_emoticons=True):
    if isinstance(text, str):
        if remove_emoticons:
            text = re.sub(r'[^\w\s]', '', text)  # Remove emoticons
        text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove non-alphabetical characters
        text = text.lower()
        stop_words = set(stopwords.words('english'))
        words = [word for word in text.split() if word not in stop_words]
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
        return ' '.join(words)
    else:
        return ''

# Function to label sentiment
def label_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'

# Prepare data for modeling
def prepare_data_for_modeling(df):
    vectorizer = TfidfVectorizer(max_features=10000)
    X = vectorizer.fit_transform(df['cleaned_review'])
    le = LabelEncoder()
    y = le.fit_transform(df['sentiment'])
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest using different VSM techniques
def train_rf_with_vsm_techniques(df):
    results = []

    # TF-IDF Unigram
    vectorizer = TfidfVectorizer(max_features=10000)
    X = vectorizer.fit_transform(df['cleaned_review'])
    X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'VSM Technique': 'TF/IDF', 'Accuracy': accuracy})

    # TF-IDF Bigrams
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
    X = vectorizer.fit_transform(df['cleaned_review'])
    X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'VSM Technique': 'TF/IDF(Bigram)', 'Accuracy': accuracy})

    # TF-IDF Trigrams
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
    X = vectorizer.fit_transform(df['cleaned_review'])
    X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'VSM Technique': 'TF/IDF(Trigram)', 'Accuracy': accuracy})

    # TF (Term Frequency only)
    vectorizer = CountVectorizer(max_features=10000)
    X = vectorizer.fit_transform(df['cleaned_review'])
    X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'VSM Technique': 'TF', 'Accuracy': accuracy})

    return pd.DataFrame(results)

# Function to train Random Forest with and without emoticons
def train_rf_with_emoticons(df_with_emoticons, df_without_emoticons):
    results = []

    # Run Random Forest with Emoticons
    vectorizer = TfidfVectorizer(max_features=10000)
    X = vectorizer.fit_transform(df_with_emoticons['cleaned_review'])
    X_train, X_test, y_train, y_test = train_test_split(X, df_with_emoticons['sentiment'], test_size=0.2, random_state=42)
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'VSM Technique': 'TF/IDF with Emoticons', 'Accuracy': accuracy})

    # Run Random Forest without Emoticons
    vectorizer = TfidfVectorizer(max_features=10000)
    X = vectorizer.fit_transform(df_without_emoticons['cleaned_review'])
    X_train, X_test, y_train, y_test = train_test_split(X, df_without_emoticons['sentiment'], test_size=0.2, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'VSM Technique': 'TF/IDF without Emoticons', 'Accuracy': accuracy})

    return pd.DataFrame(results)

# Train Gradient Boosting models with different learning rates
def train_gbm_with_different_learning_rates(X_train, X_test, y_train, y_test):
    learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1.0]
    results = []
    for rate in learning_rates:
        model = GradientBoostingClassifier(n_estimators=100, learning_rate=rate, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results.append({'Learning Rate': rate, 'Accuracy': accuracy})
    return pd.DataFrame(results)

# Function to generate Table 9: Numeric-Rating Prediction using Ensemble Classifiers
def numeric_rating_prediction(df):
    classifiers = {
        'XGB': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        'RF': RandomForestClassifier(n_estimators=100, random_state=42),
        'GBM': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'AB': AdaBoostClassifier(n_estimators=100, random_state=42),
        'ET': ExtraTreesClassifier(n_estimators=100, random_state=42)
    }

    results = []

    for name, clf in classifiers.items():
        X_train, X_test, y_train, y_test = prepare_data_for_modeling(df)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rating = 5 - mse  # Example of numeric rating conversion
        results.append({'Classifier': name, 'Predicted Rating': f"{rating:.2f}"})

    # Display the table
    table_df = pd.DataFrame(results)
    print("\nTable 9: Numeric-Rating Prediction using Ensemble Classifiers")
    print(tabulate(table_df, headers='keys', tablefmt='pipe', showindex=False))

    return table_df

# Function to run the analysis for client's tables 9, 10, 11, and 12
def main_vsm_and_gbm(file_path):
    # Load and preprocess data without and with emoticons
    df_with_emoticons = load_and_preprocess_data(file_path, remove_emoticons=False)
    df_without_emoticons = load_and_preprocess_data(file_path, remove_emoticons=True)

    # Train and display Table 9
    numeric_results = numeric_rating_prediction(df_without_emoticons)

    # Train Random Forest using different V
        # Train Random Forest using different VSM techniques (Table 10)
    vsm_results = train_rf_with_vsm_techniques(df_without_emoticons)
    print("\nRandom Forest Results with Different VSM Techniques (Table 10):")
    print(tabulate(vsm_results, headers='keys', tablefmt='pipe', floatfmt='.4f'))

    # Train Random Forest with and without emoticons (Table 11)
    emoticon_results = train_rf_with_emoticons(df_with_emoticons, df_without_emoticons)
    print("\nRandom Forest Results with and without Emoticons (Table 11):")
    print(tabulate(emoticon_results, headers='keys', tablefmt='pipe', floatfmt='.4f'))

    # Prepare data for Gradient Boosting (Table 12)
    X_train, X_test, y_train, y_test = prepare_data_for_modeling(df_without_emoticons)

    # Train Gradient Boosting with different learning rates (Table 12)
    gbm_results = train_gbm_with_different_learning_rates(X_train, X_test, y_train, y_test)
    print("\nGradient Boosting Results with Different Learning Rates (Table 12):")
    print(tabulate(gbm_results, headers='keys', tablefmt='pipe', floatfmt='.4f'))

    # Save all tables to CSV files
    numeric_results.to_csv('numeric_rating_results.csv', index=False)
    vsm_results.to_csv('rf_vsm_results.csv', index=False)
    emoticon_results.to_csv('rf_emoticon_results.csv', index=False)
    gbm_results.to_csv('gbm_learning_rate_results.csv', index=False)

    print("\nResults have been saved to 'numeric_rating_results.csv', 'rf_vsm_results.csv', 'rf_emoticon_results.csv', and 'gbm_learning_rate_results.csv'")

# Running the updated pipeline using the 'fitbit.csv' file as input
if __name__ == "__main__":
    main_vsm_and_gbm('/content/fitbit.csv')


Parameters: { "use_label_encoder" } are not used.




Table 9: Numeric-Rating Prediction using Ensemble Classifiers
| Classifier   |   Predicted Rating |
|:-------------|-------------------:|
| XGB          |               4.56 |
| RF           |               4.59 |
| GBM          |               4.42 |
| AB           |               4.47 |
| ET           |               4.61 |

Random Forest Results with Different VSM Techniques (Table 10):
|    | VSM Technique   |   Accuracy |
|---:|:----------------|-----------:|
|  0 | TF/IDF          |     0.8171 |
|  1 | TF/IDF(Bigram)  |     0.8166 |
|  2 | TF/IDF(Trigram) |     0.8168 |
|  3 | TF              |     0.8174 |

Random Forest Results with and without Emoticons (Table 11):
|    | VSM Technique            |   Accuracy |
|---:|:-------------------------|-----------:|
|  0 | TF/IDF with Emoticons    |     0.8171 |
|  1 | TF/IDF without Emoticons |     0.8171 |

Gradient Boosting Results with Different Learning Rates (Table 12):
|    |   Learning Rate |   Accuracy |
|---:|---------------