In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Download NLTK data (if not already installed)
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load dataset and keep only the review text and its rating.
# The selection is chained onto the read so no second reference to the
# full frame is kept around.
df = pd.read_csv('fitbit.csv')[['review_description', 'rating']]

# Define function to clean text
# Default stop-word set and lemmatizer, built once on first use and then
# shared by every clean_text call instead of being recreated per review.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text, stop_words=None, lemmatizer=None):
    """Normalize a raw review into a space-separated string of lemmatized words.

    Steps: replace every character that is not an ASCII letter or whitespace
    with a space, lowercase, split on whitespace, drop stop words, lemmatize
    each remaining word, and join the result with single spaces.

    Args:
        text: The raw review. Anything that is not a ``str`` yields ``''``.
        stop_words: Optional container of lowercase words to drop. Defaults to
            the NLTK English stop-word list.
        lemmatizer: Optional object exposing ``lemmatize(word)``. Defaults to
            an NLTK ``WordNetLemmatizer``.

    Returns:
        The cleaned text, or ``''`` when ``text`` is not a string or no words
        survive the cleaning.
    """
    if not isinstance(text, str):
        return ''  # Return empty string if not a valid string

    if stop_words is None or lemmatizer is None:
        default_stop_words, default_lemmatizer = _clean_text_resources()
        if stop_words is None:
            stop_words = default_stop_words
        if lemmatizer is None:
            lemmatizer = default_lemmatizer

    # Substitute a space (not the empty string) so that words separated only
    # by punctuation or digits ("good,bad", "don't") are split apart instead
    # of being fused into a single unknown token.
    text = re.sub(r'[^A-Za-z\s]', ' ', text)
    words = [word for word in text.lower().split() if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# Ensure the 'review_description' column is treated as strings.
# Missing reviews are replaced with '' first: astype(str) on its own can turn
# a missing value into the literal text 'nan', which would pass the string
# check in clean_text and end up in the vocabulary as if it were a real word.
df['review_description'] = df['review_description'].fillna('').astype(str)

# Apply cleaning to the 'review_description' column
df['cleaned_review'] = df['review_description'].apply(clean_text)

# Convert rating to sentiment labels
def label_sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Returns 'positive' when the rating is 4 or more, 'neutral' when it is
    exactly 3, and 'negative' for any other value.
    """
    if rating >= 4:
        return 'positive'
    return 'neutral' if rating == 3 else 'negative'

# Derive the sentiment label of every row from its rating
df['sentiment'] = df['rating'].apply(label_sentiment)

# Turn the cleaned reviews into TF-IDF features (vocabulary capped at
# 10000 terms; adjust max_features if necessary). The vectorizer is fitted
# on every row of df.
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['cleaned_review'])

# Integer-encode the sentiment labels; `le` is kept so the encoding can be
# mapped back to the label names later.
le = LabelEncoder().fit(df['sentiment'])
y = le.transform(df['sentiment'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Train-test split
# NOTE(review): X comes from a TF-IDF vectorizer that was fitted on all rows
# before this split, so the IDF weights have already seen the test documents.
# For a strictly held-out evaluation, split the cleaned text first and fit the
# vectorizer on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models. The two forest ensembles can build their trees in parallel,
# so n_jobs=-1 lets them use every available core; the seeds are unchanged.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42)
}

# Show the per-class metrics under the original sentiment names instead of
# the encoded integers. Passing `labels` explicitly keeps the names aligned
# with the encoder's classes even if a class is absent from the test split.
class_ids = list(range(len(le.classes_)))
class_names = [str(label) for label in le.classes_]

# Train and evaluate models
for name, model in models.items():
    print(f'Training {name}...')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Class probabilities are only needed for the AUC; skip models without them
    y_prob = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    if y_prob is not None:
        # One-vs-rest AUC computed from the per-class probabilities
        auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
    else:
        auc = 'Not available'

    print(f'{name} Accuracy: {accuracy}')
    print(f'{name} AUC: {auc}')
    print(f'Classification Report for {name}:\n')
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))
    print("-" * 50)

Training Random Forest...
Random Forest Accuracy: 0.8171336901602723
Random Forest AUC: 0.884712841023002
Classification Report for Random Forest:

              precision    recall  f1-score   support

           0       0.73      0.93      0.82     24323
           1       0.32      0.01      0.03      6995
           2       0.90      0.90      0.90     35381

    accuracy                           0.82     66699
   macro avg       0.65      0.61      0.58     66699
weighted avg       0.78      0.82      0.78     66699

--------------------------------------------------
Training Gradient Boosting...
Gradient Boosting Accuracy: 0.7752140211997182
Gradient Boosting AUC: 0.864904975566419
Classification Report for Gradient Boosting:

              precision    recall  f1-score   support

           0       0.75      0.77      0.76     24323
           1       0.43      0.02      0.03      6995
           2       0.79      0.93      0.86     35381

    accuracy                          

KeyboardInterrupt: 