In [11]:
# %pip install xgboost
# %pip install imbalanced-learn  # provides the `imblearn` module

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.sparse import load_npz

In [2]:
# Load the labelled reviews table from the processed-data folder and preview its first rows.
df = pd.read_csv('../data/processed/labelled_data.csv')
df.head()

Unnamed: 0,helpfulVotes,id,address,city,placeName,numberOfReviews,placeRating,webUrl,publishedDate,userRating,...,rule_based_tokens,spacy_tokens,wordpiece_tokens,vader_score,compound_score,vader_category,flair_sentiment,flair_confidence,textblob_sentiment,ground_truth
0,0.0,978474125.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-11-03,5.0,...,"['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","{'neg': 0.0, 'neu': 0.389, 'pos': 0.611, 'comp...",0.9868,positive,positive,0.943697,positive,positive
1,0.0,978053018.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-30,1.0,...,"['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","{'neg': 0.173, 'neu': 0.762, 'pos': 0.066, 'co...",-0.399,negative,positive,0.590723,negative,negative
2,0.0,976992067.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-28,5.0,...,"['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","{'neg': 0.079, 'neu': 0.334, 'pos': 0.587, 'co...",0.9796,positive,positive,0.922921,positive,positive
3,0.0,976690540.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,"['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","{'neg': 0.0, 'neu': 0.68, 'pos': 0.32, 'compou...",0.9468,positive,positive,0.996439,positive,positive
4,0.0,976664122.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,"['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","{'neg': 0.024, 'neu': 0.522, 'pos': 0.455, 'co...",0.9961,positive,positive,0.991161,positive,positive


In [13]:
# Load the pre-computed sparse feature matrices (count and TF-IDF) saved as .npz files.
# NOTE(review): `tdidf_vectors` looks like a typo for "tfidf"; later cells use this
# exact name, so any rename has to be applied to every cell at once.
count_vectors = load_npz('../data/processed/vector_representations/count_vectors.npz')
tdidf_vectors = load_npz('../data/processed/vector_representations/tfidf_vectors.npz')

In [14]:
# Display the matrix summary (storage format, dtype, stored elements, shape).
count_vectors

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 458359 stored elements and shape (11808, 3000)>

In [15]:
# Display the matrix summary (storage format, dtype, stored elements, shape).
tdidf_vectors

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 458359 stored elements and shape (11808, 3000)>

In [6]:
# Integer-encode the majority-vote label: each distinct value in `ground_truth`
# is replaced by its category code (label encoding, not one-hot encoding).
df['ground_truth'] = df['ground_truth'].astype('category').cat.codes

In [7]:
# Count rows per encoded class to check how balanced the labels are.
df['ground_truth'].value_counts()

ground_truth
1    11227
0      581
Name: count, dtype: int64

In [8]:
def train_evaluate_xgboost(X, y, vectorizer_name):
    """Train a multi-class XGBoost model on one feature matrix and report test metrics.

    NOTE: a later cell in this notebook defines another function with this
    same name; once that cell has run, it replaces this definition.

    Parameters
    ----------
    X : numpy.ndarray or scipy sparse matrix
        Feature matrix with one row per sample.
    y : array-like of int
        Class labels encoded as integers 0..K-1.
    vectorizer_name : str
        Name printed in the results header.

    Returns
    -------
    tuple
        ``(model, y_test, y_pred)``: the trained booster, the held-out labels
        and the predicted class labels for the held-out rows.
    """
    # Hold out 20% of the rows for evaluation (fixed seed for repeatability).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Densify anything that exposes .toarray() (e.g. scipy sparse matrices);
    # dense arrays are passed through unchanged. The check is done on the
    # split matrices actually handed to DMatrix.
    # NOTE(review): passing the sparse matrix directly would save memory, but
    # XGBoost may then treat unstored entries as missing rather than zero --
    # confirm before changing.
    if hasattr(X_train, "toarray"):
        X_train, X_test = X_train.toarray(), X_test.toarray()
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Size the softmax from the labels instead of hard-coding 3 classes, so the
    # model does not carry an output for a class that never occurs.
    num_classes = int(np.max(y)) + 1

    # Set XGBoost parameters
    params = {
        'objective': 'multi:softmax',
        'num_class': num_classes,
        'eval_metric': 'mlogloss',
        'max_depth': 6,
        'eta': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8
    }

    # Train the model
    num_rounds = 100
    model = xgb.train(params, dtrain, num_rounds)

    # 'multi:softmax' predictions are class indices (stored as floats), not
    # probabilities: cast them to int. Thresholding at 0.5 would collapse
    # every class >= 1 into class 1.
    y_pred = model.predict(dtest).astype(int)

    # Print results
    print(f"\nResults for {vectorizer_name}:")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    return model, y_test, y_pred

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import numpy as np

def train_evaluate_xgboost(X, y, vectorizer_name, test_size=0.2, random_state=42, num_rounds=100):
    """Train and evaluate an XGBoost classifier, oversampling the training split with SMOTE.

    The data is split with stratification on ``y``; only the training part is
    resampled, so the test metrics are computed on the original class
    distribution.

    Parameters
    ----------
    X : numpy.ndarray or scipy sparse matrix
        Feature matrix with one row per sample.
    y : array-like of int
        Class labels encoded as integers 0..K-1.
    vectorizer_name : str
        Name printed in the results header.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed used for both the split and SMOTE.
    num_rounds : int, default 100
        Number of boosting rounds.

    Returns
    -------
    tuple
        ``(model, y_test, y_pred)``: the trained booster, the held-out labels
        and the predicted class labels for the held-out rows.
    """
    # Stratified split keeps the class ratio the same in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Oversample the training split only, so the test set is never synthetic.
    smote = SMOTE(random_state=random_state)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    print(f"Resampled dataset shape: {X_train.shape, y_train.shape}")
    # Print the resampled class distribution
    print(pd.Series(y_train).value_counts())

    # Densify anything that exposes .toarray() (e.g. scipy sparse matrices);
    # dense arrays are passed through unchanged. The check is done on the
    # matrices actually handed to DMatrix, after resampling.
    # NOTE(review): passing the sparse matrix directly would save memory, but
    # XGBoost may then treat unstored entries as missing rather than zero --
    # confirm before changing.
    if hasattr(X_train, "toarray"):
        X_train = X_train.toarray()
    if hasattr(X_test, "toarray"):
        X_test = X_test.toarray()
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # No per-class weights are computed: the training labels were just
    # resampled by SMOTE, and no weight was ever passed to XGBoost here.
    num_classes = len(np.unique(y_train))

    params = {
        'objective': 'multi:softmax',  # predictions are class labels
        'num_class': num_classes,
        'eval_metric': 'mlogloss',
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'eta': 0.1,  # Learning rate
    }

    # Train the model
    model = xgb.train(params, dtrain, num_rounds)

    # 'multi:softmax' predictions are class indices stored as floats.
    y_pred = model.predict(dtest).astype(int)

    # Print results
    print(f"\nResults for {vectorizer_name}:")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    return model, y_test, y_pred


In [None]:
# Train and evaluate on the count-vector features; only the fitted model is kept.
count_xgb, _, _ = train_evaluate_xgboost(
    X=count_vectors,
    y=df['ground_truth'],
    vectorizer_name='Count Vectorizer',
)

Resampled dataset shape: ((17962, 3000), (17962,))
ground_truth
1    8981
0    8981
Name: count, dtype: int64

Results for Count Vectorizer:
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.79      0.68       116
           1       0.99      0.97      0.98      2246

    accuracy                           0.96      2362
   macro avg       0.80      0.88      0.83      2362
weighted avg       0.97      0.96      0.97      2362

Accuracy: 0.9640


In [16]:
# Train and evaluate on the TF-IDF features; only the fitted model is kept.
tfidf_xgb, _, _ = train_evaluate_xgboost(
    X=tdidf_vectors,
    y=df['ground_truth'],
    vectorizer_name='TF-IDF Vectorizer',
)

Resampled dataset shape: ((17962, 3000), (17962,))
ground_truth
1    8981
0    8981
Name: count, dtype: int64

Results for TF-IDF Vectorizer:
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.67      0.66       116
           1       0.98      0.98      0.98      2246

    accuracy                           0.97      2362
   macro avg       0.82      0.83      0.82      2362
weighted avg       0.97      0.97      0.97      2362

Accuracy: 0.9661
