# Boosting, Bagging, NB, and SVC Implementations for Text Classification: A Performance Comparison

### Imports

In [62]:
import os
from time import time
import numpy as np
import pandas as pd, polars as pl
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (RandomForestClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier,
                              HistGradientBoostingClassifier)
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt

### Read in data from txt directories (`pos`, `neg`, `train`, `test`)

In [63]:
def read_data(split: str, sentiment: str):
    dir = os.listdir(f'data/{split}/{sentiment}/') # directory of reviews

    review_rating = {}
    for file in dir:
        rating = file.split("_")[-1].split(".")[0] # rating is in the file name
        with open(f"data/{split}/{sentiment}/{file}") as file:
            for line in file:
                line = " ".join(line.split("<br />")) # replace line break symbols with spaces
                review_rating[line] = rating
    return review_rating

### Form `train` and `test` sets

In [64]:
pos_train = read_data('train', 'pos')
neg_train = read_data('train', 'neg')
pos_test = read_data('test', 'neg')
neg_test = read_data('test', 'neg')

train = pos_train | neg_train
test = pos_test | neg_test

In [65]:
train_data = pd.DataFrame(data=train.items(), 
                          index=range(len(train.keys())), 
                          columns=['Review', 'Rating'])

test_data = pd.DataFrame(data=test.items(), 
                          index=range(len(test.keys())), 
                          columns=['Review', 'Rating'])

#### Download stopwords for vectorizer

In [66]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rohilk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [74]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

X_train = TfidfVectorizer(max_features=100, stop_words=stop_words).fit_transform(train_data.Review).toarray()
X_test = TfidfVectorizer(max_features=100, stop_words=stop_words).fit_transform(test_data.Review).toarray()

y_train = train_data.Rating
y_train = encoder.fit_transform(y_train)
y_test = test_data.Rating
y_test = encoder.transform(y_test)

X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.1)

### Building Each Model

In [86]:
dtree = DecisionTreeClassifier(max_depth=12, random_state=1234)

randfrst = RandomForestClassifier(n_estimators=500,
                            max_features=0.06,
                            n_jobs=6,
                            random_state=1234)

mnb = MultinomialNB()

dec_stump = DecisionTreeClassifier(max_depth=1, max_features=0.06)                      
ab = AdaBoostClassifier(estimator=dec_stump, 
                        n_estimators=500, 
                        learning_rate=0.5, 
                        random_state=1234)

gradboost = GradientBoostingClassifier(n_estimators=2000,
                                 subsample=0.67,
                                 max_features=0.06,
                                 validation_fraction=0.1,
                                 n_iter_no_change=15,
                                 verbose=0,
                                 random_state=1234)

histgrad = HistGradientBoostingClassifier(max_iter=2000,
                                      validation_fraction=0.1,
                                      n_iter_no_change=15,
                                      verbose=0,
                                      random_state=1234)

xgb = XGBClassifier(n_estimators=2000,
                    tree_method='hist',
                    subsample=0.67,
                    colsample_level=0.06,
                    verbose=0,
                    n_jobs=6,
                    random_state=1234)

# catboost = CatBoostClassifier(n_estimators=2000,
#                         colsample_bylevel=0.06,
#                         max_leaves=31,
#                         subsample=0.67,
#                         verbose=0,
#                         thread_count=6,
#                         random_state=1234)

### Model Comparison

In [89]:
models = [dtree, randfrst, mnb, ab, gradboost, histgrad, xgb, #catboost
]
model_names = [i.__class__.__name__ for i in models]

In [90]:
earlystops = ['XGBClassifier',
             'CatBoostClassifier']

### Run Models + Runtime and Manual Accuracy

In [91]:
results = pd.DataFrame(columns=['Accuracy', 'Run Time'])

for m, n in zip(models, model_names):
    
    start_time = time()
    # if n in earlystops:
    #     m.fit(X_train_sub,
    #           y_train_sub,
    #           eval_set = [(X_val, y_val)],
    #           early_stopping_rounds=15,
    #           verbose=0)
    # else:
    m.fit(X_train, y_train)
    
    run_time = time() - start_time
    accuracy = np.mean(m.predict(X_test) == y_test)
        
    results.loc[n] = [accuracy, run_time]
    
    del m

Parameters: { "colsample_level", "verbose" } are not used.



### Report results

In [92]:
print(results)

                                Accuracy    Run Time
DecisionTreeClassifier          0.242861    0.604859
RandomForestClassifier          0.244317    6.479730
MultinomialNB                   0.240272    0.016123
AdaBoostClassifier              0.213251    5.667853
GradientBoostingClassifier      0.216649   12.838163
HistGradientBoostingClassifier  0.211957    4.650105
XGBClassifier                   0.188901  355.328899
