# Boosting, Bagging, NB, and SVC Implementations for Text Classification: A Performance Comparison

In [2]:
import os
from time import time
import numpy as np
import pandas as pd, polars as pl
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (RandomForestClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier,
                              HistGradientBoostingClassifier)
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt

### Read in data from txt directories (`pos`, `neg`, `train`, `test`)

In [3]:
def read_data(split: str, sentiment: str):
    dir = os.listdir(f'data/{split}/{sentiment}/') # directory of reviews

    review_rating = {}
    for file in dir:
        rating = file.split("_")[-1].split(".")[0] # rating is in the file name
        with open(f"data/{split}/{sentiment}/{file}") as file:
            for line in file:
                line = " ".join(line.split("<br />")) # replace line break symbols with spaces
                review_rating[line] = rating
    return review_rating

### Form `train` and `test` sets

In [12]:
pos_train = read_data('train', 'pos')
neg_train = read_data('train', 'neg')
pos_test = read_data('test', 'neg')
neg_test = read_data('test', 'neg')

train = pos_train | neg_train
test = pos_test | neg_test

In [13]:
train_data = pd.DataFrame(data=train.items(), 
                          index=range(len(train.keys())), 
                          columns=['Review', 'Rating'])

test_data = pd.DataFrame(data=test.items(), 
                          index=range(len(test.keys())), 
                          columns=['Review', 'Rating'])

In [None]:
X_train = TfidfVectorizer().fit_transform(train_data.Review).toarray()
X_test = TfidfVectorizer().fit_transform(test_data.Review).toarray()

y_train = train_data.Rating
y_test = test_data.Rating

### Read in train_data from `.feat` file

In [3]:
train_data = []
with open("data/train/labeledBow.feat") as file:
    for line in file:
        line = [(line:=line.split())[0]] + [l.split(":")[1] for l in line[1:500]]
        train_data.append(line)

train = pd.DataFrame(train_data).fillna(0).rename({0:'Rating'}, axis=1)

### Read in test_data from `.feat` file

In [4]:
test_data = []
with open("data/test/labeledBow.feat") as file:
    for line in file:
        line = [(line:=line.split())[0]] + [l.split(":")[1] for l in line[1:500]]
        test_data.append(line)

test = pd.DataFrame(test_data).fillna(0).rename({0:'Rating'}, axis=1)

### Clean `train` and `test` data

In [5]:
def clean(df):
    df = df.copy()

    df = df.fillna(0).rename({0:'Rating'}, axis=1)
    df.columns = ['Rating'] + [f"Feature{i}" for i in range(1, df.shape[1])]

    for c in df.columns:
        df[c] = df[c].astype(np.int64)

    return df

In [6]:
train = clean(train)
test = clean(test)

### Split train, test, and validation sets

In [None]:
X_train = train.drop('Rating', axis=1).to_numpy()
y_train = train.Rating.to_numpy().ravel()

X_test = test.drop('Rating', axis=1).to_numpy()
y_test = test.Rating.to_numpy().ravel()

X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.1)

### Building Each Model

In [8]:
dtree = DecisionTreeClassifier(max_depth=12, random_state=1234)

randfrst = RandomForestClassifier(n_estimators=500,
                            max_features=0.06,
                            n_jobs=6,
                            random_state=1234)

mnb = MultinomialNB()

dec_stump = DecisionTreeClassifier(max_depth=1, max_features=0.06)                      
ab = AdaBoostClassifier(estimator=dec_stump, 
                        n_estimators=500, 
                        learning_rate=0.5, 
                        random_state=1234)

gradboost = GradientBoostingClassifier(n_estimators=2000,
                                 subsample=0.67,
                                 max_features=0.06,
                                 validation_fraction=0.1,
                                 n_iter_no_change=15,
                                 verbose=0,
                                 random_state=1234)

histgrad = HistGradientBoostingClassifier(max_iter=2000,
                                      validation_fraction=0.1,
                                      n_iter_no_change=15,
                                      verbose=0,
                                      random_state=1234)

xgb = XGBClassifier(n_estimators=2000,
                    tree_method='hist',
                    subsample=0.67,
                    colsample_level=0.06,
                    verbose=0,
                    n_jobs=6,
                    random_state=1234)

catboost = CatBoostClassifier(n_estimators=2000,
                        colsample_bylevel=0.06,
                        max_leaves=31,
                        subsample=0.67,
                        verbose=0,
                        thread_count=6,
                        random_state=1234)

### Model Comparison

In [16]:
models = [dtree, randfrst, mnb, ab, gradboost, histgrad, xgb, catboost]
model_names = [i.__class__.__name__ for i in models]

In [17]:
earlystops = ['XGBClassifier',
             'CatBoostClassifier']

### Run Models + Runtime and Manual Accuracy

In [18]:
results = pd.DataFrame(columns=['Accuracy', 'Run Time'])

for m, n in zip(models, model_names):
    
    start_time = time()
    # if n in earlystops:
    #     m.fit(X_train_sub,
    #           y_train_sub,
    #           eval_set = [(X_val, y_val)],
    #           early_stopping_rounds=15,
    #           verbose=0)
    # else:
    m.fit(X_train, y_train)
    
    run_time = time() - start_time
    accuracy = np.mean(m.predict(X_test) == y_test)
        
    results.loc[n] = [accuracy, run_time]
    
    del m

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5 6 7], got [ 1  2  3  4  7  8  9 10]

### Report results

In [1]:
print(results)

NameError: name 'results' is not defined