## Setup

In [90]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import gensim.downloader as gensim_api
from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import word_tokenize

np.set_printoptions(precision=2, suppress=True)


In [2]:
df = pd.read_csv("../data/Merged/reviews_cleaned.csv")

df["review_title_cleaned"] = df["review_title_cleaned"].astype(str)
df["review_text_cleaned"] = df["review_text_cleaned"].astype(str)

df


Unnamed: 0,review_title_cleaned,review_text_cleaned,title_length,text_length,num_special_chars,review_type
0,hey,what be there to say about a fantastic chocola...,1,45,3,positive
1,tasty nutritious in one easy to administer po...,my 15 monthold twin boy be not big fan of spin...,8,280,34,positive
2,absolutely tasty,okay so these little can be not cheap but wh...,2,102,20,positive
3,a good daily roast,this coffee have become one of my daily favori...,4,35,4,positive
4,newman s own turkey vegetable catfood,i have four cat with differ tastebudslikes th...,6,76,15,positive
...,...,...,...,...,...,...
29995,have an artificial vanilla flavor,sorry but i be look for a nice madagascar vani...,5,28,1,negative
29996,still wait,i order this item in august and i be a premium...,2,43,19,negative
29997,expensive,do yourself a favor and go to the nearest supe...,1,27,11,negative
29998,yucky vegetable smoothie addin,gerber change the recipe for the worse this v...,4,51,7,negative


In [3]:
def split_X_y(data: pd.DataFrame, encoder: LabelEncoder, fit: bool = False):
    features = data.drop("review_type", axis=1)
    labels = data.iloc[:, -1].to_numpy().reshape(-1, 1)

    if fit:
        target = encoder.fit_transform(labels)
    else:
        target = encoder.transform(labels)

    return features, np.array(target)


def build_model_pipe(
    vectorizer: CountVectorizer, model: BaseEstimator, include_extra: bool = True
) -> Pipeline:
    column_transformer = ColumnTransformer(
        [
            ("title_vectorizer", vectorizer, "review_title_cleaned"),
            ("text_vectorizer", vectorizer, "review_text_cleaned"),
        ],
        remainder="passthrough" if include_extra else "drop",
    )

    pipe = Pipeline([("vectorize", column_transformer), ("classify", model)])

    return pipe


In [4]:
encoder = LabelEncoder()

train_df = df.sample(frac=0.8)
X_train, y_train = split_X_y(train_df, encoder, fit=True)

test_df = df.drop(train_df.index)
X_test, y_test = split_X_y(test_df, encoder)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


## Bag of Words

In [5]:
pipe_bow_mnb_clf = build_model_pipe(
    vectorizer=CountVectorizer(), model=MultinomialNB(), include_extra=False
)

pipe_bow_mnb_clf.fit(X_train, y_train)

y_pred = pipe_bow_mnb_clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.73      0.68      0.70      2018
           1       0.61      0.71      0.65      1984
           2       0.85      0.76      0.80      1998

    accuracy                           0.72      6000
   macro avg       0.73      0.72      0.72      6000
weighted avg       0.73      0.72      0.72      6000



In [6]:
pipe_bow_xgb_clf = build_model_pipe(
    vectorizer=CountVectorizer(max_features=35000),
    model=XGBClassifier(),
    include_extra=False,
)

pipe_bow_xgb_clf.fit(X_train, y_train)

y_pred = pipe_bow_xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.72      0.75      0.74      2018
           1       0.67      0.67      0.67      1984
           2       0.82      0.79      0.81      1998

    accuracy                           0.74      6000
   macro avg       0.74      0.74      0.74      6000
weighted avg       0.74      0.74      0.74      6000



In [None]:
import joblib
joblib.dump(pipe_bow_xgb_clf, "../data/models/bow_xgb_clf.jlib")

## Binary Bag of Words

In [7]:
pipe_bbow_mnb_clf = build_model_pipe(
    vectorizer=CountVectorizer(max_features=40000, binary=True),
    model=MultinomialNB(),
    include_extra=False,
)

pipe_bbow_mnb_clf.fit(X_train, y_train)

y_pred = pipe_bbow_mnb_clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.75      0.71      0.73      2018
           1       0.63      0.72      0.67      1984
           2       0.85      0.78      0.81      1998

    accuracy                           0.74      6000
   macro avg       0.74      0.74      0.74      6000
weighted avg       0.74      0.74      0.74      6000



In [8]:
pipe_bbow_xgb_clf = build_model_pipe(
    vectorizer=CountVectorizer(max_features=45000, binary=True),
    model=XGBClassifier(),
    include_extra=False,
)

pipe_bbow_xgb_clf.fit(X_train, y_train)

y_pred = pipe_bbow_xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.74      0.75      0.75      2018
           1       0.66      0.68      0.67      1984
           2       0.82      0.79      0.81      1998

    accuracy                           0.74      6000
   macro avg       0.74      0.74      0.74      6000
weighted avg       0.74      0.74      0.74      6000



## N-Grams

In [9]:
pipe_ngram_mnb_clf = build_model_pipe(
    vectorizer=CountVectorizer(ngram_range=(2, 3)), model=MultinomialNB()
)

pipe_ngram_mnb_clf.fit(X_train, y_train)

y_pred = pipe_ngram_mnb_clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.78      0.65      0.71      2018
           1       0.55      0.85      0.67      1984
           2       0.95      0.59      0.72      1998

    accuracy                           0.70      6000
   macro avg       0.76      0.70      0.70      6000
weighted avg       0.76      0.70      0.70      6000



In [10]:
pipe_ngram_xgb_clf = build_model_pipe(
    vectorizer=CountVectorizer(ngram_range=(2, 5)), model=XGBClassifier()
)

pipe_ngram_xgb_clf.fit(X_train, y_train)

y_pred = pipe_ngram_xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred))


## TF-IDF

In [None]:
pipe_tfidf_mnb_clf = build_model_pipe(
    vectorizer=TfidfVectorizer(), model=MultinomialNB(), include_extra=False
)

pipe_tfidf_mnb_clf.fit(X_train, y_train)

y_pred = pipe_tfidf_mnb_clf.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
pipe_tfidf_xgb_clf = build_model_pipe(
    vectorizer=TfidfVectorizer(ngram_range=(2, 5), max_features=400000),
    model=XGBClassifier(),
)

pipe_tfidf_xgb_clf.fit(X_train, y_train)

y_pred = pipe_tfidf_xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred))


## Word2Vec

In [5]:
w2v: KeyedVectors = gensim_api.load("glove-twitter-200")

In [67]:
def vectorize_sentence(text: str):
    words = word_tokenize(text)
    vectors = []
    for it in words:
        if it in w2v.key_to_index:
            vectors.append(w2v[it])
        else:
            vectors.append([0.0] * 200)
    return np.mean(vectors, axis=0).tolist()


In [68]:
X_train_vec = np.array(
    [vectorize_sentence(it) for it in X_train["review_text_cleaned"]]
)
X_test_vec = np.array([vectorize_sentence(it) for it in X_test["review_text_cleaned"]])


In [80]:
from sklearn.linear_model import RidgeClassifier

m = RidgeClassifier()
m.fit(X_train_vec, y_train)
y_pred = m.predict(X_test_vec)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.65      0.63      0.64      2060
           1       0.56      0.55      0.56      1998
           2       0.68      0.71      0.70      1942

    accuracy                           0.63      6000
   macro avg       0.63      0.63      0.63      6000
weighted avg       0.63      0.63      0.63      6000



In [69]:
m = XGBClassifier()
m.fit(X_train_vec, y_train)
y_pred = m.predict(X_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.65      0.65      2060
           1       0.60      0.62      0.61      1998
           2       0.71      0.70      0.70      1942

    accuracy                           0.65      6000
   macro avg       0.65      0.65      0.65      6000
weighted avg       0.65      0.65      0.65      6000

