## 1. Setup & Load dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from google.colab import drive
drive.mount('p2')

Mounted at p2


In [3]:
df = pd.read_csv('/content/p2/MyDrive/p2/data/preprocessed_500k_imba.csv')
df.fillna('', inplace=True)
df.head()

Unnamed: 0,text,stars,processed_text
0,Three words: Damn good pastries.\n\nA few mor...,4.0,three word damn good pastry word probably best...
1,Easily one of the worst Red Robin locations. T...,0.0,easily one worst red robin location food delic...
2,Maybe I am just spoiled with good Mexican food...,1.0,maybe spoiled good mexican food growing san di...
3,This Wildflower is always kept clean and the e...,4.0,wildflower always kept clean employee nice pot...
4,Favorite bibimbap in the valley! They also hav...,4.0,favorite bibimbap valley also korean fixing sm...


## 2. Preprocess

In [4]:
%pprint
df['processed_text'] = [text.split(' ') for text in df.processed_text]
[w for w in df['processed_text'][1]]

Pretty printing has been turned OFF


['easily', 'one', 'worst', 'red', 'robin', 'location', 'food', 'delicious', 'service', 'agonizingly', 'atrocious', 'went', 'mom', 'lunch', 'service', 'directed', 'table', 'service', 'plummeted', 'server', '!at', 'hospitable', 'attentive', 'smothered', 'table', 'much', 'larger', 'ticket', 'paid', 'almost', '!attention', 'u', 'two', 'tavern', 'double', 'felt', 'like', 'bother', 'wish', 'could', 'remember', 'name', 'poor', 'service', 'gorgeous', 'location']

## 3. Train, Validation, Test Split

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    df['processed_text'], 
    df['stars'], 
    test_size=0.2, 
    stratify=df['stars'], 
    random_state=42
)

x_train, x_val, y_train, y_val = train_test_split(
    x_train, 
    y_train, 
    test_size=0.25, 
    stratify=y_train, 
    random_state=42
)

print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

(300000,)
(100000,)
(100000,)


## 4. Vectorization & Modeling

### 4.1 Define functions

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def identity_tokenizer(text):
    return text

def run_classifier(tfidf, model):
  X_train = tfidf.fit_transform(x_train)
  X_val = tfidf.transform(x_val)

  print("X_train shape:", X_train.shape)
  print("X_val shape:", X_val.shape)

  model.fit(X_train, y_train)

  y_pred = model.predict(X_val)
  print("Evaluation results:")
  print(classification_report(y_val, y_pred, digits=4))

### 4.2 Base Model

In [9]:
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=10000, ngram_range=(1,2))
model = LogisticRegression(multi_class='multinomial', solver='newton-cg')

run_classifier(tfidf, model)

X_train shape: (300000, 10000)
X_val shape: (100000, 10000)
Evaluation results:
              precision    recall  f1-score   support

         0.0     0.7220    0.7914    0.7551     11805
         1.0     0.5168    0.3997    0.4508      9288
         2.0     0.5434    0.4570    0.4965     13361
         3.0     0.5692    0.5442    0.5564     26145
         4.0     0.7511    0.8321    0.7895     39401

    accuracy                         0.6617    100000
   macro avg     0.6205    0.6049    0.6097    100000
weighted avg     0.6506    0.6617    0.6539    100000



### 4.3 Vectorizer Tunings

#### 4.3.1 Max Features = 5000

In [10]:
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=5000, ngram_range=(1,2))
model = LogisticRegression(multi_class='multinomial', solver='newton-cg')

run_classifier(tfidf, model)

X_train shape: (300000, 5000)
X_val shape: (100000, 5000)
Evaluation results:
              precision    recall  f1-score   support

         0.0     0.7145    0.7949    0.7526     11805
         1.0     0.5123    0.3917    0.4439      9288
         2.0     0.5398    0.4506    0.4911     13361
         3.0     0.5693    0.5403    0.5545     26145
         4.0     0.7491    0.8328    0.7887     39401

    accuracy                         0.6598    100000
   macro avg     0.6170    0.6021    0.6062    100000
weighted avg     0.6481    0.6598    0.6514    100000



#### 4.3.3 Max Features = 15000

In [21]:
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=15000, ngram_range=(1,2))
model = LogisticRegression(multi_class='multinomial', solver='newton-cg')

run_classifier(tfidf, model)

X_train shape: (300000, 15000)
X_val shape: (100000, 15000)
Evaluation results:
              precision    recall  f1-score   support

         0.0     0.7243    0.7928    0.7570     11805
         1.0     0.5169    0.4015    0.4519      9288
         2.0     0.5454    0.4563    0.4969     13361
         3.0     0.5690    0.5443    0.5564     26145
         4.0     0.7510    0.8325    0.7896     39401

    accuracy                         0.6622    100000
   macro avg     0.6213    0.6055    0.6104    100000
weighted avg     0.6510    0.6622    0.6543    100000



#### 4.3.3 Unigram

In [12]:
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=10000, ngram_range=(1,1))
model = LogisticRegression(multi_class='multinomial', solver='newton-cg')

run_classifier(tfidf, model)

X_train shape: (300000, 10000)
X_val shape: (100000, 10000)
Evaluation results:
              precision    recall  f1-score   support

         0.0     0.7064    0.7822    0.7424     11805
         1.0     0.4872    0.3711    0.4213      9288
         2.0     0.5133    0.4289    0.4674     13361
         3.0     0.5526    0.5260    0.5390     26145
         4.0     0.7430    0.8260    0.7823     39401

    accuracy                         0.6471    100000
   macro avg     0.6005    0.5869    0.5905    100000
weighted avg     0.6345    0.6471    0.6384    100000



#### 4.3.4 Bigram

In [16]:
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=10000, ngram_range=(2,2))
model = LogisticRegression(multi_class='multinomial', solver='newton-cg')

run_classifier(tfidf, model)

X_train shape: (300000, 10000)
X_val shape: (100000, 10000)
Evaluation results:
              precision    recall  f1-score   support

         0.0     0.6607    0.6933    0.6766     11805
         1.0     0.4673    0.3090    0.3720      9288
         2.0     0.4924    0.3572    0.4141     13361
         3.0     0.5110    0.4724    0.4909     26145
         4.0     0.6807    0.8225    0.7449     39401

    accuracy                         0.6059    100000
   macro avg     0.5624    0.5309    0.5397    100000
weighted avg     0.5890    0.6059    0.5916    100000



#### 4.3.5 Max Document Frequency = 0.75

In [14]:
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=10000, ngram_range=(1,2), max_df=0.75)
model = LogisticRegression(multi_class='multinomial', solver='newton-cg')

run_classifier(tfidf, model)

X_train shape: (300000, 10000)
X_val shape: (100000, 10000)
Evaluation results:
              precision    recall  f1-score   support

         0.0     0.7220    0.7914    0.7551     11805
         1.0     0.5168    0.3997    0.4508      9288
         2.0     0.5434    0.4570    0.4965     13361
         3.0     0.5692    0.5442    0.5564     26145
         4.0     0.7511    0.8321    0.7895     39401

    accuracy                         0.6617    100000
   macro avg     0.6205    0.6049    0.6097    100000
weighted avg     0.6506    0.6617    0.6539    100000



#### 4.3.6 Max Document Frequency = 0.50

In [15]:
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=10000, ngram_range=(1,2), max_df=0.5)
model = LogisticRegression(multi_class='multinomial', solver='newton-cg')

run_classifier(tfidf, model)

X_train shape: (300000, 10000)
X_val shape: (100000, 10000)
Evaluation results:
              precision    recall  f1-score   support

         0.0     0.7217    0.7915    0.7550     11805
         1.0     0.5153    0.3979    0.4491      9288
         2.0     0.5412    0.4563    0.4951     13361
         3.0     0.5694    0.5438    0.5563     26145
         4.0     0.7511    0.8320    0.7895     39401

    accuracy                         0.6614    100000
   macro avg     0.6197    0.6043    0.6090    100000
weighted avg     0.6502    0.6614    0.6535    100000



### 4.4 Model Hyperparameter Tunings

#### 4.4.1 C-value = 0.5

In [17]:
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=10000, ngram_range=(1,2))
model = LogisticRegression(multi_class='multinomial', solver='newton-cg', C=0.5)

run_classifier(tfidf, model)

X_train shape: (300000, 10000)
X_val shape: (100000, 10000)
Evaluation results:
              precision    recall  f1-score   support

         0.0     0.7181    0.7964    0.7552     11805
         1.0     0.5225    0.3857    0.4438      9288
         2.0     0.5496    0.4473    0.4932     13361
         3.0     0.5686    0.5456    0.5569     26145
         4.0     0.7475    0.8364    0.7894     39401

    accuracy                         0.6618    100000
   macro avg     0.6212    0.6023    0.6077    100000
weighted avg     0.6499    0.6618    0.6529    100000



#### 4.4.2 C-value = 0.1

In [18]:
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=10000, ngram_range=(1,2))
model = LogisticRegression(multi_class='multinomial', solver='newton-cg', C=0.1)

run_classifier(tfidf, model)

X_train shape: (300000, 10000)
X_val shape: (100000, 10000)
Evaluation results:
              precision    recall  f1-score   support

         0.0     0.7042    0.7969    0.7477     11805
         1.0     0.5336    0.3158    0.3968      9288
         2.0     0.5414    0.3906    0.4538     13361
         3.0     0.5512    0.5362    0.5436     26145
         4.0     0.7276    0.8508    0.7844     39401

    accuracy                         0.6510    100000
   macro avg     0.6116    0.5781    0.5852    100000
weighted avg     0.6358    0.6510    0.6369    100000



#### Solver = lbfgs

In [19]:
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=10000, ngram_range=(1,2))
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

run_classifier(tfidf, model)

X_train shape: (300000, 10000)
X_val shape: (100000, 10000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Evaluation results:
              precision    recall  f1-score   support

         0.0     0.7143    0.7967    0.7532     11805
         1.0     0.5133    0.3928    0.4450      9288
         2.0     0.5488    0.4406    0.4888     13361
         3.0     0.5703    0.5349    0.5520     26145
         4.0     0.7448    0.8408    0.7899     39401

    accuracy                         0.6605    100000
   macro avg     0.6183    0.6011    0.6058    100000
weighted avg     0.6479    0.6605    0.6511    100000



#### Solver = saga

In [20]:
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=10000, ngram_range=(1,2))
model = LogisticRegression(multi_class='multinomial', solver='saga')

run_classifier(tfidf, model)

X_train shape: (300000, 10000)
X_val shape: (100000, 10000)
Evaluation results:
              precision    recall  f1-score   support

         0.0     0.7220    0.7914    0.7551     11805
         1.0     0.5169    0.3998    0.4509      9288
         2.0     0.5433    0.4570    0.4964     13361
         3.0     0.5692    0.5441    0.5564     26145
         4.0     0.7511    0.8321    0.7895     39401

    accuracy                         0.6617    100000
   macro avg     0.6205    0.6049    0.6097    100000
weighted avg     0.6506    0.6617    0.6539    100000



## 5. Evaluate

In [22]:
from sklearn.metrics import confusion_matrix

tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features=15000, ngram_range=(1,2))
model = LogisticRegression(multi_class='multinomial', solver='newton-cg', C=0.5)

X_train = tfidf.fit_transform(x_train)
X_test = tfidf.transform(x_test)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Evaluation results:")
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))

X_train shape: (300000, 15000)
X_test shape: (100000, 15000)
Evaluation results:
              precision    recall  f1-score   support

         0.0     0.7170    0.7892    0.7514     11805
         1.0     0.5250    0.3904    0.4478      9287
         2.0     0.5452    0.4446    0.4898     13362
         3.0     0.5624    0.5411    0.5516     26145
         4.0     0.7464    0.8345    0.7880     39401

    accuracy                         0.6591    100000
   macro avg     0.6192    0.6000    0.6057    100000
weighted avg     0.6474    0.6591    0.6504    100000

[[ 9317  1569   478   183   258]
 [ 2535  3626  2260   586   280]
 [  771  1458  5941  4213   979]
 [  205   199  1941 14147  9653]
 [  167    55   276  6024 32879]]
