In [1]:
import csv
import pandas as pd
import sklearn.model_selection as skm
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score,
                             confusion_matrix,
                             ConfusionMatrixDisplay,
                             classification_report)
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Read the tab-separated review file. It has no header row, so pandas labels
# the columns 0 (review text) and 1 (0/1 sentiment label).
data = pd.read_csv(
    'yelp_labelled.txt',
    sep='\t',
    header=None,
)
# Preview the first five rows.
data.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Column 0 is the review text (features), column 1 the label (target).
X_text, y = data[0], data[1]

# Step 1: hold out 20% of all rows as the test set, stratified by label.
X_train, X_test, y_train, y_test = skm.train_test_split(
    X_text,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

# Step 2: carve the validation set out of the remaining 80%.
# 12.5% of 80% is 10% of the full data, giving a 70/10/20 train/validation/test split.
X_text_train, X_validation, y_text_train, y_validation = skm.train_test_split(
    X_train,
    y_train,
    test_size=0.125,
    random_state=0,
    stratify=y_train,
)

In [12]:
import fasttext
# Load the FastText model stored in the local file "cc.en.300.bin".
# NOTE(review): judging by the file name this is presumably the pre-trained
# 300-dimensional English Common Crawl model — confirm against the download source.
# The loaded `model` is used by later cells to embed sentences.
model = fasttext.load_model("cc.en.300.bin")

In [13]:
def _fasttext_vectors(texts):
    """Return a list with one FastText sentence vector per entry of ``texts``."""
    return [model.get_sentence_vector(text) for text in texts]


# Embed each split with the same FastText model.
X_train_fasttext = _fasttext_vectors(X_text_train)
X_validation_fasttext = _fasttext_vectors(X_validation)
X_test_fasttext = _fasttext_vectors(X_test)

In [14]:
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode each split into sentence embeddings.
# The embeddings are consumed by scikit-learn estimators, so keep encode()'s
# default NumPy output instead of requesting torch tensors: tensors only work
# through implicit conversion and cannot be converted at all when the model
# runs on a GPU.
X_train_sentence = sentence_model.encode(X_text_train.tolist())
X_validation_sentence = sentence_model.encode(X_validation.tolist())
X_test_sentence = sentence_model.encode(X_test.tolist())


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Hyperparameter candidates explored by the grid search
param_grid = {
    'C': [0.1, 1, 10],                 # regularization setting
    'solver': ['liblinear', 'lbfgs'],  # optimizers to compare
}
lr = LogisticRegression(max_iter=1000)

# 3-fold cross-validated grid search. Note that the search is fitted on the
# training split only; the validation split is used afterwards as a check.
grid_search = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_fasttext, y_text_train)

# Keep the best configuration found by the search
best_lr = grid_search.best_estimator_

# Score the selected model on the validation split
y_validation_pred = best_lr.predict(X_validation_fasttext)
validation_accuracy = accuracy_score(y_validation, y_validation_pred)
print("Validation Accuracy with FastText + Logistic Regression: %.3f" % validation_accuracy)

# Score the selected model on the held-out test split
y_test_pred = best_lr.predict(X_test_fasttext)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy with FastText + Logistic Regression: %.3f" % test_accuracy)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Validation Accuracy with FastText + Logistic Regression: 0.850
Test Accuracy with FastText + Logistic Regression: 0.755


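# As the gap between validation accuracy (0.850) and test accuracy (0.755) shows, logistic regression on the FastText embeddings overfits somewhat, so the test-set accuracy is not high. Note that this happens even though logistic regression already includes regularization.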

In [None]:
# Hyperparameter candidates explored by the grid search
param_grid = {
    'C': [0.1, 1, 10],                 # regularization setting
    'solver': ['liblinear', 'lbfgs'],  # optimizers to compare
}
lr = LogisticRegression(max_iter=1000)

# 3-fold cross-validated grid search. Note that the search is fitted on the
# training split only; the validation split is used afterwards as a check.
grid_search = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_sentence, y_text_train)

# Keep the best configuration found by the search
best_lr = grid_search.best_estimator_

# Score the selected model on the validation split
y_validation_pred = best_lr.predict(X_validation_sentence)
validation_accuracy = accuracy_score(y_validation, y_validation_pred)
print("Validation Accuracy with Sentence Transformers + Logistic Regression: %.3f" % validation_accuracy)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Validation Accuracy with Sentence Transformers + Logistic Regression: 0.870


In [None]:
# Final check: score the tuned logistic regression on the held-out test embeddings
y_test_pred = best_lr.predict(X_test_sentence)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy with Sentence Transformers + Logistic Regression: %.3f" % test_accuracy)


Test Accuracy with Sentence Transformers + Logistic Regression: 0.880


# With the Sentence Transformers embeddings, test accuracy is slightly higher than validation accuracy. This is most likely noise from the small evaluation sets: the validation set holds only 10% of the data (100 sentences), while the test set holds 20% (200 sentences).

In [None]:
# Seed the estimator so the selected model and the reported scores are
# reproducible across runs (same seed value as the data splits).
gb = GradientBoostingClassifier(random_state=0)

# Hyperparameter candidates: 3 x 3 x 3 = 27 combinations
gb_params = {'n_estimators': [50, 100, 150],
             'learning_rate': [0.01, 0.1, 0.2],
             'max_depth': [3, 5, 7]}

# 5-fold cross-validated grid search on the training split (27 x 5 = 135 fits);
# n_jobs=-1 runs the fits in parallel, as in the logistic-regression searches.
gb_grid = GridSearchCV(gb, gb_params, cv=5, scoring='accuracy', n_jobs=-1)
gb_grid.fit(X_train_fasttext, y_text_train)

# Evaluate the best configuration on validation data
best_gb = gb_grid.best_estimator_
gb_validation_pred = best_gb.predict(X_validation_fasttext)
print("Gradient Boosting Validation Accuracy:", accuracy_score(y_validation, gb_validation_pred))

# Apply on test data and show per-class precision/recall/F1
gb_test_pred = best_gb.predict(X_test_fasttext)
print("Gradient Boosting Test Accuracy:", accuracy_score(y_test, gb_test_pred))
print(classification_report(y_test, gb_test_pred))


Gradient Boosting Validation Accuracy: 0.78
Gradient Boosting Test Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.78      0.80      0.79       100
           1       0.80      0.78      0.79       100

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200



# I chose gradient boosting because it captures non-linear relationships well. Sentiment analysis involves complex relationships in the data, and both FastText and Sentence Transformers encode each sentence as a dense, high-dimensional vector, so a non-linear model such as gradient boosting is a reasonable choice.
# As expected, the FastText embeddings do not work very well: the test accuracy is only 0.79.

In [16]:
# Seed the estimator so the selected model and the reported scores are
# reproducible across runs (same seed value as the data splits).
gb = GradientBoostingClassifier(random_state=0)

# Hyperparameter candidates: 3 x 3 x 3 = 27 combinations
gb_params = {'n_estimators': [50, 100, 150],
             'learning_rate': [0.01, 0.1, 0.2],
             'max_depth': [3, 5, 7]}

# 5-fold cross-validated grid search on the training split (27 x 5 = 135 fits);
# n_jobs=-1 runs the fits in parallel, as in the logistic-regression searches.
gb_grid = GridSearchCV(gb, gb_params, cv=5, scoring='accuracy', n_jobs=-1)
gb_grid.fit(X_train_sentence, y_text_train)

# Evaluate the best configuration on validation data
best_gb = gb_grid.best_estimator_
gb_validation_pred = best_gb.predict(X_validation_sentence)
print("Gradient Boosting Validation Accuracy:", accuracy_score(y_validation, gb_validation_pred))

# Apply on test data and show per-class precision/recall/F1
gb_test_pred = best_gb.predict(X_test_sentence)
print("Gradient Boosting Test Accuracy:", accuracy_score(y_test, gb_test_pred))
print(classification_report(y_test, gb_test_pred))

Gradient Boosting Validation Accuracy: 0.84
Gradient Boosting Test Accuracy: 0.89
              precision    recall  f1-score   support

           0       0.90      0.88      0.89       100
           1       0.88      0.90      0.89       100

    accuracy                           0.89       200
   macro avg       0.89      0.89      0.89       200
weighted avg       0.89      0.89      0.89       200

Best Parameters for Logistic Regression with FastText embeddings: {'C': 10, 'solver': 'liblinear'}


# With the Sentence Transformers embeddings, however, gradient boosting reaches a test accuracy of 0.89, the highest among all models so far.

In [5]:
import stanza

# Pre-trained Stanza pipeline with tokenization and sentiment classification.
# tokenize_no_ssplit=True disables sentence splitting: the input is assumed
# to be pre-split, with sentences separated by a blank line.
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment', tokenize_no_ssplit=True)

# Join the test reviews with blank lines so that each review is one sentence.
collection = '\n\n'.join(X_test)
doc = nlp(collection)

# Predictions are matched to y_test by position, so there must be exactly one
# sentence per test review; fail loudly instead of scoring misaligned labels.
if len(doc.sentences) != len(X_test):
    raise ValueError(
        "Stanza returned %d sentences for %d test reviews; predictions would be misaligned"
        % (len(doc.sentences), len(X_test))
    )

# Stanza sentiment classes are 0 (negative), 1 (neutral) and 2 (positive).
# Collapse them to the binary labels used here: only positive maps to 1;
# negative and neutral both map to 0. Every sentence gets exactly one label.
stanza_predictions = [1 if sentence.sentiment == 2 else 0 for sentence in doc.sentences]

print('Test set accuracy: %.3f' % accuracy_score(y_test, stanza_predictions))

2024-11-14 17:29:07 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 2.92MB/s]                    
2024-11-14 17:29:08 INFO: Downloaded file to C:\Users\DhM\stanza_resources\resources.json
2024-11-14 17:29:08 INFO: Loading these models for language: en (English):
| Processor | Package        |
------------------------------
| tokenize  | combined       |
| mwt       | combined       |
| sentiment | sstplus_charlm |

2024-11-14 17:29:08 INFO: Using device: cpu
2024-11-14 17:29:08 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-14 17:29:08 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-14 17:29:08 INFO: Loading: sentiment
  checkpoint = 

Test set accuracy: 0.915


# Although gradient boosting scores higher than logistic regression, the Stanza model, which is built specifically for sentiment analysis, outperforms both (test accuracy 0.915). Perhaps the data is not complex enough for gradient boosting to show its advantage. It is also possible that searching a broader range of hyperparameters would have improved the gradient boosting score.