In [None]:
import numpy as np
import pandas as pd

# Location of the reviews CSV. Change this path to test the code on other datasets.
DATA_PATH = 'Datasets/ottdata.csv'

df = pd.read_csv(DATA_PATH)
df.head()

BERT Embeddings

In [None]:
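# Run once if the packages are missing (sentence-transformers pulls in
# transformers and torch, which the next cell imports directly):
# pip install -U sentence-transformers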

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

# Pre-trained checkpoint shared by the tokenizer and the encoder, so both are
# always loaded from the same source.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_bert_embedding(text):
    """Encode ``text`` with BERT and mean-pool the token vectors into one vector.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed through the module-level ``model`` with
    gradients disabled. The final hidden states are then averaged over the
    token axis, weighting each position by the tokenizer's attention mask.

    The input is no longer padded to 512 tokens. Padding a single sequence only
    adds positions that carry no text, and averaging over them (as an unmasked
    mean does) dilutes the embedding of short reviews with padding vectors.
    The mask-weighted mean is kept so that the pooling stays correct even if
    padding is re-enabled later.

    Args:
        text: The review text to embed.

    Returns:
        A NumPy array holding the mean of the last-layer hidden states over
        the tokens of ``text``.
    """
    # Tokenize and encode the text (no padding: there is only one sequence).
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Hidden state of every token position; dim 1 is the token axis.
    embeddings = outputs.last_hidden_state

    # 1 for real tokens, 0 for padding; the trailing axis lets it broadcast
    # across the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(embeddings.dtype)

    # Masked mean: sum only the attended positions and divide by their count.
    # clamp() guards against a division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    mean_embeddings = (embeddings * mask).sum(dim=1) / token_counts
    return mean_embeddings.squeeze().numpy()

# Run every review through get_bert_embedding (one call per row) and keep the
# resulting vector next to its review in a new column.
review_texts = df['review']
df['bert_embedding'] = review_texts.apply(get_bert_embedding)

df.head()

In [None]:
labels = df['label']

# Stack the per-review vectors stored in df['bert_embedding'] into one 2-D
# feature matrix (one row per review). `bert_embeddings` was previously used
# here and in the train/test split without ever being defined, which raised a
# NameError on a fresh kernel.
bert_embeddings = np.stack(df['bert_embedding'].values)

# Sanity check: exactly one embedding row per label.
if len(labels) != bert_embeddings.shape[0]:
    raise ValueError("Mismatch between number of labels and number of embeddings.")

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed; stratifying on the labels keeps the
# class proportions the same in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(
    bert_embeddings, labels, **split_options)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score

# Random forest with a fixed seed.
clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validation on the training split only; the test split is not
# touched until the final evaluation below.
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Random Forest - Cross-validation scores:", scores)
print("Random Forest - Mean CV score:", scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Random Forest - Test accuracy:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Support vector classifier with a linear kernel and a fixed seed.
svm_clf = SVC(kernel='linear', random_state=42)

# 5-fold cross-validation on the training split.
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=5)
print("SVM - Cross-validation scores:", svm_scores)
print("SVM - Mean CV score:", svm_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred = svm_clf.fit(X_train, y_train).predict(X_test)
print("SVM - Test accuracy:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Logistic regression with the iteration cap set to 1000 and a fixed seed.
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# 5-fold cross-validation on the training split.
log_reg_scores = cross_val_score(log_reg, X_train, y_train, cv=5)
print("Logistic Regression - Cross-validation scores:", log_reg_scores)
print("Logistic Regression - Mean CV score:", log_reg_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred_log_reg = log_reg.fit(X_train, y_train).predict(X_test)
print("Logistic Regression - Test accuracy:", accuracy_score(y_test, y_pred_log_reg))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with its default settings.
# NOTE: this cell relies on cross_val_score and accuracy_score having been
# imported by an earlier cell.
nb_classifier = GaussianNB()

# 5-fold cross-validation on the training split.
nb_scores = cross_val_score(nb_classifier, X_train, y_train, cv=5)
print("Naive Bayes - Cross-validation scores:", nb_scores)
print("Naive Bayes - Mean CV score:", nb_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred_nb = nb_classifier.fit(X_train, y_train).predict(X_test)
print("Naive Bayes - Test accuracy:", accuracy_score(y_test, y_pred_nb))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score


# Multi-layer perceptron: two hidden layers (256 then 128 units), ReLU
# activations, Adam solver, at most 500 iterations, fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)

# 5-fold cross-validation on the training split.
mlp_scores = cross_val_score(mlp, X_train, y_train, cv=5)
print("MLP - Cross-validation scores:", mlp_scores)
print("MLP - Mean CV score:", mlp_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred_mlp = mlp.fit(X_train, y_train).predict(X_test)
print("MLP - Test accuracy:", accuracy_score(y_test, y_pred_mlp))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Same MLP configuration as the previous cell: two hidden layers (256 then
# 128 units), ReLU activations, Adam solver, at most 500 iterations, seed 42.
mlp = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)

# 5-fold cross-validation on the training split.
mlp_scores = cross_val_score(mlp, X_train, y_train, cv=5)
print("MLP - Cross-validation scores:", mlp_scores)
print("MLP - Mean CV score:", mlp_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred_mlp = mlp.fit(X_train, y_train).predict(X_test)

# Metrics computed from the hard class predictions. The precision/recall/F1
# calls use their default averaging mode.
# NOTE(review): presumably the labels are binary - confirm against the dataset.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred_mlp)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# AUC needs scores rather than hard labels, so use the predicted probability
# in column 1 of predict_proba.
y_scores_mlp = mlp.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_scores_mlp)

for metric_label, metric_value in (
    ("MLP - Test accuracy:", accuracy),
    ("MLP - Precision:", precision),
    ("MLP - Recall:", recall),
    ("MLP - F1 Score:", f1),
    ("MLP - AUC:", auc),
):
    print(metric_label, metric_value)


GloVe Embeddings

In [None]:
# Load GloVe embeddings
def load_glove_embeddings(path):
    """Read a GloVe-style text file into a ``{token: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is the token
    and the remaining fields are parsed as its ``float32`` coefficients.

    Args:
        path: Path of the UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a coefficient field cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            # Blank or whitespace-only lines carry no token; indexing
            # values[0] on them would raise IndexError, so skip them.
            if not values:
                continue
            word, coefs = values[0], values[1:]
            embeddings_index[word] = np.asarray(coefs, dtype='float32')
    return embeddings_index

# Embeddings file read by load_glove_embeddings.
GLOVE_PATH = 'glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(GLOVE_PATH)

In [None]:
import numpy as np
import pandas as pd


def preprocess(text):
    """Lower-case ``text`` and split it on whitespace into a list of tokens."""
    return text.lower().split()


def get_embedding(text, embeddings):
    """Average the word vectors of ``text`` into a single fixed-size vector.

    The text is tokenized with ``preprocess``; tokens that are missing from
    ``embeddings`` are ignored.

    Args:
        text: The text to embed.
        embeddings: Mapping from token to its embedding vector.

    Returns:
        The element-wise mean of the vectors of all known tokens. When no
        token is known, a zero vector with the same shape and dtype that an
        averaged vector from ``embeddings`` would have. The fallback used to
        be a hard-coded ``np.zeros(300)``, which produced rows of the wrong
        length for any table that is not 300-dimensional. If ``embeddings``
        itself is empty there is nothing to infer the size from, so the
        historical 300-dimensional zero vector is returned.
    """
    words = preprocess(text)
    # Keep only the vectors of tokens present in the embeddings table.
    word_embeddings = [embeddings[word] for word in words if word in embeddings]

    if word_embeddings:
        # Compute the average of the embeddings.
        return np.mean(word_embeddings, axis=0)

    # No known token: build a zero vector shaped and typed like a real result.
    sample = next(iter(embeddings.values()), None)
    if sample is None:
        return np.zeros(300)
    return np.zeros_like(np.mean([sample], axis=0))

def _glove_vector(review):
    """Embed one review with the loaded GloVe table."""
    return get_embedding(review, glove_embeddings)


# One averaged GloVe vector per review, stored next to the review text.
df['embedding'] = df['review'].apply(_glove_vector)


In [None]:
labels = df['label']

# Sanity check: exactly one embedding per label.
n_labels, n_embeddings = len(labels), len(df['embedding'])
if n_labels != n_embeddings:
    raise ValueError("Mismatch between number of labels and number of embeddings.")

In [None]:
# Turn the column of per-review vectors into one 2-D array (one row per review).
embeddings_matrix = np.stack(df['embedding'].tolist(), axis=0)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed; stratifying on the labels keeps the
# class proportions the same in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(
    embeddings_matrix, labels, **split_options)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score

# Random forest with a fixed seed.
clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validation on the training split only.
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Random Forest - Cross-validation scores:", scores)
print("Random Forest - Mean CV score:", scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Random Forest - Test accuracy:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Support vector classifier with a linear kernel and a fixed seed.
svm_clf = SVC(kernel='linear', random_state=42)

# 5-fold cross-validation on the training split.
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=5)
print("SVM - Cross-validation scores:", svm_scores)
print("SVM - Mean CV score:", svm_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred = svm_clf.fit(X_train, y_train).predict(X_test)
print("SVM - Test accuracy:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Logistic regression with the iteration cap set to 1000 and a fixed seed.
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# 5-fold cross-validation on the training split.
log_reg_scores = cross_val_score(log_reg, X_train, y_train, cv=5)
print("Logistic Regression - Cross-validation scores:", log_reg_scores)
print("Logistic Regression - Mean CV score:", log_reg_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred_log_reg = log_reg.fit(X_train, y_train).predict(X_test)
print("Logistic Regression - Test accuracy:", accuracy_score(y_test, y_pred_log_reg))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with its default settings.
# NOTE: this cell relies on cross_val_score and accuracy_score having been
# imported by an earlier cell.
nb_classifier = GaussianNB()

# 5-fold cross-validation on the training split.
nb_scores = cross_val_score(nb_classifier, X_train, y_train, cv=5)
print("Naive Bayes - Cross-validation scores:", nb_scores)
print("Naive Bayes - Mean CV score:", nb_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred_nb = nb_classifier.fit(X_train, y_train).predict(X_test)
print("Naive Bayes - Test accuracy:", accuracy_score(y_test, y_pred_nb))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score


# Multi-layer perceptron: two hidden layers (256 then 128 units), ReLU
# activations, Adam solver, at most 500 iterations, fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)

# 5-fold cross-validation on the training split.
mlp_scores = cross_val_score(mlp, X_train, y_train, cv=5)
print("MLP - Cross-validation scores:", mlp_scores)
print("MLP - Mean CV score:", mlp_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred_mlp = mlp.fit(X_train, y_train).predict(X_test)
print("MLP - Test accuracy:", accuracy_score(y_test, y_pred_mlp))

Word2Vec

In [None]:
from gensim.models import Word2Vec

# Word2Vec
# Whitespace-tokenize every review. The token lists are both the training
# corpus for Word2Vec and the input for the per-review averages below.
# min_count=1 keeps every token in the vocabulary.
sentences = [row.split() for row in df['review']]
word2vec = Word2Vec(sentences, min_count=1)

# Create a vector for each review by taking the mean of the vectors of its words
review_vectors = []
for sentence in sentences:
    vector_list = [word2vec.wv[word] for word in sentence if word in word2vec.wv.key_to_index]
    if vector_list:
        review_vectors.append(np.mean(vector_list, axis=0))
    else:
        # An empty or whitespace-only review has no word vectors. np.mean([])
        # would return a NaN scalar (with a RuntimeWarning), leaving a ragged
        # feature list that the classifiers cannot consume. Use a zero vector
        # of the model's dimensionality instead.
        review_vectors.append(np.zeros(word2vec.wv.vector_size, dtype=np.float32))

In [None]:
from sklearn.model_selection import train_test_split

labels = df['label']

# 80/20 hold-out split with a fixed seed; stratifying on the labels keeps the
# class proportions the same in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(
    review_vectors, labels, **split_options)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score

# Random forest with a fixed seed.
# NOTE(review): the seed here is 22, while the random forests in the BERT and
# GloVe sections use 42 - confirm whether the difference is intentional.
clf = RandomForestClassifier(random_state=22)

# 5-fold cross-validation on the training split only.
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Random Forest - Cross-validation scores:", scores)
print("Random Forest - Mean CV score:", scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Random Forest - Test accuracy:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Linear-kernel SVM with a fixed seed. This is the same configuration the
# BERT and GloVe sections use, so all three embeddings are compared under an
# identical classifier. This cell previously used a bare SVC(), i.e. the
# library's default kernel and no seed, which confounded the comparison.
svm_clf = SVC(kernel='linear', random_state=42)

# Perform 5-fold cross-validation on the training split
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=5)
print("SVM - Cross-validation scores:", svm_scores)
print("SVM - Mean CV score:", np.mean(svm_scores))

# Train the classifier on the full training data
svm_clf.fit(X_train, y_train)

# Evaluate on the test set
y_pred = svm_clf.predict(X_test)
print("SVM - Test accuracy:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Logistic regression with the iteration cap set to 1000 and a fixed seed.
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# 5-fold cross-validation on the training split.
log_reg_scores = cross_val_score(log_reg, X_train, y_train, cv=5)
print("Logistic Regression - Cross-validation scores:", log_reg_scores)
print("Logistic Regression - Mean CV score:", log_reg_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred_log_reg = log_reg.fit(X_train, y_train).predict(X_test)
print("Logistic Regression - Test accuracy:", accuracy_score(y_test, y_pred_log_reg))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with its default settings.
# NOTE: this cell relies on cross_val_score and accuracy_score having been
# imported by an earlier cell.
nb_classifier = GaussianNB()

# 5-fold cross-validation on the training split.
nb_scores = cross_val_score(nb_classifier, X_train, y_train, cv=5)
print("Naive Bayes - Cross-validation scores:", nb_scores)
print("Naive Bayes - Mean CV score:", nb_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred_nb = nb_classifier.fit(X_train, y_train).predict(X_test)
print("Naive Bayes - Test accuracy:", accuracy_score(y_test, y_pred_nb))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score


# Multi-layer perceptron: two hidden layers (256 then 128 units), ReLU
# activations, Adam solver, at most 500 iterations, fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)

# 5-fold cross-validation on the training split.
mlp_scores = cross_val_score(mlp, X_train, y_train, cv=5)
print("MLP - Cross-validation scores:", mlp_scores)
print("MLP - Mean CV score:", mlp_scores.mean())

# Fit on the full training split, then predict the held-out test split.
y_pred_mlp = mlp.fit(X_train, y_train).predict(X_test)
print("MLP - Test accuracy:", accuracy_score(y_test, y_pred_mlp))