In [1]:
# Install the sentence-transformers library for generating text embeddings
!pip install -q sentence-transformers

# Mount Google Drive and load data
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Define file paths
train_path = '/content/drive/MyDrive/EECS4412/Project/train_yelp_60k.csv'
test_path = '/content/drive/MyDrive/EECS4412/Project/test_yelp_60k.csv'

# Load training and test datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Clean column names
train_df.columns = train_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()

# Preview first few rows
train_df.head()


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Unnamed: 0,Text,Class,ID
0,Chef Kevin Sousa's 2018 award winning restaur...,positive,727658
1,This place has got potential. I did quite enjo...,positive,5407165
2,I was really excited to try this place as they...,neutral,2753394
3,Heyyyyy waffle sandwiches! Who doesn't love a ...,positive,1735407
4,Goods and Provisions has a lovely atmosphere ...,negative,1388216


In [2]:
# Label balance: number of training reviews per sentiment class
print("Class distribution in training set:", train_df['Class'].value_counts(), sep="\n")

# Per-column count of null entries
print("\nMissing values in training data:", train_df.isna().sum(), sep="\n")

# First review in the training frame, printed in full as a sanity check
print("\nSample text from training data:", train_df['Text'].iat[0], sep="\n")


Class distribution in training set:
Class
positive    39328
negative    14028
neutral      6644
Name: count, dtype: int64

Missing values in training data:
Text     0
Class    0
ID       0
dtype: int64

Sample text from training data:


In [3]:
from sentence_transformers import SentenceTransformer

# Pre-trained sentence encoder; the same instance is reused for the test set
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Target labels, taken straight from the training frame
y_train = train_df['Class']

# Raw review strings as a plain Python list, which is what encode() is fed
train_texts = train_df['Text'].to_list()

# One embedding row per review (slow on the full training set)
X_train = embedder.encode(train_texts, show_progress_bar=True)

print("X_train shape:", X_train.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1875 [00:00<?, ?it/s]

X_train shape: (60000, 384)


In [4]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# One seed for the splitter and every estimator, so the reported scores are
# reproducible after Restart & Run All (previously only the splitter was seeded).
SEED = 42

# Define classifiers to evaluate
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=SEED),
    "LinearSVC": LinearSVC(max_iter=2000, random_state=SEED),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=SEED)
}

# 5-fold stratified cross-validation (stratified to keep class ratios per fold)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Evaluate models and report accuracy.
# NOTE(review): the classes are imbalanced (39328 of 60000 reviews are
# 'positive'), so always predicting 'positive' already scores ~0.655 accuracy;
# read the numbers below against that baseline, and consider also reporting
# scoring='f1_macro'.
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
    print(f"{name} CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")


LogisticRegression CV Accuracy: 0.8258 ± 0.0014
LinearSVC CV Accuracy: 0.8262 ± 0.0014
RandomForest CV Accuracy: 0.7834 ± 0.0022


In [5]:
# LinearSVC had the highest mean CV accuracy (0.8262), but its margin over
# LogisticRegression (0.8258) is smaller than the fold-to-fold std (0.0014),
# so the two are effectively tied; LinearSVC is used as the final model.
# random_state is fixed so refitting after a kernel restart gives the same model.
final_clf = LinearSVC(max_iter=2000, random_state=42)
final_clf.fit(X_train, y_train)


In [6]:
import csv

# Embed the test reviews with the same encoder that produced X_train
test_texts = test_df['Text'].tolist()
X_test = embedder.encode(test_texts, show_progress_bar=True)

# One predicted class label per test review
test_preds = final_clf.predict(X_test)

# Write 'prediction2.csv': a header row, then one (ID, predicted class) row
# per test review, in the order the reviews appear in test_df
with open('prediction2.csv', 'w', newline='') as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(['ID', 'CLASS'])
    csv_out.writerows(zip(test_df['ID'], test_preds))


Batches:   0%|          | 0/1875 [00:00<?, ?it/s]