In [1]:
# Attach Google Drive to the Colab VM so files under MyDrive can be read and
# written through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install sentence-transformers



Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=689846c41a5126c8fd305781d06f93a2fbef69eb4da1ce736bb69cf4931b690d
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tr

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sentence_transformers import SentenceTransformer

# Load the training dataset
train_data_path = '/content/drive/MyDrive/DataMining/Project/train_yelp_60k.csv'
train_data = pd.read_csv(train_data_path)

# Load the testing dataset
test_data_path = '/content/drive/MyDrive/DataMining/Project/test_yelp_60k.csv'
test_data = pd.read_csv(test_data_path)

# Hold out 20% of the labelled reviews for validation; the fixed seed keeps the
# split reproducible across kernel restarts.
# NOTE(review): the split is not stratified; if the class balance matters for
# the comparison, consider passing stratify=train_data['Class'].
X_train, X_val, y_train, y_val = train_test_split(
    train_data['Text'],
    train_data['Class'],
    test_size=0.2,
    random_state=42,
)

# train_test_split keeps the original row labels of the shuffled rows. Reset
# the index on features AND labels so all four Series are 0..n-1 and stay
# aligned with each other and with the embedding matrices built below.
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

# Text Embeddings
# Convert documents to pre-trained sentence embeddings
model = SentenceTransformer('bert-base-nli-mean-tokens')

# encode() expects a list of strings and indexes its input by integer position,
# so hand it plain lists instead of relying on the Series index being 0..n-1.
X_train_embed = model.encode(X_train.tolist())
X_val_embed = model.encode(X_val.tolist())

# One embedding row per document, in the same order as the labels.
assert len(X_train_embed) == len(y_train)
assert len(X_val_embed) == len(y_val)

.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [5]:
# Logistic Regression on the sentence embeddings.
# max_iter is raised well above the default so the solver has room to converge.
logistic_regression = LogisticRegression(max_iter=10000)

# 3-fold cross-validated accuracy on the training split.
logistic_regression_scores = cross_val_score(
    logistic_regression,
    X_train_embed,
    y_train,
    cv=3,
    scoring='accuracy',
)

# Fit on the whole training split, then predict the held-out validation split.
predictions_val_lr = logistic_regression.fit(X_train_embed, y_train).predict(X_val_embed)

# Report cross-validation and validation metrics.
lr_cv_mean = logistic_regression_scores.mean()
lr_cv_std = logistic_regression_scores.std()
print("Logistic Regression Cross-Validation Accuracy: {:.4f} (± {:.4f})".format(lr_cv_mean, lr_cv_std))
print("Logistic Regression Validation Accuracy:", accuracy_score(y_val, predictions_val_lr))
print(classification_report(y_val, predictions_val_lr))


Logistic Regression Cross-Validation Accuracy: 0.8427 (± 0.0023)
Logistic Regression Validation Accuracy: 0.8409166666666666
              precision    recall  f1-score   support

    negative       0.79      0.84      0.82      2784
     neutral       0.48      0.27      0.34      1359
    positive       0.89      0.94      0.91      7857

    accuracy                           0.84     12000
   macro avg       0.72      0.68      0.69     12000
weighted avg       0.82      0.84      0.83     12000



In [6]:
# Train Random Forest
# random_state pins the bootstrap/feature sampling so the scores below are
# reproducible (same seed as the train/validation split); n_jobs=-1 builds the
# trees in parallel without changing the result.
# NOTE(review): max_depth=10000 is far deeper than these trees are likely to
# grow, so it probably behaves like "no depth limit" — confirm this is intended
# and not a copy of LogisticRegression's max_iter=10000.
random_forest = RandomForestClassifier(max_depth=10000, random_state=42, n_jobs=-1)

# 3-fold cross-validated accuracy on the training split.
random_forest_scores = cross_val_score(random_forest, X_train_embed, y_train, cv=3, scoring='accuracy')

# Fit on the whole training split, then predict the held-out validation split.
random_forest.fit(X_train_embed, y_train)
predictions_val_rf = random_forest.predict(X_val_embed)

# Evaluate Random Forest
print("\nRandom Forest Cross-Validation Accuracy: {:.4f} (± {:.4f})".format(random_forest_scores.mean(), random_forest_scores.std()))
print("Random Forest Validation Accuracy:", accuracy_score(y_val, predictions_val_rf))
print(classification_report(y_val, predictions_val_rf))



Random Forest Cross-Validation Accuracy: 0.8238 (± 0.0013)
Random Forest Validation Accuracy: 0.8246666666666667
              precision    recall  f1-score   support

    negative       0.77      0.82      0.79      2784
     neutral       0.48      0.07      0.13      1359
    positive       0.85      0.95      0.90      7857

    accuracy                           0.82     12000
   macro avg       0.70      0.62      0.61     12000
weighted avg       0.79      0.82      0.79     12000



In [7]:
# Support Vector Machine (SVC with its default settings) on the sentence embeddings.
svm_classifier = SVC()

# 3-fold cross-validated accuracy on the training split.
svm_scores = cross_val_score(
    svm_classifier,
    X_train_embed,
    y_train,
    cv=3,
    scoring='accuracy',
)

# Fit on the whole training split, then predict the held-out validation split.
predictions_val_svm = svm_classifier.fit(X_train_embed, y_train).predict(X_val_embed)

# Report cross-validation and validation metrics.
svm_cv_mean = svm_scores.mean()
svm_cv_std = svm_scores.std()
print("\nSVM Cross-Validation Accuracy: {:.4f} (± {:.4f})".format(svm_cv_mean, svm_cv_std))
print("SVM Validation Accuracy:", accuracy_score(y_val, predictions_val_svm))
print(classification_report(y_val, predictions_val_svm))



SVM Cross-Validation Accuracy: 0.8389 (± 0.0003)
SVM Validation Accuracy: 0.8385
              precision    recall  f1-score   support

    negative       0.78      0.85      0.82      2784
     neutral       0.55      0.11      0.19      1359
    positive       0.87      0.96      0.91      7857

    accuracy                           0.84     12000
   macro avg       0.73      0.64      0.64     12000
weighted avg       0.81      0.84      0.81     12000



In [None]:
# Select the best classifier based on validation results.
# Each fitted estimator is kept next to its own validation predictions so the
# name -> model -> score pairing lives in one place and cannot drift apart.
validation_candidates = {
    'Logistic Regression': (logistic_regression, predictions_val_lr),
    'Random Forest': (random_forest, predictions_val_rf),
    'SVM': (svm_classifier, predictions_val_svm),
}
validation_accuracy = {
    name: accuracy_score(y_val, val_predictions)
    for name, (estimator, val_predictions) in validation_candidates.items()
}

# max() returns the first entry on ties, i.e. the earliest candidate listed above.
best_classifier_name = max(validation_accuracy, key=validation_accuracy.get)
best_classifier = validation_candidates[best_classifier_name][0]
print("Selected classifier: {} (validation accuracy {:.4f})".format(
    best_classifier_name, validation_accuracy[best_classifier_name]))

# Make predictions on the test dataset.
# encode() expects a list of strings and indexes its input by integer position,
# so pass a plain list rather than depending on test_data having a 0..n-1 index.
X_test_embed = model.encode(test_data['Text'].tolist())
predictions_test_embed = best_classifier.predict(X_test_embed)

# Every test row must receive exactly one prediction before it is written out.
assert len(predictions_test_embed) == len(test_data)

# Save predictions to my drive
submission_embed = pd.DataFrame({'ID': test_data['ID'], 'CLASS': predictions_test_embed})
submission_embed.to_csv('/content/drive/MyDrive/DataMining/Project/prediction2.csv', index=False)