In [1]:
# Mount Google Drive in Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import SelectKBest, chi2

# Load the training data
train_data_path = '/content/drive/MyDrive/DataMining/Project/train_yelp_60k.csv'
train_data = pd.read_csv(train_data_path)

# Load the test data
test_data_path = '/content/drive/MyDrive/DataMining/Project/test_yelp_60k.csv'
test_data = pd.read_csv(test_data_path)

# Split the training data into features (X) and labels (target) (y)
X_train = train_data['Text']
y_train = train_data['Class']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Bag of Words (BOW)
# Convert documents to BOW numerical vectors
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
X_train_bow = vectorizer.fit_transform(X_train)
X_val_bow = vectorizer.transform(X_val)

# Range of features
k_best_values = [600, 900, 1500, 1800, 2100]

# List of classifiers
classifiers = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

best_accuracy = 0.0
best_classifier_name = ''
best_k_best = 0

# Compare the performance of different classifiers and k_best values using cross-validation
for k_best in k_best_values:
    for name, clf in classifiers.items():
        selector = SelectKBest(chi2, k=k_best)
        X_train_bow_selected = selector.fit_transform(X_train_bow, y_train)

        scores = cross_val_score(clf, X_train_bow_selected, y_train, cv=3, scoring='accuracy')
        mean_accuracy = scores.mean()

        print(f'{name} with k_best={k_best} - Cross-Validation Accuracy: {mean_accuracy:.4f} (± {scores.std():.4f})')

        # Update the best classifier and k_best if a better combination is found
        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_classifier_name = name
            best_k_best = k_best

# Train the selected classifier on the full training data with the best k_best value
best_classifier = classifiers[best_classifier_name]
best_selector = SelectKBest(chi2, k=best_k_best)
X_train_bow_selected = best_selector.fit_transform(X_train_bow, y_train)

best_classifier.fit(X_train_bow_selected, y_train)

# Make predictions on the validation set
X_val_bow_selected = best_selector.transform(X_val_bow)
predictions_val_bow = best_classifier.predict(X_val_bow_selected)

# Evaluate the model
print("Selected Classifier:", best_classifier_name)
print("Best k_best:", best_k_best)
print(classification_report(y_val, predictions_val_bow))
print("Accuracy:", accuracy_score(y_val, predictions_val_bow))

# Make predictions on the test dataset
X_test_bow = vectorizer.transform(test_data['Text'])
X_test_bow_selected = best_selector.transform(X_test_bow)
predictions_test_bow = best_classifier.predict(X_test_bow_selected)

# Save predictions to my drive
output_bow = pd.DataFrame({'ID': test_data['ID'], 'CLASS': predictions_test_bow})
output_bow.to_csv('/content/drive/MyDrive/DataMining/Project/prediction1.csv', index=False)


Multinomial Naive Bayes with k_best=600 - Cross-Validation Accuracy: 0.7699 (± 0.0021)
Random Forest with k_best=600 - Cross-Validation Accuracy: 0.8061 (± 0.0006)
SVM with k_best=600 - Cross-Validation Accuracy: 0.8357 (± 0.0002)
Multinomial Naive Bayes with k_best=900 - Cross-Validation Accuracy: 0.7802 (± 0.0031)
Random Forest with k_best=900 - Cross-Validation Accuracy: 0.8081 (± 0.0007)
SVM with k_best=900 - Cross-Validation Accuracy: 0.8410 (± 0.0006)
Multinomial Naive Bayes with k_best=1500 - Cross-Validation Accuracy: 0.7882 (± 0.0023)
Random Forest with k_best=1500 - Cross-Validation Accuracy: 0.8092 (± 0.0011)
SVM with k_best=1500 - Cross-Validation Accuracy: 0.8439 (± 0.0010)
Multinomial Naive Bayes with k_best=1800 - Cross-Validation Accuracy: 0.7901 (± 0.0028)
Random Forest with k_best=1800 - Cross-Validation Accuracy: 0.8075 (± 0.0008)
SVM with k_best=1800 - Cross-Validation Accuracy: 0.8443 (± 0.0003)
Multinomial Naive Bayes with k_best=2100 - Cross-Validation Accuracy: 