In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Train and evaluate a bag-of-words logistic-regression classifier that
# predicts the 'Liked' label from the 'Review' text.

# Step 1: Load the data
# The file is tab-separated despite its .csv extension. quoting=3 is
# csv.QUOTE_NONE: quote characters inside a review are kept as literal text
# instead of being interpreted as field delimiters.
df = pd.read_csv('../reviews.csv', delimiter='\t', quoting=3)
reviews = df['Review']
y = df['Liked']

# Step 2: Split the raw text into training and testing sets
# The split is done BEFORE vectorizing so that held-out reviews cannot
# influence the learned vocabulary (avoids train/test leakage). The selected
# rows depend only on the number of samples and random_state, so the same
# rows are held out as when splitting an already-vectorized matrix.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    reviews, y, test_size=0.2, random_state=42
)

# Step 3: Vectorize the text using CountVectorizer
# The vocabulary is fitted on the training reviews only; the test reviews are
# merely projected onto it (words unseen during training are ignored).
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)
# Whole corpus expressed in the training vocabulary. Kept so that any code
# still referring to `X` keeps working; do not use it to re-fit the vectorizer.
X = vectorizer.transform(reviews)

# Step 4: Train a classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Step 5: Evaluate the model on the held-out reviews
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Step 6: Calculate AUROC
# predict_proba returns one column per class, ordered as classifier.classes_;
# column 1 is taken as the score of the positive class (assumes the labels
# are binary 0/1 -- TODO confirm against the data).
y_prob = classifier.predict_proba(X_test)[:, 1]
auroc = roc_auc_score(y_test, y_prob)
print("AUROC:", auroc)

Accuracy: 0.81
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.88      0.82        96
           1       0.87      0.75      0.80       104

    accuracy                           0.81       200
   macro avg       0.82      0.81      0.81       200
weighted avg       0.82      0.81      0.81       200

AUROC: 0.8731971153846154
