In [1]:
import csv
from sklearn.feature_extraction.text import TfidfVectorizer

def load_dataset(path):
    """Read a two-column ``text,label`` CSV file into two parallel lists.

    The file is parsed with ',' as the field delimiter and '|' as the quote
    character. Every non-blank row must contain exactly two fields: the text
    and an integer label.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` is a list of str and ``labels``
        is a list of int, in file order.

    Raises:
        ValueError: If a non-blank row does not have exactly two fields, or
            if a label cannot be converted with ``int()``.
    """
    texts = []
    labels = []
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter=',', quotechar='|')
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line (e.g. a stray newline
                # at the end of the file); there is nothing to load from it.
                continue
            if len(row) != 2:
                raise ValueError(
                    '{}:{}: expected 2 fields (text, label), got {}'.format(
                        path, reader.line_num, len(row)))
            text, label = row
            texts.append(text)
            labels.append(int(label))
    return texts, labels


# Raw text is kept under its own names so that train_x / validation_x / test_x
# only ever refer to the vectorized feature matrices.
train_text, train_y = load_dataset('train.csv')
validation_text, validation_y = load_dataset('validation.csv')
test_text, test_y = load_dataset('test.csv')

# The vectorizer is fitted on the training text only; the validation and test
# text are transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(train_text)
validation_x = vectorizer.transform(validation_text)
test_x = vectorizer.transform(test_text)

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

def evaluate(model):
    """Print accuracy on every split plus precision and recall on the test split.

    Reads the module-level ``train_x``/``train_y``, ``validation_x``/
    ``validation_y`` and ``test_x``/``test_y``. All metrics are computed
    before anything is printed. Returns nothing.

    Args:
        model: An estimator exposing ``score(X, y)`` and ``predict(X)``; it is
            expected to be fitted already.
    """
    splits = (
        ('Training accuracy: ', train_x, train_y),
        ('Validation accuracy: ', validation_x, validation_y),
        ('Test accuracy: ', test_x, test_y),
    )
    accuracies = [
        (caption, model.score(features, labels))
        for caption, features, labels in splits
    ]
    predicted = model.predict(test_x)

    for caption, accuracy in accuracies:
        print(caption, accuracy)
    print('Test precision: ', precision_score(test_y, predicted))
    print('Test recall: ', recall_score(test_y, predicted))

# The three classifiers under comparison. Each keeps its own module-level name
# so it can still be inspected after this cell has run.
model_naive_bayes = MultinomialNB()
model_logistic_regression = LogisticRegression(random_state=0)
model_random_forest = RandomForestClassifier(random_state=0)

experiments = [
    ('Naive Bayes:', model_naive_bayes),
    ('Logistic Regression:', model_logistic_regression),
    ('Random Forest:', model_random_forest),
]

# Fit each classifier on the training split and print its report, with one
# blank line between consecutive reports (none after the last).
for position, (heading, classifier) in enumerate(experiments):
    if position:
        print('')
    print(heading)
    classifier.fit(train_x, train_y)
    evaluate(classifier)

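# Logistic regression and random forest perform best: both beat naive Bayes
# on validation accuracy, test accuracy and test recall.
# On the test dataset, logistic regression is marginally ahead of random forest
# (accuracy 0.977 vs 0.976, recall 0.827 vs 0.818), although random forest
# scores higher on the validation dataset (0.970 vs 0.964).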

Naive Bayes:
Training accuracy:  0.9710330684439887
Validation accuracy:  0.9521531100478469
Test accuracy:  0.956989247311828
Test precision:  1.0
Test recall:  0.6727272727272727

Logistic Regression:
Training accuracy:  0.9735965137144322
Validation accuracy:  0.9641148325358851
Test accuracy:  0.977299880525687
Test precision:  1.0
Test recall:  0.8272727272727273

Random Forest:
Training accuracy:  1.0
Validation accuracy:  0.9700956937799043
Test accuracy:  0.97610513739546
Test precision:  1.0
Test recall:  0.8181818181818182
