In [0]:
# Sentiment Analysis for Cyberthon 2020 - 74.81% Accuracy
# By richdom2185

import pandas as pd
import numpy as np
import re

# Load the labelled training data, using the 'id' column as the row index,
# then preview the first rows to confirm the file parsed as expected.
df = pd.read_csv(
    'train.csv',
    index_col='id',
    encoding='utf-8',
)
df.head()

In [0]:
# Normalise the text: lowercase it, then drop every character that is not an
# ASCII letter, a digit or whitespace.
# The upper-case range must be spelled A-Z: the range A-z also covers the ASCII
# characters [ \ ] ^ _ ` that sit between 'Z' and 'a', so those were never removed.
# The pattern is a raw string so that \s is not parsed as a string escape.
# NOTE: any text passed to the model at prediction time needs the same cleaning.
df['review'] = df['review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))

# Vectorise
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# fit_transform returns a SciPy sparse matrix; keep it sparse. Calling .toarray()
# allocates a dense (n_reviews x n_terms) float array, which is not needed to
# inspect the shape below and is very expensive in memory for a bigram vocabulary.
features = tfidf.fit_transform(df.review)
labels = df.rating
features.shape

In [0]:
# YOU WILL NEED A LOT OF RAM FOR THIS (~22GB)
# Advisable to use Google Colab -- run it once using the default 12GB RAM runtime,
# Colab will crash and prompt you whether you want to switch to a 25GB RAM runtime,
# and just like that you effectively doubled your RAM :)

# Creates and fits model to training dataset, then scores it on a held-out split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Hold back part of the labelled data; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['rating'], random_state = 0)

# Feature pipeline: bag-of-words counts -> TF-IDF weights -> multinomial Naive Bayes.
# Both the vectoriser and the TF-IDF transformer are fitted on the training split only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# The held-out split was previously created but never used. Score it with the
# exact same counts -> TF-IDF transformation the classifier was fitted on, so the
# notebook reports a validation accuracy instead of relying on the grader alone.
X_test_tfidf = tfidf_transformer.transform(count_vect.transform(X_test))
print('Hold-out accuracy: {:.4f}'.format(clf.score(X_test_tfidf, y_test)))

In [0]:
# Save model in the event of a crash
# NOTE: only the classifier is pickled here. The fitted count_vect and
# tfidf_transformer are not saved, so they must be re-fitted (or pickled
# separately) before a reloaded classifier can be used in a fresh kernel.
import pickle, time
filename = 'clf_model-{:.3f}.pickle'.format(time.time()) # time.time() prevents overwriting of different models
# Use a context manager so the file is flushed and closed even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)
print('Saved model to disk')

In [0]:
# Load model
# Relies on `filename` being defined by the cell that saved the model; after a
# kernel restart, set `filename` to the pickle's path before running this cell.
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
import pickle
# Use a context manager so the file handle is closed once the model is read.
with open(filename, 'rb') as model_file:
    clf = pickle.load(model_file)
print('Loaded model from disk')

In [0]:
# To manually test -- optional to run
# The classifier was fitted on TF-IDF weighted features, so new text has to go
# through both fitted steps (counts, then TF-IDF) rather than raw counts alone.
clf.predict(tfidf_transformer.transform(count_vect.transform(["It was a really interesting course! Some exercises were challenging but it was very rewarding when you found the solution."])))

In [0]:
# Read the test set. The 'id' column is kept as a regular column here.
df_pred = pd.read_csv('test.csv', encoding='utf-8')

# Apply the same preprocessing as the training set: lowercase, then drop every
# character that is not an ASCII letter, a digit or whitespace.
# The upper-case range must be spelled A-Z: the range A-z also covers the ASCII
# characters [ \ ] ^ _ ` that sit between 'Z' and 'a', so those were never removed.
# The pattern is a raw string so that \s is not parsed as a string escape.
# Keep this pattern in sync with the one used to clean the training reviews.
df_pred['review'] = df_pred['review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))
print(df_pred['review'].values) # for checking that it is correctly formatted

In [0]:
# Predicts ratings
# The classifier was fitted on TF-IDF weighted features, so the test reviews are
# pushed through both fitted steps (counts, then TF-IDF) before predicting.
# Feeding raw counts would give the model inputs on a different scale from training.
X_pred_tfidf = tfidf_transformer.transform(count_vect.transform(df_pred['review'].values))
y_pred = clf.predict(X_pred_tfidf)
print('Predictions:', y_pred)

In [0]:
# Build the submission table (one row per test review: its id and the predicted
# rating) and write it to disk without the DataFrame index.
output = pd.DataFrame(
    {
        'id': df_pred['id'],
        'rating': y_pred,
    }
)
output.to_csv('sentiment-analysis-submission.csv', index=False)

In [0]:
# Submit to CTFSG Grader
import urllib.request, os
import random

# Fetch the grader client module into the working directory so it can be imported.
# NOTE(review): this downloads and then imports code from the tip of a remote
# branch; review the downloaded file (or pin a commit) before running it.
urllib.request.urlretrieve('https://raw.githubusercontent.com/alttablabs/ctfsg-utils/master/pyctfsglib.py', './pyctfsglib.py')
print('Downloaded pyctfsglib.py:', 'pyctfsglib.py' in os.listdir())

# Must come after the download above, since the module does not exist before it.
import pyctfsglib as ctfsg

# Prefer supplying the token through the CTFSG_USER_TOKEN environment variable so
# the secret never has to be saved inside the notebook. The placeholder below is
# only a fallback and still needs to be filled in if the variable is not set.
USER_TOKEN = os.environ.get('CTFSG_USER_TOKEN', "REDACTED") # You need to fill this up
# Pick one of the two grader endpoints at random.
GRADER_URL = random.choice([
"http://challenges.csdc20t.ctf.sg:30011/",
"http://challenges.csdc20t.ctf.sg:30012/"
])

grader = ctfsg.DSGraderClient(GRADER_URL, USER_TOKEN)

# Submitting a file
grader.submitFile('./sentiment-analysis-submission.csv')