In [1]:
import csv
import numpy as np

# Let's load in the data from our training file 
train_data = [line for line in csv.reader(open('train_data.csv'))]

# Get the sentences
train_sentences = [sentence for sentence,label in train_data]

# Get the labels
labels = np.array([int(label) for sentence,label in train_data])

UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 1942: character maps to <undefined>

In [None]:
# Get the test data
test_data = [line for line in csv.reader(open('test_data.csv'))]

# Get the test sentences
test_sentences = [line[0] for line in test_data]

In [None]:
from nltk import word_tokenize

# Let's define a method to split a sentence into words
word_tokenize(train_sentences[0].lower())

In [None]:
# Now let's tokenize all of the words in the training_data
tokenized = []
for i,sent in enumerate(train_sentences):
    tokenized.append(word_tokenize(sent.lower()))

In [None]:
print(tokenized[0])

In [None]:
# Let's flatten our list of (list of words) into a list of words
all_words = [word for sent in tokenized for word in sent]
print(all_words[:100])

In [None]:
# Identify the 1000 most common words in the corpus and use them as our vocabulary
from collections import Counter

counter = Counter(all_words)
vocabulary = [word for word,count in counter.most_common(1000)]
print(vocabulary[:100])

In [None]:
# We add an "<UNK>" token to represent all out-of-vocabulary words
vocabulary = ["<UNK>"] + vocabulary

# Now we invert the array to have a mapping of words to indices
word2index = {word:i for i,word in enumerate(vocabulary)}

print(vocabulary[10])
print(word2index["a"])

In [None]:
# Let's create a count vectorization of every sentence. 
# The value at vector[i] will be the number of times vocabulary[i] appears in the sentence.
def count_vectorize(sent):
    vect = np.zeros(len(vocabulary))
    for word in sent:
        vect[word2index.get(word, 0)] += 1
    
    return vect

print("Original sentence:", sentences[200])
print("")
print("Tokenized sentence:", tokenized[200])
print("")
print("Vectorized:", count_vectorize(tokenized[200]))

In [None]:
# Vectorize all of the training data
features = np.stack([count_vectorize(sent) for sent in tokenized])

In [None]:
# Now we have both the features and the labels
print(features.shape)
print(labels.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into 95%/5%.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.05)

print(X_train.shape)
print(X_test.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Hmmm would I really make make the starter file have the best model...
clf = GaussianNB()
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(sum(y_test == y_pred)/len(y_test))

In [None]:
# Now, let's train on all of the data 
clf.fit(features, labels)

# Prepare the testing data
test_tokenized = [word_tokenize(sent.lower()) for sent in test_sentences]

# Count vectorize the sentences
test_features = np.stack([count_vectorize(sent) for sent in test_tokenized])

y_pred = clf.predict(test_features)

print(y_pred)

In [None]:
# Write the results to a file
open("predictions.csv", "w+").writelines([str(pred) + "\n" for pred in y_pred])