In [3]:
import csv
import numpy as np

# Let's load in the data from our training file 
train_data = [line for line in csv.reader(open('train_data.csv'))]

# Get the sentences
train_sentences = [sentence for sentence,label in train_data]

# Get the labels
labels = np.array([int(label) for sentence,label in train_data])

UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 1942: character maps to <undefined>

In [202]:
# Get the test data
test_data = [line for line in csv.reader(open('test_data.csv'))]

# Get the test sentences
test_sentences = [line[0] for line in test_data]

In [2]:
from nltk import word_tokenize

# Let's define a method to split a sentence into words
word_tokenize(train_sentences[0].lower())

NameError: name 'train_sentences' is not defined

In [None]:
# Now let's tokenize all of the words in the training_data
tokenized = []
for i,sent in enumerate(train_sentences):
    tokenized.append(word_tokenize(sent.lower()))

In [None]:
print(tokenized[0])

In [102]:
# Let's flatten our list of (list of words) into a list of words
all_words = [word for sent in tokenized for word in sent]
print(all_words[:100])

['``', '==edit', 'warring==', 'you', 'appear', 'to', 'be', 'engaged', 'in', 'an', 'edit', 'war', 'with', 'one', 'or', 'more', 'editors', 'according', 'to', 'your', 'reverts', 'at', 'constitution', 'of', 'massachusetts', '.', 'although', 'repeatedly', 'reverting', 'or', 'undoing', 'another', 'editor', "'s", 'contributions', 'may', 'seem', 'necessary', 'to', 'protect', 'your', 'preferred', 'version', 'of', 'a', 'page', ',', 'on', 'wikipedia', 'this', 'is', 'usually', 'seen', 'as', 'obstructing', 'the', 'normal', 'editing', 'process', ',', 'and', 'often', 'creates', 'animosity', 'between', 'editors', '.', 'instead', 'of', 'edit', 'warring', ',', 'please', 'discuss', 'the', 'situation', 'with', 'the', 'editor', '(', 's', ')', 'involved', 'and', 'try', 'to', 'reach', 'a', 'consensus', 'on', 'the', 'talk', 'page', '.', '—', 'preceding', 'unsigned', 'comment', 'added', 'by']


In [103]:
# Identify the 1000 most common words in the corpus and use them as our vocabulary
from collections import Counter

counter = Counter(all_words)
vocabulary = [word for word,count in counter.most_common(1000)]
print(vocabulary[:100])

['.', 'the', ',', 'to', 'i', "''", 'and', 'of', 'you', 'a', '!', 'is', 'that', '``', 'it', 'in', 'for', 'this', 'not', 'on', ')', 'be', '(', 'as', ':', 'have', 'are', '?', "'s", 'your', 'do', 'article', 'with', 'if', 'was', "n't", 'or', 'but', 'an', 'page', 'wikipedia', 'my', 'from', 'me', 'at', 'by', 'can', 'about', 'talk', 'so', 'what', 'there', 'has', 'would', 'please', 'all', 'will', 'they', 'no', 'he', 'just', 'like', 'one', 'should', 'which', '-', 'any', 'been', 'we', 'here', 'more', 'some', '...', 'other', 'who', 'see', 'up', ';', 'edit', 'his', 'also', 'did', 'think', "'m", 'how', 'shit', 'know', 'because', 'does', 'only', 'why', 'out', "'", 'people', 'when', 'articles', 'am', 'use', 'then', 'now']


In [104]:
# We add an "<UNK>" token to represent all out-of-vocabulary words
vocabulary = ["<UNK>"] + vocabulary

# Now we invert the array to have a mapping of words to indices
word2index = {word:i for i,word in enumerate(vocabulary)}

print(vocabulary[10])
print(word2index["a"])

a
10


In [110]:
# Let's create a count vectorization of every sentence. 
# The value at vector[i] will be the number of times vocabulary[i] appears in the sentence.
def count_vectorize(sent):
    vect = np.zeros(len(vocabulary))
    for word in sent:
        vect[word2index.get(word, 0)] += 1
    
    return vect

print("Original sentence:", sentences[200])
print("")
print("Tokenized sentence:", tokenized[200])
print("")
print("Vectorized:", count_vectorize(tokenized[200]))

Original sentence: Mind your own business, you troll.

Tokenized sentence: ['mind', 'your', 'own', 'business', ',', 'you', 'troll', '.']

Vectorized: [ 1.  1.  0. ...,  0.  0.  0.]


In [111]:
# Vectorize all of the training data
features = np.stack([count_vectorize(sent) for sent in tokenized])

In [112]:
# Now we have both the features and the labels
print(features.shape)
print(labels.shape)

(10000, 1001)
(10000,)


In [133]:
from sklearn.model_selection import train_test_split

# Split the data into 95%/5%.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.05)

print(X_train.shape)
print(X_test.shape)

(9500, 1001)
(500, 1001)


In [195]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Hmmm would I really make make the starter file have the best model...
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB(priors=None)

In [196]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(sum(y_test == y_pred)/len(y_test))

             precision    recall  f1-score   support

          0       0.99      0.48      0.64       446
          1       0.18      0.94      0.30        54

avg / total       0.90      0.53      0.60       500

0.526


In [197]:
# Now, let's train on all of the data 
clf.fit(features, labels)

# Prepare the testing data
test_tokenized = [word_tokenize(sent.lower()) for sent in test_sentences]

# Count vectorize the sentences
test_features = np.stack([count_vectorize(sent) for sent in test_tokenized])

y_pred = clf.predict(test_features)

print(y_pred)

[1 0 0 0 1 1 1 0 1 0 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 1 0 1 0 1 0 0 1 1 1
 1 0 1 0 0 1 1 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 1 0 1 1 0 1 0 0
 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 1 0 1 1 0
 0 1 1 1 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 1 1 1 1 1 0 0 1 0 0 1 0 1 1 1 1 1
 1 1 0 1 1 0 0 1 0 1 1 1 0 0 1 1 0 1 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 1
 1 0 0 0 1 0 1 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1
 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 1 0 1 0
 1 1 0 0 1 0 1 0 1 0 1 1 1 0 1 0 1 0 0 0 1 1 1 1 0 0 1 1 1 0 0 1 1 1 0 0 0
 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 1 0 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 0 0 1
 0 1 1 1 1 1 1 0 1 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0
 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0
 0 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 1 1 1 0 1 1 1 1 1 1 0 0 1 0 0 1 0 0 0 1 0
 1 1 0 0 0 1 1 1 0 0 1 0 

In [198]:
# Write the results to a file
open("predictions.csv", "w+").writelines([str(pred) + "\n" for pred in y_pred])