In [31]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from gensim.models import KeyedVectors

In [32]:
# Load the two dataset splits. Both files are read as tab-separated text with
# no header row, so the column names are assigned explicitly afterwards.
train, test = (
    pd.read_csv(path, sep='\t', header=None)
    for path in ('./r8-train-all-terms.txt', './r8-test-all-terms.txt')
)
train.columns = test.columns = ['label', 'content']

In [33]:
class GloveVectorizer:
    """Embed documents as the mean of their words' pre-trained vectors.

    The vectors are read once, at construction time, from a GloVe-format
    text file. ``fit`` learns nothing; it exists so the class offers the
    familiar ``fit`` / ``transform`` / ``fit_transform`` trio.

    Attributes
    ----------
    word2vec : dict
        Maps each word to its float32 vector.
    embedding : numpy.ndarray
        All vectors stacked row-wise, in file order; shape ``(V, D)``.
    word2idx : dict
        Maps each word to its row index in ``embedding``.
    V, D : int
        Number of rows in ``embedding`` and the vector dimensionality.
    """

    def __init__(self, path='./glove.6B.50d.txt'):
        """Load word vectors from ``path``.

        Parameters
        ----------
        path : str or path-like, default './glove.6B.50d.txt'
            UTF-8 text file with one entry per line in the form
            ``word vec[0] vec[1] vec[2] ...`` (whitespace separated).

        Raises
        ------
        ValueError
            If the file contains no word vectors.
        """
        print('Loading word vectors...')
        word2vec = {}    # word -> vector
        embedding = []   # vectors, in file order
        idx2word = []    # words, in file order
        with open(path, encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if not values:
                    # A blank line (e.g. a trailing newline) carries no entry.
                    continue
                word = values[0]
                vec = np.asarray(values[1:], dtype='float32')
                word2vec[word] = vec
                embedding.append(vec)
                idx2word.append(word)
        print(f'Found {len(word2vec)} word vectors.')

        if not embedding:
            # Without this guard the shape unpacking below fails with an
            # unhelpful "not enough values to unpack" message.
            raise ValueError(f'No word vectors found in {path!r}')

        self.word2vec = word2vec
        self.embedding = np.array(embedding)
        self.word2idx = {word: idx for idx, word in enumerate(idx2word)}  # word -> row index
        self.V, self.D = self.embedding.shape  # vocab size, dimensionality

    def fit(self, data):
        """Do nothing (the vectors are pre-trained) and return ``self``.

        Returning the instance allows ``vectorizer.fit(data).transform(data)``.
        """
        return self

    def transform(self, data):
        """Convert each document in ``data`` to the mean of its word vectors.

        Parameters
        ----------
        data : sized iterable of str
            Documents to embed. Each one is lower-cased and split on
            whitespace. Entries that are not strings (for example the NaN
            pandas uses for a missing cell) are treated as empty documents.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(data), D)``. A document in which no token
            has a known vector is left as an all-zero row; the number of such
            documents is printed.
        """
        X = np.zeros((len(data), self.D))
        emptycount = 0  # documents for which no token had a vector
        for n, sentence in enumerate(data):
            tokens = sentence.lower().split() if isinstance(sentence, str) else []
            vecs = [self.word2vec[word] for word in tokens if word in self.word2vec]
            if vecs:
                X[n] = np.array(vecs).mean(axis=0)
            else:
                emptycount += 1
        print(f'Number of samples with no words found: {emptycount} / {len(data)}')
        return X

    def fit_transform(self, data):
        """Call ``fit`` and then ``transform`` on the same ``data``."""
        return self.fit(data).transform(data)

In [34]:
# Build the vectorizer once (this loads the word vectors), then turn the text
# column of each split into a feature matrix; the label column is the target.
vectorizer = GloveVectorizer()

Xtrain, Ytrain = vectorizer.fit_transform(train['content']), train['label']
Xtest, Ytest = vectorizer.transform(test['content']), test['label']

Loading word vectors...
Found 400000 word vectors.
Number of samples with no words found: 0 / 5485
Number of samples with no words found: 0 / 2189


In [35]:
# Create the model, train it, and print its scores on both splits.
# A random forest is stochastic; fixing the seed makes the printed scores
# repeat exactly on every "Restart Kernel -> Run All".
RANDOM_STATE = 42
model = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))

train score: 0.9992707383773929
test score: 0.9333028780264961
