In [1]:
import pandas as pd
import pickle

In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib



In [3]:
# Read the cached word data back from disk. The file is opened in binary mode
# and is closed automatically when the `with` block exits.
# NOTE(review): pickle.load can execute arbitrary code, so only load a trusted file.
with open('data_words.pickle', 'rb') as words_file:
    data_words = pickle.load(words_file)

In [4]:
# Pull the train and test entries out of the loaded dict in a single unpacking.
train_words, test_words = data_words["train_words"], data_words["test_words"]

In [5]:
# Preview the first five rows of the training data.
train_words.head(n=5)

Unnamed: 0,target,final_text
0,1,"[last, night, finish, watch, jane, eyr, 1983, ..."
1,1,"[mayb, sap, sweetest, movi, ever, saw, first, ..."
2,0,"[keep, disney, well, known, practic, steal, me..."
3,1,"[john, water, given, us, genuin, enjoy, film, ..."
4,0,"[start, write, review, break, watch, movi, fir..."


# SAMPLE CORPUS

In [6]:
# Toy corpus: four short sentences used only to illustrate how CountVectorizer
# turns raw strings into a document-term count matrix.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# All-default vectorizer (the printed vocabulary consists of single lowercase words).
sample_vectorizer = CountVectorizer()

# Learn the vocabulary and count term occurrences in one pass.
X = sample_vectorizer.fit_transform(corpus)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out()`. Prefer the new accessor when it exists and
# fall back to the legacy one otherwise. `.tolist()` converts the returned array
# to a plain Python list so the printed output matches the legacy method.
if hasattr(sample_vectorizer, "get_feature_names_out"):
    sample_feature_names = sample_vectorizer.get_feature_names_out().tolist()
else:
    sample_feature_names = sample_vectorizer.get_feature_names()

print(sample_feature_names)
print(X.toarray())  # densified only for display

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [7]:
# Same toy corpus, but counting word bigrams only: ngram_range=(2, 2) sets both
# the minimum and the maximum n-gram length to 2.
sample_vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X2 = sample_vectorizer2.fit_transform(corpus)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out()`. Use the new accessor when available and
# fall back to the legacy one otherwise. `.tolist()` keeps the printed output a
# plain Python list.
if hasattr(sample_vectorizer2, "get_feature_names_out"):
    sample_feature_names2 = sample_vectorizer2.get_feature_names_out().tolist()
else:
    sample_feature_names2 = sample_vectorizer2.get_feature_names()

print(sample_feature_names2)
print(X2.toarray())  # densified only for display

['and this', 'document is', 'first document', 'is the', 'is this', 'second document', 'the first', 'the second', 'the third', 'third one', 'this document', 'this is', 'this the']
[[0 0 1 1 0 0 1 0 0 0 0 1 0]
 [0 1 0 1 0 1 0 1 0 0 1 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 1 0]
 [0 0 1 0 1 0 1 0 0 0 0 0 1]]


# SAMPLE CORPUS DONE

In [8]:
# Upper bound on the number of vocabulary terms; handed to CountVectorizer as
# `max_features` and saved alongside the transformed data in the output cache.
vocabulary_size = 500

In [9]:
def _identity(tokens):
    """Return ``tokens`` unchanged.

    Supplied as both the preprocessor and the tokenizer so that CountVectorizer
    counts the items of each document exactly as given, with no string
    processing. A named function is used instead of a lambda because lambdas
    cannot be pickled with the standard ``pickle`` module, which would make the
    vectorizer itself impossible to persist.
    """
    return tokens


# NOTE(review): the documents in "final_text" appear to be pre-tokenized lists
# of stemmed words, which is why no further tokenization is wanted. Confirm
# against the upstream preprocessing step.
vectorizer = CountVectorizer(
    max_features=vocabulary_size,  # cap the vocabulary at this many terms
    preprocessor=_identity,
    tokenizer=_identity,
    # Ignored whenever a tokenizer is supplied; None avoids the
    # "token_pattern will not be used" warning on newer scikit-learn releases.
    token_pattern=None,
)

In [10]:
# Learn the vocabulary from the training documents only, then reuse that same
# fitted vocabulary to encode the test documents.
train_counts = vectorizer.fit_transform(train_words["final_text"])
test_counts = vectorizer.transform(test_words["final_text"])

# Convert both count matrices to dense arrays.
features_train = train_counts.toarray()
features_test = test_counts.toarray()

In [11]:
# Show the dense training count array as the cell output.
features_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 1, 0, 0]])

In [12]:
# Check the (rows, columns) dimensions of the training data before a column is added.
train_words.shape

(25000, 2)

In [13]:
# Attach one count vector per row as a new "features" column. Iterating over a
# 2-D array yields its rows, so every cell receives a single 1-D array.
train_words["features"] = [row for row in features_train]
test_words["features"] = [row for row in features_test]

In [14]:
# Preview the first five training rows, now including the "features" column.
train_words.head(n=5)

Unnamed: 0,target,final_text,features
0,1,"[last, night, finish, watch, jane, eyr, 1983, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, ..."
1,1,"[mayb, sap, sweetest, movi, ever, saw, first, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,0,"[keep, disney, well, known, practic, steal, me...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,1,"[john, water, given, us, genuin, enjoy, film, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,0,"[start, write, review, break, watch, movi, fir...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, ..."


In [15]:
# Preview the first five test rows, now including the "features" column.
test_words.head(n=5)

Unnamed: 0,target,final_text,features
0,1,"[late, sydney, pollack, come, grown, love, sto...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,1,"[fairli, interest, look, charact, india, burge...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1,"[opportun, see, last, even, local, film, festi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,1,"[anoth, raquel, welch, classic, pictur, hit, t...","[0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ..."
4,0,"[rocketship, x, view, seriou, movi, buff, foll...","[1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ..."


In [16]:
# Bundle everything that should be persisted: both transformed data sets plus
# the vocabulary size that was used to build the features.
cache_data = {}
cache_data["train_transform"] = train_words
cache_data["test_transform"] = test_words
cache_data["vocabulary_size"] = vocabulary_size

In [17]:
# Write the bundled cache to disk in binary mode; the file is closed
# automatically when the `with` block exits.
with open('data_features.pickle', "wb") as cache_file:
    pickle.dump(cache_data, cache_file)