In [1]:
import numpy as np
import pandas as pd
import gensim
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

data = pd.read_csv('nmf_result.csv')
data.head()


Unnamed: 0.1,Unnamed: 0,original_text,processed_text,nmf_topic,topic_score
0,0,From: lerxst@wam.umd.edu (where's my thing)\nS...,thing car nntp_poste host rac_wam park line wo...,6,0.150893
1,1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,final call summary final call si clock report ...,9,0.022004
2,2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,question organization purdue_university engine...,9,0.030069
3,3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,division line distribution_world nntp_poste ho...,9,0.032953
4,4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,question organization smithsonian_astrophysica...,9,0.016379


In [2]:
def cleanText(text):
    cleaned = re.sub("[^a-zA-Z0-9']"," ",text)
    return cleaned.strip()

In [3]:
x,y = np.asarray(data["processed_text"]),np.asarray(data["nmf_topic"])

x_cleaned = [cleanText(t) for t in x]
x_cleaned[:4]

['thing car nntp poste host rac wam park line wonder enlighten car see day door sport car look late early call door really small addition front bumper separate rest body know tellme model name engine spec year production car make history info funky look car mail thank bring neighborhood lerxst',
 'final call summary final call si clock report keyword si acceleration clock upgrade article line nntp poste host fair number brave soul upgrade si clock oscillator share experience poll send brief message detail experience procedure top speed attain cpu rate speed add card adapter heat sink hour usage day floppy disk functionality floppy especially request summarize next day add network knowledge base do clock upgrade answer poll thank',
 'question organization purdue university engineering computer network line well folk finally give ghost weekend start life way back sooo m market new machine bit soon intend m look pick powerbook maybe bunch question hopefully answer know dirt next round pow

In [4]:
x_tokenized = [[w for w in sentence.split(" ") if w != ""] for sentence in x_cleaned]
x_tokenized[0]


['thing',
 'car',
 'nntp',
 'poste',
 'host',
 'rac',
 'wam',
 'park',
 'line',
 'wonder',
 'enlighten',
 'car',
 'see',
 'day',
 'door',
 'sport',
 'car',
 'look',
 'late',
 'early',
 'call',
 'door',
 'really',
 'small',
 'addition',
 'front',
 'bumper',
 'separate',
 'rest',
 'body',
 'know',
 'tellme',
 'model',
 'name',
 'engine',
 'spec',
 'year',
 'production',
 'car',
 'make',
 'history',
 'info',
 'funky',
 'look',
 'car',
 'mail',
 'thank',
 'bring',
 'neighborhood',
 'lerxst']

In [5]:
# Now we'll create our model 
import time

start = time.time()

model = gensim.models.Word2Vec(x_tokenized,
                 size=100
                 # Size is the length of our vector.
                )

end = round(time.time()-start,2)
print("This process took",end,"seconds.")

This process took 5.76 seconds.


In [6]:
model.wv.most_similar("free")

[('maintain', 0.7264387607574463),
 ('customer', 0.7227741479873657),
 ('fee', 0.7188107967376709),
 ('directly', 0.7004418969154358),
 ('interest', 0.6920909881591797),
 ('existant', 0.6852169632911682),
 ('business', 0.6830903887748718),
 ('redistribute', 0.6816473007202148),
 ('service', 0.6813581585884094),
 ('privacy', 0.6771520376205444)]

## Writing A Class To Create Sequences

In [7]:
class Sequencer():
    
    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 embedding_matrix
                ):
        
        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix
        """
        temp_vocab = Vocab which has all the unique words
        self.vocab = Our last vocab which has only most used N words.
    
        """
        temp_vocab = list(set(all_words))
        self.vocab = []
        self.word_cnts = {}
        """
        Now we'll create a hash map (dict) which includes words and their occurencies
        """
        for word in temp_vocab:
            # 0 does not have a meaning, you can add the word to the list
            # or something different.
            count = len([0 for w in all_words if w == word])
            self.word_cnts[word] = count
            counts = list(self.word_cnts.values())
            indexes = list(range(len(counts)))
        
        # Now we'll sort counts and while sorting them also will sort indexes.
        # We'll use those indexes to find most used N word.
        cnt = 0
        while cnt + 1 != len(counts):
            cnt = 0
            for i in range(len(counts)-1):
                if counts[i] < counts[i+1]:
                    counts[i+1],counts[i] = counts[i],counts[i+1]
                    indexes[i],indexes[i+1] = indexes[i+1],indexes[i]
                else:
                    cnt += 1
        
        for ind in indexes[:max_words]:
            self.vocab.append(temp_vocab[ind])
                    
    def textToVector(self,text):
        # First we need to split the text into its tokens and learn the length
        # If length is shorter than the max len we'll add some spaces (100D vectors which has only zero values)
        # If it's longer than the max len we'll trim from the end.
        tokens = text.split()
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1
        vec = []
        for tok in tokens[:len_v]:
            try:
                vec.append(self.embed_matrix[tok])
            except Exception as E:
                pass
        
        last_pieces = self.seq_len - len(vec)
        for i in range(last_pieces):
            vec.append(np.zeros(100,))
        
        return np.asarray(vec).flatten()

In [8]:
sequencer = Sequencer(all_words = [token for seq in x_tokenized for token in seq],
              max_words = 1200,
              seq_len = 15,
              embedding_matrix = model.wv
             )


In [9]:
test_vec = sequencer.textToVector("i am in love with you")
test_vec

array([0.23834062, 0.86124551, 0.09608242, ..., 0.        , 0.        ,
       0.        ])

In [10]:
test_vec.shape

(1500,)

## PCA (Principal Component Analysis)

In [11]:
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in x_tokenized])
print(x_vecs.shape)

(11314, 1500)


In [12]:
from sklearn.decomposition import PCA
pca_model = PCA(n_components=50)
pca_model.fit(x_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.7076590097263021


In [13]:
x_comps = pca_model.transform(x_vecs)
x_comps.shape

(11314, 50)

In [14]:
label_map = {cat:index for index,cat in enumerate(np.unique(y))}
y_prep = np.asarray([label_map[l] for l in y])

label_map

{'0': 0,
 '1': 1,
 '10': 2,
 '11': 3,
 '12': 4,
 '13': 5,
 '14': 6,
 '15': 7,
 '16': 8,
 '17': 9,
 '18': 10,
 '19': 11,
 '2': 12,
 '3': 13,
 '4': 14,
 '5': 15,
 '6': 16,
 '7': 17,
 '8': 18,
 '9': 19,
 'None': 20}

In [15]:
y

array(['6', '9', '9', ..., '9', '13', '9'], dtype=object)

In [16]:
x_train,x_test,y_train,y_test = train_test_split(x_comps,y,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(9051, 50)
(2263, 50)
(9051,)
(2263,)


In [None]:
start = time.time() 

svm_classifier = SVC(kernel="linear", C=1000)
svm_classifier.fit(x_train,y_train)

end = time.time()
process = round(end-start,2)
print("Support Vector Machine Classifier has fitted, this process took {} seconds".format(process))

In [None]:
svm_classifier.score(x_test,y_test)

In [None]:
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
print("Score of RFC",rfc.score(x_test,y_test))

logreg = LogisticRegression()
logreg.fit(x_train,y_train)
print("Score of LogReg",logreg.score(x_test,y_test))

gnb = GaussianNB()
gnb.fit(x_train,y_train)
print("Score of GaussianNB",gnb.score(x_test,y_test))

bnb = BernoulliNB()
bnb.fit(x_train,y_train)
print("Score of BernoulliNB",bnb.score(x_test,y_test))