In [96]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import time

In [97]:
df = pd.read_csv("../dataset/facebook_comment_cleaned.tsv",sep='\t',encoding='utf-8')
df['token_arr'] = df['token'].apply(lambda x:eval(x))

def giveClass(score):
    if(score==-32 ):
        return -32
    if(score==0):return 0
    if(score>0):return 1
    if(score<0):return -1
df['class'] = df['score'].apply(giveClass)

In [98]:
df.head()
scored = df[df['scoredAmt']>0]

In [99]:
comments = [i for i in scored['token_arr'].values]
vocabs = Counter([word for comment in comments for word in comment])
comments = [" ".join(i) for i in scored['token_arr'].values]

In [100]:
%%time
#Creating tf-idf
print("Creating the tf-idf matrix...\n")
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
tfidf_vectorizer = TfidfVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = [" "],   \
                             max_features = 15000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
tfidf_train_features = tfidf_vectorizer.fit_transform(comments)
print("tf-idf features created")

Creating the tf-idf matrix...

tf-idf features created
CPU times: user 648 ms, sys: 0 ns, total: 648 ms
Wall time: 647 ms


In [101]:
%%time
X = tfidf_train_features
y = scored['class']

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 288 µs


In [102]:
%%time
X_train,X_test , y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 8.95 ms


In [103]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [70]:
%%time
clf1 = AdaBoostClassifier(random_state=1)
clf2 =  RandomForestClassifier(n_estimators = 50,random_state=1,criterion='entropy')
clf3 = MLPClassifier(hidden_layer_sizes =(500,250),random_state = 1,verbose =True)
clf4 = MLPClassifier()
clf5 = LogisticRegression()
clf6 = KNeighborsClassifier()


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 210 µs


In [29]:
%%time
clfs = [clf1,clf2,clf3,clf4,clf5,clf6]
names = ['Ada','RandomForest','MultinomialNB','Neural Network','Logistic Regression','KNeighbors']
for i,clf in enumerate(clfs):
    start = time.time()
    print("Start training model: %s"%names[i])
    clf.fit(X_train,y_train)
    end = time.time()
    print("Finished training in {} seconds".format(round(end - start),2))
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print("Accuracy: {}%".format(round((acc*100),2)))
    print(cm)

Start training model: Ada
Finished training in 4 seconds
Accuracy: 59.85%
[[529  67  18  21]
 [ 95 587  39  41]
 [153  99  53  69]
 [108  92  62 119]]
Start training model: RandomForest
Finished training in 3 seconds
Accuracy: 61.71%
[[508  81  23  23]
 [ 80 624  36  22]
 [ 76 134  87  77]
 [ 75 122  75 109]]
Start training model: MultinomialNB
Finished training in 0 seconds
Accuracy: 54.04%
[[304 314   1  16]
 [  3 757   0   2]
 [ 19 301   9  45]
 [ 31 252   5  93]]
Start training model: Neural Network
Finished training in 267 seconds
Accuracy: 60.69%
[[464  69  35  67]
 [ 69 550  81  62]
 [ 74  97 110  93]
 [ 58  62  79 182]]
Start training model: Logistic Regression
Finished training in 0 seconds
Accuracy: 65.8%
[[533  72  11  19]
 [ 78 652  13  19]
 [108 110  63  93]
 [ 90  79  44 168]]
Start training model: KNeighbors
Finished training in 0 seconds
Accuracy: 34.43%
[[604  11   8  12]
 [694  60   4   4]
 [308  17  29  20]
 [308   6  19  48]]
CPU times: user 4min 35s, sys: 3.61 s, t

In [108]:
%%time
clf1 = RandomForestClassifier(n_estimators = 70,random_state=1,criterion='entropy')
clf2 = LogisticRegression()
clf3 = MLPClassifier(hidden_layer_sizes =(400,250,150),random_state = 1,verbose =True,max_iter=10)

#We will choose RandomForest, LogisticRegression, and neural network to form ensemble model
eclf1 = VotingClassifier(estimators=[('rf', clf1), ('lr', clf2), ('mlp', clf3)], voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
y_pred = eclf1.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(acc)
print(cm)

Iteration 1, loss = 1.15264062
Iteration 2, loss = 0.74704961
Iteration 3, loss = 0.53885464
Iteration 4, loss = 0.39832123
Iteration 5, loss = 0.29024543
Iteration 6, loss = 0.23208796
Iteration 7, loss = 0.17231346
Iteration 8, loss = 0.14794951
Iteration 9, loss = 0.10948597
Iteration 10, loss = 0.09378809




0.66124535316
[[545  62  12  16]
 [ 69 663  17  13]
 [100 136  57  81]
 [ 91  88  44 158]]
CPU times: user 3min 39s, sys: 20min 53s, total: 24min 33s
Wall time: 3min 10s


In [1]:
#Try with cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(eclf1, X, y, cv=5)

NameError: name 'eclf1' is not defined

In [44]:
#Let's tune random forest first
n_estimators = 50
start = time.time()
clf2 = RandomForestClassifier(n_estimators = n_estimators,random_state=1,criterion='entropy')
print("Start training model: Random Forest")
clf2.fit(X_train,y_train)
end = time.time()
print("Finished training in {} seconds".format(round(end - start),2))
y_pred = clf2.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy: {}%".format(round((acc*100),2)))
print(cm)

Start training model: Random Forest
Finished training in 14 seconds
Accuracy: 65.2%
[[507  85  21  22]
 [ 47 687  18  10]
 [ 65 148  66  95]
 [ 61 118  59 143]]


In [None]:

#Let's tune random forest first
n_estimators = 50
start = time.time()
clf4 = MLPClassifier(hidden_layer_sizes =(1000,500),random_state = 1,verbose =True)
print("Start training model: Neural Network")
clf4.fit(X_train,y_train)
end = time.time()
print("Finished training in {} seconds".format(round(end - start),2))
y_pred = clf4.predict(X_test)
acc = accuracy_score(clf4.predict(X_train), y_train)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy(train): {}%".format(round((acc*100),2)))
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {}%".format(round((acc*100),2)))
print(cm)

Start training model: Neural Network
Iteration 1, loss = 1.06975231
Iteration 2, loss = 0.67920089
Iteration 3, loss = 0.45709190
Iteration 4, loss = 0.31590554
Iteration 5, loss = 0.22215293
Iteration 6, loss = 0.16337140
Iteration 7, loss = 0.14133826
Iteration 8, loss = 0.11043541
Iteration 9, loss = 0.09355110
Iteration 10, loss = 0.07802286
Iteration 11, loss = 0.06853948
Iteration 12, loss = 0.06668588
Iteration 13, loss = 0.06750333
