In [1]:
import pandas as pd
import numpy as np
import os
import time

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
from sklearn.svm import SVC

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lemcm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lemcm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lemcm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
df = pd.read_csv("model_dev_data.csv", encoding = "ISO-8859-1", header=None)

In [6]:
df = df.loc[:, (0,5)]

df.columns = ["Rating","Text"]

df["Rating"] = df["Rating"].map({4 : 1, 0:0}) #mapping positive to 1 instead of 4

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
df = df.sample(100000, random_state=0) #scaling down so processing time is achievable 
# df = df.sample(20, random_state=0) #scaling down so processing time is achievable 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
  df["Text"], df["Rating"], test_size=0.33, random_state=42)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [10]:
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words("english"))

In [11]:
def word_tokens(row):
    return word_tokenize(row)

In [12]:
def clean_data(df):
    df = df.apply(word_tokenize)
    df = df.map(lambda x: [i.lower() for i in x if i.lower() not in stop_words])
    df = df.map(lambda x: [lemmatizer.lemmatize(i) for i in x if i.lower() if i.isalnum()])
    df = df.map(lambda x: ' '.join(x))

    return df

In [13]:
X_train = clean_data(X_train)

In [14]:
len(X_test)

33000

In [15]:
# X_test_1 = X_test[:2]
# y_test_1 = y_test[:2]


# X_test_2 = X_test[2:4].reset_index(drop=True)
# y_test_2 = y_test[2:4].reset_index(drop=True)


# X_test_3 = X_test[4:].reset_index(drop=True)
# y_test_3 = y_test[4:].reset_index(drop=True)

In [16]:
X_test_1 = X_test[:500]
y_test_1 = y_test[:500]


X_test_2 = X_test[500:1000].reset_index(drop=True)
y_test_2 = y_test[500:1000].reset_index(drop=True)


X_test_3 = X_test[1000:1500].reset_index(drop=True)
y_test_3 = y_test[1000:1500].reset_index(drop=True)

## TFIDF RF or SVC

In [17]:
from sklearn.ensemble import RandomForestClassifier as RF

In [18]:
results_df = pd.DataFrame()

In [19]:
vect2 = TfidfVectorizer()

x_train_tfidf = vect2.fit_transform(X_train)


In [20]:
x_test_1_tfidf = vect2.transform(X_test_1)
x_test_2_tfidf = vect2.transform(X_test_2)
x_test_3_tfidf = vect2.transform(X_test_3)

In [21]:
import pickle

In [22]:
#### RF: TFIDF
clf = RF()
clf.fit(x_train_tfidf, y_train)


filename = 'SVC_TFIDF.sav'
pickle.dump(clf, open(filename, 'wb'))

In [23]:
#Test set 1:
start = time.time()
y_pred = clf.predict(x_test_1_tfidf) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_1, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_1, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time= (end - start)/60

results_df = results_df.append(pd.Series(["RF, TFIDF",1, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.768
precision is 0.8170212765957446 
recall is 0.7245283018867924


In [24]:
#Test set 1:
start = time.time()
y_pred = clf.predict(x_test_2_tfidf) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_2, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_2, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time= (end - start)/60

results_df = results_df.append(pd.Series(["RF, TFIDF",2, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.73
precision is 0.723404255319149 
recall is 0.7083333333333334


In [25]:
#### SVC: TFIDF
clf = RF()
clf.fit(x_train_tfidf, y_train)


filename = 'SVC_TFIDF.sav'
pickle.dump(clf, open(filename, 'wb'))

In [26]:
#Test set 1:
start = time.time()
y_pred = clf.predict(x_test_1_tfidf) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_1, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_1, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time= (end - start)/60

results_df = results_df.append(pd.Series(["RF, TFIDF",1, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.768
precision is 0.8143459915611815 
recall is 0.7283018867924528


In [27]:
#Test set 2:
start = time.time()
y_pred = clf.predict(x_test_2_tfidf) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_2, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_2, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time= (end - start)/60

results_df = results_df.append(pd.Series(["RF, TFIDF",2, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.738
precision is 0.746606334841629 
recall is 0.6875


In [28]:
#Test set 3:
start = time.time()
y_pred = clf.predict(x_test_3_tfidf) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_3, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_3, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time= (end - start)/60

results_df = results_df.append(pd.Series(["RF, TFIDF",3, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.768
precision is 0.784688995215311 
recall is 0.6978723404255319


In [29]:
results_df.to_csv("Results.csv")

In [30]:
clf = SVC()
clf.fit(x_train_tfidf, y_train)

SVC()

In [31]:
filename = 'SVC_TFIDF.sav'
pickle.dump(clf, open(filename, 'wb'))

In [32]:
#Test set 1:
start = time.time()
y_pred = clf.predict(x_test_1_tfidf) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_1, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_1, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time= (end - start)/60

results_df = results_df.append(pd.Series(["RF, TFIDF",1, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.766
precision is 0.8057851239669421 
recall is 0.7358490566037735


In [33]:
#Test set 1:
start = time.time()
y_pred = clf.predict(x_test_2_tfidf) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_2, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_2, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time= (end - start)/60

results_df = results_df.append(pd.Series(["SVC, TFIDF",2, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.756
precision is 0.7610619469026548 
recall is 0.7166666666666667


In [34]:
#Test set 3:
start = time.time()
y_pred = clf.predict(x_test_3_tfidf) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_3, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_3, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time= (end - start)/60

results_df = results_df.append(pd.Series(["SVC, TFIDF",3, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.8
precision is 0.8054298642533937 
recall is 0.7574468085106383


In [35]:
results_df.to_csv("Results.csv")

## Word2Vec

In [36]:
import gensim
import gensim.downloader as gensim_api
embeddings = gensim_api.load("word2vec-google-news-300")

In [37]:
def word_to_vec_clean(df):
    docs_vects = pd.DataFrame()
    for i in range(0, len(df)):
        temp = pd.DataFrame()
        for word in df[i].split(" "):
            try:
                word_vec = embeddings[word]
                temp = temp.append(pd.Series(word_vec), ignore_index=True)
            except:
                pass

        doc_vect = temp.mean()
        docs_vects = docs_vects.append(doc_vect, ignore_index = True)
    return docs_vects

In [38]:
X_train_vec = word_to_vec_clean(X_train)

In [39]:
X_train_vec.to_csv("X_train_vec.csv")

In [40]:
start = time.time()
X_test_1_vec = word_to_vec_clean(X_test_1)
end = time.time()

In [41]:
X_test_2_vec = word_to_vec_clean(X_test_2)
X_test_3_vec = word_to_vec_clean(X_test_3)

In [42]:
X_test_1_vec.to_csv("X_test_1_vec.csv")
X_test_2_vec.to_csv("X_test_2_vec.csv")
X_test_3_vec.to_csv("X_test_3_vec.csv")

In [43]:
X_train_vec = pd.read_csv("X_train_vec.csv", index_col = "Unnamed: 0")

In [44]:
X_test_1_vec = pd.read_csv("X_test_1_vec.csv", index_col = "Unnamed: 0")
X_test_2_vec = pd.read_csv("X_test_2_vec.csv", index_col = "Unnamed: 0")
X_test_3_vec = pd.read_csv("X_test_3_vec.csv", index_col = "Unnamed: 0")

In [45]:
drop_index_train = np.where(X_train_vec.isna().any(axis=1) == True)[0]

drop_index_test_1 = np.where(X_test_1_vec.isna().any(axis=1) == True)[0]
drop_index_test_2 = np.where(X_test_2_vec.isna().any(axis=1) == True)[0]
drop_index_test_3 =  np.where(X_test_3_vec.isna().any(axis=1) == True)[0]

In [46]:
X_train_vec = X_train_vec.drop(drop_index_train).reset_index(drop=True)

In [47]:
y_train_vec = y_train.drop(drop_index_train).reset_index(drop=True)

In [48]:
X_test_1_vec = X_test_1_vec.drop(drop_index_test_1).reset_index(drop=True)
X_test_2_vec = X_test_2_vec.drop(drop_index_test_2).reset_index(drop=True)
X_test_3_vec = X_test_3_vec.drop(drop_index_test_3).reset_index(drop=True)

In [49]:
y_test_1_vec = y_test_1.drop(drop_index_test_1).reset_index(drop=True)
y_test_2_vec = y_test_2.drop(drop_index_test_2).reset_index(drop=True)
y_test_3_vec = y_test_3.drop(drop_index_test_3).reset_index(drop=True)

In [50]:
vec_time = (end-start)/60

In [51]:
import sklearn.preprocessing as pre

In [52]:
scaler = pre.MinMaxScaler()
X_train_NB = scaler.fit_transform(X_train_vec)
X_test_NB_1 = scaler.fit_transform(X_test_1_vec)
X_test_NB_2 = scaler.fit_transform(X_test_2_vec)
X_test_NB_3 = scaler.fit_transform(X_test_3_vec)

In [53]:
#### NB: WORDVEC

clf = MultinomialNB()
clf.fit(X_train_NB, y_train_vec)


filename = 'NB_Word2Vec.sav'
pickle.dump(clf, open(filename, 'wb'))

In [54]:
start = time.time()
y_pred = clf.predict(X_test_NB_1) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_1_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_1_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["NB, WORDVEC", 1, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.5665322580645161
precision is 0.5524017467248908 
recall is 0.9619771863117871


In [55]:
start = time.time()
y_pred = clf.predict(X_test_NB_2) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_2_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_2_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["NB, WORDVEC", 2, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.597165991902834
precision is 0.5520833333333334 
recall is 0.8870292887029289


In [56]:
start = time.time()
y_pred = clf.predict(X_test_NB_3) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_3_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_3_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["NB, WORDVEC", 3, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.6016260162601627
precision is 0.5508021390374331 
recall is 0.8803418803418803


In [57]:
#### Logistic Regression: WORDVEC

clf = LogisticRegression()
clf.fit(X_train_vec, y_train_vec)


filename = 'Log_Word2Vec.sav'
pickle.dump(clf, open(filename, 'wb'))

In [58]:
start = time.time()
y_pred = clf.predict(X_test_1_vec) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_1_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_1_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["Log, WORDVEC", 1, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.7137096774193549
precision is 0.712280701754386 
recall is 0.7718631178707225


In [59]:
start = time.time()
y_pred = clf.predict(X_test_2_vec) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_2_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_2_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["Log, WORDVEC", 2, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.6700404858299596
precision is 0.6417910447761194 
recall is 0.7196652719665272


In [60]:
#Test set 3:
start = time.time()
y_pred = clf.predict(X_test_3_vec) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_3_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_3_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time= (end - start)/60

results_df = results_df.append(pd.Series(["Log, WORDVEC",3, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.7215447154471545
precision is 0.691699604743083 
recall is 0.7478632478632479


In [61]:
results_df.to_csv("Results.csv")

In [62]:
#### RF: WORDVEC

clf = RF()
clf.fit(X_train_vec, y_train_vec)


filename = 'RF_Word2Vec.sav'
pickle.dump(clf, open(filename, 'wb'))

In [63]:
start = time.time()
y_pred = clf.predict(X_test_1_vec) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_1_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_1_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["RF, WORDVEC", 1, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.6754032258064516
precision is 0.6554878048780488 
recall is 0.8174904942965779


In [64]:
start = time.time()
y_pred = clf.predict(X_test_2_vec) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_2_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_2_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["RF, WORDVEC", 2, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.6295546558704453
precision is 0.5897435897435898 
recall is 0.7698744769874477


In [65]:
start = time.time()
y_pred = clf.predict(X_test_3_vec) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_3_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_3_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["RF, WORDVEC", 3, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.6869918699186992
precision is 0.636986301369863 
recall is 0.7948717948717948


In [66]:
#### SVC: WORDVEC

clf = SVC()
clf.fit(X_train_vec, y_train_vec)


filename = 'SVC_Word2Vec.sav'
pickle.dump(clf, open(filename, 'wb'))

In [67]:
start = time.time()
y_pred = clf.predict(X_test_1_vec) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_1_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_1_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["SVC, WORDVEC", 1, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.7439516129032258
precision is 0.7463768115942029 
recall is 0.7832699619771863


In [68]:
start = time.time()
y_pred = clf.predict(X_test_2_vec) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_2_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_2_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["SVC, WORDVEC", 2, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.6862348178137652
precision is 0.65 
recall is 0.7615062761506276


In [69]:
start = time.time()
y_pred = clf.predict(X_test_3_vec) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_3_vec, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_3_vec, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["SVC, WORDVEC", 3, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.7459349593495935
precision is 0.7154150197628458 
recall is 0.7735042735042735


In [70]:
results_df.to_csv("Results.csv")

## BOW

In [71]:
vect = CountVectorizer()
x_train_bow = vect.fit_transform(X_train)

In [72]:
x_test_bow_1 = vect.transform(X_test_1)
x_test_bow_2 = vect.transform(X_test_2)
x_test_bow_3 = vect.transform(X_test_3)

In [73]:
#### RF: BOW

clf = RF()
clf.fit(x_train_bow, y_train)

filename = 'RF_BOW.sav'
pickle.dump(clf, open(filename, 'wb'))

In [74]:
start = time.time()
y_pred = clf.predict(x_test_bow_1) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_1, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_1, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["RF, BOW", 1, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.772
precision is 0.7870722433460076 
recall is 0.7811320754716982


In [75]:
start = time.time()
y_pred = clf.predict(x_test_bow_2) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_2, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_2, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["RF, BOW", 2, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.728
precision is 0.7166666666666667 
recall is 0.7166666666666667


In [76]:
start = time.time()
y_pred = clf.predict(x_test_bow_3) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_3, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_3, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["RF, BOW", 3, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.726
precision is 0.6991869918699187 
recall is 0.7319148936170212


In [77]:
#### RF: SVC

clf = SVC()
clf.fit(x_train_bow, y_train)

filename = 'SVC_BOW.sav'
pickle.dump(clf, open(filename, 'wb'))

In [78]:
start = time.time()
y_pred = clf.predict(x_test_bow_1) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_1, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_1, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["SVC, BOW", 1, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.762
precision is 0.8067226890756303 
recall is 0.7245283018867924


In [79]:
start = time.time()
y_pred = clf.predict(x_test_bow_2) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_2, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_2, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["SVC, BOW", 2, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.752
precision is 0.7660550458715596 
recall is 0.6958333333333333


In [80]:
start = time.time()
y_pred = clf.predict(x_test_bow_3) #prediction from model
end = time.time()
accuracy = accuracy_score(y_test_3, y_pred)
print('Test Accuracy: ', accuracy)

cm = confusion_matrix(y_test_3, y_pred)

precision = cm[0][0]/(cm[0][0] + cm[1][0])
print(f"precision is {precision} ")

recall = cm[0][0]/(cm[0][0] + cm[0][1])
print(f"recall is {recall}")

f = 2*(precision * recall)/(precision + recall)

tot_time =  ((end-start)/60) + vec_time

results_df = results_df.append(pd.Series(["SVC, BOW", 3, precision, recall, accuracy, f, tot_time]), ignore_index=True)

Test Accuracy:  0.786
precision is 0.7962962962962963 
recall is 0.7319148936170212


In [81]:
results_df.to_csv("Results.csv")