In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn import linear_model, metrics, naive_bayes, svm, neural_network

from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, Dropout
from keras.layers.embeddings import Embedding

import pickle

In [17]:
def _read_split(csv_name):
    """Read one split CSV and squeeze away length-1 axes.

    `squeeze()` turns a one-column frame into a Series; a frame with
    several columns is returned unchanged.
    """
    frame = pd.read_csv(csv_name)
    return frame.squeeze()


# Model inputs for the train / test / validation splits
# (presumably raw article text - they are fed to the TF-IDF vectorizer later).
X_train = _read_split('X_trainFNC.csv')
X_test = _read_split('X_testFNC.csv')
X_valid = _read_split('X_validFNC.csv')

# Matching targets for the same three splits.
y_train = _read_split('y_trainFNC.csv')
y_test = _read_split('y_testFNC.csv')
y_valid = _read_split('y_validFNC.csv')

In [3]:

# TF-IDF settings, collected in one place so they are easy to inspect/tweak:
#   * lowercase=False  - keep the original casing of every token
#   * ngram_range      - unigrams only
#   * token_pattern    - ordinary words of 2+ characters, plus the literal
#                        placeholder tags <DATE>, <NUM>, <EMAIL>, <URL>
#                        (presumably inserted by an earlier cleaning step - confirm)
#   * min_df=3         - drop terms that occur in fewer than 3 documents
_tfidf_options = dict(
    lowercase=False,
    ngram_range=(1, 1),
    token_pattern=r"(?u)\b\w\w+\b|<DATE>|<NUM>|<EMAIL>|<URL>",
    min_df=3,
)

vectorizer = TfidfVectorizer(**_tfidf_options)

In [15]:
# Learn the vocabulary and IDF weights from the training split only, so no
# information from the validation/test splits leaks into the features.
vectorizer.fit(X_train)

# Size of the learned vocabulary (= number of TF-IDF feature columns).
# `vocabulary_` is used instead of `get_feature_names()`, which was deprecated
# in scikit-learn 1.0 and removed in 1.2; the count is identical and this
# works on every scikit-learn version.
len(vectorizer.vocabulary_)

98724

In [19]:
# Row counts of the two splits that are stacked below.
n_train, n_val = X_train.shape[0], X_valid.shape[0]

# Train rows first, validation rows second - the fold labels below rely on
# exactly this ordering.
X_trainval = np.concatenate((X_train, X_valid))
y_trainval = np.concatenate((y_train, y_valid))

# PredefinedSplit convention: -1 => sample is never placed in a test fold
# (always used for training); 0 => sample belongs to test fold 0.
# Result: a single split that trains on X_train and scores on X_valid.
test_fold = np.concatenate((np.full(n_train, -1.0), np.zeros(n_val)))
pds = PredefinedSplit(test_fold)

# TF-IDF matrices for each individual split ...
TFIDF_Xtrain, TFIDF_Xvalid, TFIDF_Xtest = (
    vectorizer.transform(split) for split in (X_train, X_valid, X_test)
)

# ... and for the stacked train+validation data that the grid searches consume.
TFIDF_Xtrainval = vectorizer.transform(X_trainval)

### Logistic Regression

In [20]:
# Baseline estimator with scikit-learn's default hyperparameters.
LR_model = linear_model.LogisticRegression()

# Inverse regularisation strengths: 20 values, log-spaced from 1e-4 to 1e4.
_lr_C_grid = np.logspace(-4, 4, 20)

# Two sub-grids because the listed solvers differ per penalty:
# liblinear for L1, and lbfgs / newton-cg / saga for L2.
LR_param = [
    dict(penalty=['l1'], C=_lr_C_grid, solver=['liblinear']),
    dict(penalty=['l2'], C=_lr_C_grid, solver=['lbfgs', 'newton-cg', 'saga']),
]

# `pds` supplies the single predefined train/validation split.
LR_gs = GridSearchCV(
    estimator=LR_model,
    param_grid=LR_param,
    cv=pds,
    verbose=5,
    n_jobs=-1,
)

In [21]:
# Fit the untuned logistic regression on the training split only.
# fit() returns the estimator itself, so LR_base is the same object as LR_model.
LR_base = LR_model.fit(TFIDF_Xtrain, y_train)

LR_base_fn = 'LR_base.sav'

# Persist the fitted model. The context manager guarantees the handle is
# flushed and closed (the previous bare open() was never closed explicitly).
with open(LR_base_fn, 'wb') as f:
    pickle.dump(LR_base, f)

In [29]:
# Run the grid search on the stacked train+validation matrix; the predefined
# split inside LR_gs trains on the train rows and scores on the validation rows.
LR_gs.fit(TFIDF_Xtrainval, y_trainval)
LR_bestparam = LR_gs.best_params_

# Save the winning hyperparameters as text.
# The file name previously read 'LR_bestparam,txt' (comma instead of the
# extension dot), which produced a file without a .txt extension.
with open('LR_bestparam.txt', 'w') as f:
    print(LR_bestparam, file=f)

print(LR_bestparam)

{'C': [1]}


In [None]:

# Rebuild the logistic regression with the grid-search winner and fit it on
# the training split only.
LR_tuned = linear_model.LogisticRegression(**LR_bestparam)
LR_tuned.fit(TFIDF_Xtrain, y_train)

LR_tuned_fn = 'LR_tuned.sav'

# Persist the tuned model; `with` closes the file handle deterministically
# (the previous bare open() was never closed explicitly).
with open(LR_tuned_fn, 'wb') as f:
    pickle.dump(LR_tuned, f)

In [26]:
# load_LR = pickle.load(open('LR_base.sav', 'rb'))

0.8694711649100197

### Naive Bayes

In [None]:
# Naive Bayes

# Baseline multinomial Naive Bayes with scikit-learn's default smoothing.
NB_model = naive_bayes.MultinomialNB()

# Candidate smoothing strengths.
# The smallest value is 1e-10 rather than an exact 0.0: with alpha == 0 there
# is no smoothing at all, so any term unseen in a class gets log(0) = -inf.
# Older scikit-learn releases warn and silently clip alpha to 1e-10 anyway;
# newer ones (force_alpha=True by default) use the 0 as-is. Using 1e-10
# explicitly gives the same, well-defined behaviour on every version.
NB_param = [
    {'alpha' : [1e-10, 0.01, 0.05] + np.linspace(0.1, 1, 10).tolist() + [5.0, 10.0, 100.0]}
]

# `pds` supplies the single predefined train/validation split.
NB_gs = GridSearchCV(NB_model, param_grid = NB_param, cv=pds, verbose = 5, n_jobs = -1)

In [None]:
# Fit the untuned Naive Bayes model on the training split only.
# fit() returns the estimator itself, so NB_base is the same object as NB_model.
NB_base = NB_model.fit(TFIDF_Xtrain, y_train)

NB_base_fn = 'NB_base.sav'

# Persist the fitted model; `with` closes the file handle deterministically
# (the previous bare open() was never closed explicitly).
with open(NB_base_fn, 'wb') as f:
    pickle.dump(NB_base, f)

In [None]:
# Run the grid search on the stacked train+validation matrix; the predefined
# split inside NB_gs trains on the train rows and scores on the validation rows.
NB_gs.fit(TFIDF_Xtrainval, y_trainval)
NB_bestparam = NB_gs.best_params_

# Save the winning hyperparameters as text.
# The file name previously read 'NB_bestparam,txt' (comma instead of the
# extension dot), which produced a file without a .txt extension.
with open('NB_bestparam.txt', 'w') as f:
    print(NB_bestparam, file=f)

print(NB_bestparam)

In [None]:
# Rebuild Naive Bayes with the grid-search winner and fit it on the training
# split only.
NB_tuned = naive_bayes.MultinomialNB(**NB_bestparam)
NB_tuned.fit(TFIDF_Xtrain, y_train)

NB_tuned_fn = 'NB_tuned.sav'

# Persist the tuned model; `with` closes the file handle deterministically
# (the previous bare open() was never closed explicitly).
with open(NB_tuned_fn, 'wb') as f:
    pickle.dump(NB_tuned, f)

### Support Vector Machine

In [None]:
# Baseline support-vector classifier; probability=True enables
# predict_proba on the fitted model.
SVM_model = svm.SVC(probability=True)

# Linear kernel only, with the penalty C swept over six decades.
SVM_param = [
    dict(
        kernel=['linear'],
        C=[0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
    ),
]

# `pds` supplies the single predefined train/validation split.
SVM_gs = GridSearchCV(
    estimator=SVM_model,
    param_grid=SVM_param,
    cv=pds,
    verbose=5,
    n_jobs=-1,
)

In [None]:
# Fit the untuned SVM on the training split only.
# fit() returns the estimator itself, so SVM_base is the same object as SVM_model.
SVM_base = SVM_model.fit(TFIDF_Xtrain, y_train)

SVM_base_fn = 'SVM_base.sav'

# Persist the fitted model; `with` closes the file handle deterministically
# (the previous bare open() was never closed explicitly).
with open(SVM_base_fn, 'wb') as f:
    pickle.dump(SVM_base, f)

In [None]:
# Run the grid search on the stacked train+validation matrix; the predefined
# split inside SVM_gs trains on the train rows and scores on the validation rows.
SVM_gs.fit(TFIDF_Xtrainval, y_trainval)
SVM_bestparam = SVM_gs.best_params_

# Save the winning hyperparameters as text.
# The file name previously read 'SVM_bestparam,txt' (comma instead of the
# extension dot), which produced a file without a .txt extension.
with open('SVM_bestparam.txt', 'w') as f:
    print(SVM_bestparam, file=f)

print(SVM_bestparam)

In [None]:
# Rebuild the SVM with the grid-search winner (keeping probability=True, as
# in the baseline) and fit it on the training split only.
SVM_tuned = svm.SVC(probability=True, **SVM_bestparam)
SVM_tuned.fit(TFIDF_Xtrain, y_train)

SVM_tuned_fn = 'SVM_tuned.sav'

# Persist the tuned model; `with` closes the file handle deterministically
# (the previous bare open() was never closed explicitly).
with open(SVM_tuned_fn, 'wb') as f:
    pickle.dump(SVM_tuned, f)

### FEED FORWARD NEURAL NETWORK

In [None]:
# Baseline multi-layer perceptron with scikit-learn's default hyperparameters.
ANN_model = neural_network.MLPClassifier()

# Full Cartesian grid: 3 architectures x 2 activations x 2 solvers
# x 2 L2 penalties x 2 learning-rate schedules = 48 candidates.
ANN_param = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100,)],
    activation=['logistic', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)

# `pds` supplies the single predefined train/validation split.
ANN_gs = GridSearchCV(
    estimator=ANN_model,
    param_grid=ANN_param,
    cv=pds,
    verbose=5,
    n_jobs=-1,
)

In [None]:
# Fit the untuned MLP on the training split only.
# fit() returns the estimator itself, so ANN_base is the same object as ANN_model.
ANN_base = ANN_model.fit(TFIDF_Xtrain, y_train)

ANN_base_fn = 'ANN_base.sav'

# Persist the fitted model; `with` closes the file handle deterministically
# (the previous bare open() was never closed explicitly).
with open(ANN_base_fn, 'wb') as f:
    pickle.dump(ANN_base, f)

In [None]:
# Run the grid search on the stacked train+validation matrix; the predefined
# split inside ANN_gs trains on the train rows and scores on the validation rows.
ANN_gs.fit(TFIDF_Xtrainval, y_trainval)
ANN_bestparam = ANN_gs.best_params_

# Save the winning hyperparameters as text.
# The file name previously read 'ANN_bestparam,txt' (comma instead of the
# extension dot), which produced a file without a .txt extension.
with open('ANN_bestparam.txt', 'w') as f:
    print(ANN_bestparam, file=f)

print(ANN_bestparam)

In [None]:
# Rebuild the MLP with the grid-search winner and fit it on the training
# split only.
ANN_tuned = neural_network.MLPClassifier(**ANN_bestparam)
ANN_tuned.fit(TFIDF_Xtrain, y_train)

ANN_tuned_fn = 'ANN_tuned.sav'

# Persist the tuned model; `with` closes the file handle deterministically
# (the previous bare open() was never closed explicitly).
with open(ANN_tuned_fn, 'wb') as f:
    pickle.dump(ANN_tuned, f)