In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn import linear_model, metrics, naive_bayes, svm, neural_network

from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, Dropout
from keras.layers.embeddings import Embedding

import pickle

In [17]:
def _load_squeezed(csv_path):
    """Read ``csv_path`` with pandas and squeeze away any length-1 axis."""
    frame = pd.read_csv(csv_path)
    return frame.squeeze()


# Pre-split FNC inputs (X_*) and labels (y_*), one CSV file per split.
# NOTE(review): each file is presumably single-column, so squeeze() yields a
# Series -- confirm against the files that produced these CSVs.
X_train = _load_squeezed('X_trainFNC.csv')
X_test = _load_squeezed('X_testFNC.csv')
X_valid = _load_squeezed('X_validFNC.csv')
y_train = _load_squeezed('y_trainFNC.csv')
y_test = _load_squeezed('y_testFNC.csv')
y_valid = _load_squeezed('y_validFNC.csv')

In [3]:

# TF-IDF configuration:
#   * lowercase=False  -> text is not lower-cased before tokenizing
#   * ngram_range      -> unigrams only
#   * token_pattern    -> words of two or more word characters, plus the literal
#                         strings <DATE>, <NUM>, <EMAIL>, <URL> (presumably
#                         placeholders inserted by an earlier preprocessing step)
#   * min_df=3         -> terms found in fewer than 3 documents are dropped
tfidf_options = dict(
    lowercase=False,
    ngram_range=(1, 1),
    token_pattern=r"(?u)\b\w\w+\b|<DATE>|<NUM>|<EMAIL>|<URL>",
    min_df=3,
)
vectorizer = TfidfVectorizer(**tfidf_options)

In [15]:
# Learn the vocabulary and IDF weights from the training texts only.
vectorizer.fit(X_train)

# Number of features kept after fitting. `vocabulary_` maps each term to its
# column index, so its length is the feature count. It is used here instead of
# `get_feature_names()`, which was deprecated in scikit-learn 1.0 and removed
# in 1.2, and it avoids building a throw-away list of every term.
len(vectorizer.vocabulary_)

98724

In [19]:
# Row counts of the training and validation splits.
n_train, n_val = X_train.shape[0], X_valid.shape[0]

# Stack train + validation (in that order) so the grid search receives a
# single array for inputs and a single array for labels.
X_trainval = np.concatenate((X_train, X_valid))
y_trainval = np.concatenate((y_train, y_valid))

# Fold assignment for PredefinedSplit: the first n_train rows are marked -1,
# every remaining row is marked 0.
test_fold = np.full(X_trainval.shape[0], -1.0)
test_fold[n_train:] = 0
pds = PredefinedSplit(test_fold)

# TF-IDF matrices from the vectorizer fitted earlier: one per individual split,
# plus one for the stacked train+validation array.
TFIDF_Xtrain = vectorizer.transform(X_train)
TFIDF_Xvalid = vectorizer.transform(X_valid)
TFIDF_Xtest = vectorizer.transform(X_test)

TFIDF_Xtrainval = vectorizer.transform(X_trainval)

### Logistic Regression

In [20]:
# Logistic regression with scikit-learn's default hyperparameters.
LR_model = linear_model.LogisticRegression()

# 20 log-spaced values of C from 1e-4 to 1e4, shared by both sub-grids.
C_grid = np.logspace(-4, 4, 20)

# L1 is searched with liblinear only; L2 with three solvers.
# Total: 20 * 1 + 20 * 3 = 80 candidates.
LR_param = [
    dict(penalty=['l1'], C=C_grid, solver=['liblinear']),
    dict(penalty=['l2'], C=C_grid, solver=['lbfgs', 'newton-cg', 'saga']),
]

# Grid search scored on the single predefined fold `pds`, using all cores.
LR_gs = GridSearchCV(
    LR_model,
    param_grid=LR_param,
    cv=pds,
    verbose=5,
    n_jobs=-1,
)

In [21]:
# Baseline: fit the untuned logistic regression on the training split.
LR_base = LR_model.fit(TFIDF_Xtrain, y_train)

LR_base_fn = 'LR_base.sav'

# Persist the fitted baseline. The `with` block closes the file handle as soon
# as the dump finishes (the bare open() used before was never closed).
with open(LR_base_fn, 'wb') as model_file:
    pickle.dump(LR_base, model_file)

In [27]:
# Run the grid search on the stacked train+validation matrix; LR_gs was built
# with cv=pds, so the candidates are scored on that predefined fold.
# NOTE(review): the recorded run of this cell ended in a MemoryError. LR_gs is
# created with n_jobs=-1, so lowering n_jobs is the first thing to try.
LR_gs.fit(TFIDF_Xtrainval, y_trainval)
LR_bestparam = LR_gs.best_params_

# Save the winning hyperparameters as the dict's printed form.
# The file name previously read 'LR_bestparam,txt' (comma instead of a dot).
with open('LR_bestparam.txt', 'w') as f:
    print(LR_bestparam, file=f)

print(LR_bestparam)

Fitting 1 folds for each of 80 candidates, totalling 80 fits


MemoryError: Unable to allocate 30.2 MiB for an array with shape (3955182,) and data type float64

In [None]:

# Build a fresh logistic regression from the hyperparameters chosen by the grid
# search, then fit it on the training split only.
LR_tuned = linear_model.LogisticRegression(
    **LR_bestparam
)
LR_tuned.fit(X=TFIDF_Xtrain, y=y_train)


In [26]:
# Disabled: reload the baseline model that an earlier cell pickled to 'LR_base.sav'.
# NOTE(review): this cell has no live code, so the value displayed beneath it
# was presumably produced by an earlier version of the cell -- re-run or clear it.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# load_LR = pickle.load(open('LR_base.sav', 'rb'))

0.8694711649100197

### Naive Bayes

In [None]:
# Multinomial naive Bayes with default settings; only `alpha` is tuned.
NB_model = naive_bayes.MultinomialNB()

# 16 alpha candidates: 0, 0.01, 0.05, ten evenly spaced values from 0.1 to 1,
# then 5, 10 and 100.
alpha_grid = [0.0, 0.01, 0.05, *np.linspace(0.1, 1, 10).tolist(), 5.0, 10.0, 100.0]
NB_param = [dict(alpha=alpha_grid)]

# Grid search scored on the single predefined fold `pds`, using all cores.
NB_gs = GridSearchCV(
    NB_model,
    param_grid=NB_param,
    cv=pds,
    verbose=5,
    n_jobs=-1,
)

In [None]:
# Baseline: fit the untuned multinomial naive Bayes on the training split.
NB_base = NB_model.fit(TFIDF_Xtrain, y_train)

NB_base_fn = 'NB_base.sav'

# Persist the fitted baseline. The `with` block closes the file handle as soon
# as the dump finishes (the bare open() used before was never closed).
with open(NB_base_fn, 'wb') as model_file:
    pickle.dump(NB_base, model_file)