Author of code: William Godel 

Date: 07/02

Purpose: To train ML models on the larger-crowds data, and to build the select-crowds data.

## Data IN: 

train_data_large.csv

val_data_large.csv

train_data_large_covid.csv

val_data_large_covid.csv

train_data_large_noncovid.csv

val_data_large_noncovid.csv

test_data_large.csv

test_data_large_covid.csv

test_data_large_noncovid.csv


## Data OUT:

### models

grid_rf_large.p

grid_en_large.p

model_large.pt

Machine: My laptop or iMac

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from collections import Counter
laptop = True
import pickle

import logging
logging.basicConfig(filename='ML_large.log',level=logging.DEBUG)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from ml_functions import feature_creation, conf_eval, data_prep, test_model

from functions import count_mode, bayes_probs, bayes_binary

from sklearn.model_selection import train_test_split

#crowd_size, feature_transform

from path import *

In [2]:
# Load the three train/validation CSV pairs (general, covid, non-covid) from
# the `prepared_data` directory.
train_data = pd.read_csv(prepared_data + "train_data_large.csv")
val_data = pd.read_csv(prepared_data + "val_data_large.csv")

train_data_covid = pd.read_csv(prepared_data + "train_data_large_covid.csv")
val_data_covid = pd.read_csv(prepared_data + "val_data_large_covid.csv")

train_data_noncovid = pd.read_csv(prepared_data + "train_data_large_noncovid.csv")
val_data_noncovid = pd.read_csv(prepared_data + "val_data_large_noncovid.csv")

# Stack the three sources row-wise. DataFrame.append was removed in pandas 2.0;
# pd.concat with ignore_index=True gives the same rows with a fresh 0..n-1 index.
train_data_large = pd.concat(
    [train_data, train_data_covid, train_data_noncovid], ignore_index=True
)
val_data_large = pd.concat(
    [val_data, val_data_covid, val_data_noncovid], ignore_index=True
)

# Binary target: True where the 'mode' column equals 'FM'.
train_target_large = train_data_large['mode'] == 'FM'
val_target_large = val_data_large['mode'] == 'FM'


## Building ML pipelines:

1. Elastic net 
2. Random Forest
3. Gradient Boosted Trees
4. NN



In [3]:
# Build the feature matrices and targets for the train and validation splits.
X_data = feature_creation(train_data_large, crowd_num=25)
Y_data = train_target_large

X_data_val = feature_creation(val_data_large, crowd_num=25)
Y_data_val = val_target_large

# Stack train + validation so one object can be handed to GridSearchCV.
# DataFrame/Series.append was removed in pandas 2.0; pd.concat with
# ignore_index=True reproduces append followed by reset_index(drop=True).
X_data_all = pd.concat([X_data, X_data_val], ignore_index=True)
Y_data_all = pd.concat([Y_data, Y_data_val], ignore_index=True)

# One fold label per stacked row: -1 for the training rows, 0 for the
# validation rows. PredefinedSplit keeps -1 rows out of every test set, so the
# search trains on the training rows and scores on the validation rows.
fold_list = [-1] * X_data.shape[0] + [0] * X_data_val.shape[0]

test_fold = np.array(fold_list)
ps = PredefinedSplit(test_fold)

#encoding the label
#le = LabelEncoder()
#le.fit(Y_data)
#Y_data = le.transform(Y_data)

In [4]:
# Read the held-out test CSV and derive its binary target
# (True where 'mode' is 'FM') and its feature matrix.
test_data_large = pd.read_csv(prepared_data + "test_data_large.csv")
test_target_large = test_data_large['mode'].eq('FM')

X_data_test, Y_data_test = (
    feature_creation(test_data_large, crowd_num=25),
    test_target_large,
)


## Numeric Features


In [5]:
# Columns that get standardized and passed to the models:
#   * every column whose name contains 'resp_veracity_'
#   * every column whose name contains 'new'
#   * the fixed crowd-level summary columns listed below
resp_var = [col for col in X_data.columns if 'resp_veracity_' in col]
new_cols = [col for col in X_data.columns if 'new' in col]

crowd_summary_cols = ["crowd_means", 'crowd_median', 'crowd_full_range',
                      'crowd_IQR_range', 'crowd_variance', 'crowd_bayes']

# Order matters for the downstream column selection: resp_var, new_cols, summaries.
numeric_features = [*resp_var, *new_cols, *crowd_summary_cols]

### Random Forest

In [6]:
# Random forest: standardize the numeric features, then grid-search the forest
# hyper-parameters, scoring accuracy on the predefined validation fold (`ps`).
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
])

# Keys use the '<step>__<param>' form so they reach the 'classifier' step.
parameteres = {
    'classifier__max_depth': [1, 2, 3, 4, 5, 10, 20, None],
    'classifier__min_samples_split': [2, 3, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 3, 5, 10],
    'classifier__n_estimators': [100, 250, 300, 400, 500],
}

grid_rf = GridSearchCV(
    estimator=clf,
    param_grid=parameteres,
    scoring='accuracy',
    cv=ps,
)

grid_rf.fit(X_data_all, Y_data_all)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scaler',
                                                                                          StandardScaler(copy=True,
                                                                              

In [7]:
# Persist the fitted random-forest grid search. The context manager closes the
# file handle (and flushes the pickle to disk) even if dumping raises.
with open(models + "grid_rf_large.p", "wb") as rf_file:
    pickle.dump(grid_rf, rf_file)

## Elastic Net

In [8]:
# Elastic net: standardize the numeric features, then grid-search an
# elastic-net-penalized logistic regression (saga solver), scoring accuracy on
# the predefined validation fold (`ps`).
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(penalty='elasticnet', solver='saga')),
])

# Grid over the L1/L2 mixing ratio and the inverse regularization strength C.
parameteres = {
    'classifier__l1_ratio': [0.9, .8, .05, .2, .15, 0.1, 1],
    'classifier__C': [.1, .5, 1, 5, 10, 20],
}

grid_en = GridSearchCV(
    estimator=clf,
    param_grid=parameteres,
    scoring='accuracy',
    cv=ps,
)

grid_en.fit(X_data_all, Y_data_all)



GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scaler',
                                                                                          StandardScaler(copy=True,
                                                                              

In [9]:
# Persist the fitted elastic-net grid search. The context manager closes the
# file handle (and flushes the pickle to disk) even if dumping raises.
with open(models + "grid_en_large.p", "wb") as en_file:
    pickle.dump(grid_en, en_file)

## Simple Neural Network

In [10]:
# Preparing the data


# Preparing the data
#X_data = feature_creation(train_data_fakesource.iloc[:,:])
#Y_data = train_target_large_lowcred[:]


# Fit the scaler on the training features only, so validation (and later test)
# rows are standardized with training statistics.
# NOTE: the scaler's global name `test` is reused by a later cell.
test = StandardScaler().fit(X_data[numeric_features])

# Standardized copy of the training features (X_data itself is left untouched).
X_train_stand = X_data.copy()
X_train_stand[numeric_features] = test.transform(X_data[numeric_features])

# Standardized copy of the validation features.
X_val_stand = X_data_val.copy()
X_val_stand[numeric_features] = test.transform(X_data_val[numeric_features])




In [11]:
# Load the three test CSVs (general, covid, non-covid).
test_data = pd.read_csv(prepared_data + "test_data_large.csv")
test_data_covid = pd.read_csv(prepared_data + "test_data_large_covid.csv")
test_data_noncovid = pd.read_csv(prepared_data + "test_data_large_noncovid.csv")

# Stack them row-wise. DataFrame.append was removed in pandas 2.0; pd.concat
# with ignore_index=True reproduces append followed by reset_index(drop=True).
test_data_large = pd.concat(
    [test_data, test_data_covid, test_data_noncovid], ignore_index=True
)

# Binary target: True where the 'mode' column equals 'FM'.
test_target_large = test_data_large['mode'] == 'FM'

X_test = feature_creation(test_data_large, crowd_num=25)

# Standardize a copy of the test features with the scaler fitted on the
# training data (the object bound to the global name `test`).
X_test_stand = X_test.copy()
X_test_stand[numeric_features] = test.transform(X_test_stand[numeric_features])

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Fully connected network: emb_dim -> 500 -> 100 -> 25 -> 25 -> 10 -> 1.

    ``forward`` returns the raw output of the last linear layer; no sigmoid or
    softmax is applied to it.
    """

    def __init__(self, emb_dim):
        """Create the six linear layers; ``emb_dim`` is the input width."""
        super().__init__()
        # Layer names and creation order are kept stable so state_dict keys
        # (fc1..fc6) and seeded initialisation stay the same.
        self.fc1 = nn.Linear(emb_dim, 500)
        self.fc2 = nn.Linear(500, 100)
        self.fc3 = nn.Linear(100, 25)
        self.fc4 = nn.Linear(25, 25)
        self.fc5 = nn.Linear(25, 10)
        self.fc6 = nn.Linear(10, 1)
        # Defined but not applied anywhere in forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Apply fc1..fc5 with ReLU after each, then fc6 with no activation."""
        hidden_layers = (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5)
        for layer in hidden_layers:
            x = F.relu(layer(x))
        return self.fc6(x)



In [13]:
#general model function for testing hyper parameters


def model_train(emb_dim, model, learning_rate,
                num_epochs, criterion, optimizer,
                train_loader, val_loader):
    """Train ``model`` and periodically report train/validation accuracy.

    Args:
        emb_dim: Not used inside this function; kept so existing callers
            keep working.
        model: Module called as ``model(batch.float())``.
        learning_rate: Not used inside this function; the step size is
            whatever ``optimizer`` was constructed with.
        num_epochs: Number of full passes over ``train_loader``.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        train_loader: Iterable yielding ``(data, labels)`` batches.
        val_loader: Loader handed to ``test_model`` at each checkpoint.

    Returns:
        Tuple ``(loss_vals, train_est, acc_est)``:
        ``loss_vals`` holds one Python float per training step (the batch
        loss divided by the batch size); ``train_est`` and ``acc_est`` hold
        the train and validation accuracies recorded at each checkpoint.
    """
    loss_vals = []
    acc_est = []
    train_est = []

    for epoch in range(num_epochs):
        for i, (data, labels) in enumerate(train_loader):
            # Re-enter training mode on every step, in case the evaluation
            # helper below switched the model's mode.
            model.train()
            label_batch = torch.reshape(labels, (-1, 1))
            optimizer.zero_grad()
            outputs = model(data.float())
            loss = criterion(outputs, label_batch.float())
            # Store a plain float, not the live tensor: keeping the tensor
            # retains autograd references for every step and yields values
            # that cannot be plotted or converted to NumPy directly.
            loss_vals.append(loss.item() / labels.size(0))
            loss.backward()
            optimizer.step()

            # Checkpoint every 1000 iterations (skipping the very first batch).
            if i > 0 and i % 1000 == 0:
                train_acc = test_model(train_loader, model)
                train_est.append(train_acc)
                val_acc = test_model(val_loader, model)
                acc_est.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}],Train Acc:{}, Validation Acc: {}'.format(
                           epoch+1, num_epochs, i+1, len(train_loader), train_acc, val_acc))

    return loss_vals, train_est, acc_est

In [14]:
# Wrap the standardized train/validation data in DataLoaders for the NN.
# Both loaders use the same settings: batches of 10, shuffled, 2 worker processes.
# NOTE: this rebinds the names `train_data` / `val_data`, which earlier held
# the DataFrames read from CSV.
loader_options = {"batch_size": 10, "shuffle": True, "num_workers": 2}

train_data = data_prep(X_train_stand, Y_data)
trainloader = torch.utils.data.DataLoader(train_data, **loader_options)

val_data = data_prep(X_val_stand, Y_data_val)
valloader = torch.utils.data.DataLoader(val_data, **loader_options)



#test_data_nn = data_loader(X_test_stand,test_target_large)


In [15]:
# Saved for use in feature importance.
# Each file is opened in a context manager so the handle is closed (and the
# pickle flushed to disk) as soon as the dump finishes.
for pickled_obj, pickle_name in (
    (X_train_stand, "X_train_stand.p"),
    (X_val_stand, "X_val_stand.p"),
    (Y_data, "Y_data.p"),
    (Y_data_val, "Y_data_val.p"),
):
    with open(data_pickles + pickle_name, "wb") as pickle_file:
        pickle.dump(pickled_obj, pickle_file)

In [16]:
# Final model: train the network with the chosen hyper-parameters.
num_epochs = 8  # number of passes over the training loader
learning_rates = .0001
emb_dim = 121  # input width handed to Net

loss_performance = []
acc_performance = []

model = Net(emb_dim)

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rates)
# The scheduler is constructed here but is not passed to model_train below.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=.1)

training_args = dict(
    emb_dim=emb_dim,
    model=model,
    learning_rate=learning_rates,
    num_epochs=num_epochs,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=trainloader,
    val_loader=valloader,
)
loss_vals, train_est, acc_est = model_train(**training_args)

# Keep the per-step losses and the accuracy measured on the training loader.
loss_performance.append(loss_vals)
acc_performance.append(test_model(trainloader, model))

Epoch: [1/8], Step: [1001/4300],Train Acc:66.17209302325581, Validation Acc: 57.53333333333333
Epoch: [1/8], Step: [2001/4300],Train Acc:67.45116279069768, Validation Acc: 58.19047619047619
Epoch: [1/8], Step: [3001/4300],Train Acc:69.46279069767442, Validation Acc: 58.75238095238095
Epoch: [1/8], Step: [4001/4300],Train Acc:71.68837209302326, Validation Acc: 61.02857142857143
Epoch: [2/8], Step: [1001/4300],Train Acc:73.85116279069767, Validation Acc: 65.43809523809524
Epoch: [2/8], Step: [2001/4300],Train Acc:75.25813953488372, Validation Acc: 65.16190476190476
Epoch: [2/8], Step: [3001/4300],Train Acc:76.05813953488372, Validation Acc: 69.06666666666666
Epoch: [2/8], Step: [4001/4300],Train Acc:76.36976744186046, Validation Acc: 68.2
Epoch: [3/8], Step: [1001/4300],Train Acc:75.61395348837209, Validation Acc: 69.03809523809524
Epoch: [3/8], Step: [2001/4300],Train Acc:77.1093023255814, Validation Acc: 68.43809523809524
Epoch: [3/8], Step: [3001/4300],Train Acc:77.22093023255815, Val

In [17]:
# Record completion in the log, then save only the model's state_dict
# (not the whole module object).
logging.info('NN Ran')
model_weights_path = models + "model_large.pt"
torch.save(model.state_dict(), model_weights_path)