Author of code: William Godel 

Date: 07/02

Purpose: train the ML models and build the select-crowds data

## Data IN: 

train_data_large.csv

val_data_large.csv

train_data_large_covid.csv

val_data_large_covid.csv

train_data_large_noncovid.csv

val_data_large_noncovid.csv

pol_knowledge_df_train_orig.p

pol_knowledge_df_val.p


## Data OUT:

### models

grid_rf.p

grid_en.p

model.pt

model_nn_highpol.pt

grid_rf_pol.p


Machine: my laptop or iMac

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from collections import Counter
laptop = True
import pickle

import logging
logging.basicConfig(filename='ML.log',level=logging.DEBUG)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from ml_functions import feature_creation, conf_eval, data_prep, test_model

from functions import count_mode, bayes_probs, bayes_binary

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F

#crowd_size, feature_transform
from path import *

In [2]:
# Load the general, COVID and non-COVID train/validation splits.
# `prepared_data` is the directory prefix provided by `from path import *`.
train_data = pd.read_csv(prepared_data + "train_data_large.csv")
val_data = pd.read_csv(prepared_data + "val_data_large.csv")

train_data_covid = pd.read_csv(prepared_data + "train_data_large_covid.csv")
val_data_covid = pd.read_csv(prepared_data + "val_data_large_covid.csv")

train_data_noncovid = pd.read_csv(prepared_data + "train_data_large_noncovid.csv")
val_data_noncovid = pd.read_csv(prepared_data + "val_data_large_noncovid.csv")

# Stack the three splits row-wise. pd.concat replaces DataFrame.append (removed
# in pandas 2.0); ignore_index=True yields the same fresh 0..n-1 index that the
# previous reset_index(drop=True) produced.
train_data_large = pd.concat(
    [train_data, train_data_covid, train_data_noncovid], ignore_index=True
)
val_data_large = pd.concat(
    [val_data, val_data_covid, val_data_noncovid], ignore_index=True
)

# Boolean target: True where the `mode` column equals 'FM'.
train_target_large = train_data_large['mode'] == 'FM'
val_target_large = val_data_large['mode'] == 'FM'

## Functions for Feature creation

## Building ML pipelines:

1. Elastic net 
2. Random Forest
3. Gradient Boosted Trees
4. NN



In [3]:
# Preparing the data: feature matrices and boolean targets for both splits.
X_data = feature_creation(train_data_large)
Y_data = train_target_large

X_data_val = feature_creation(val_data_large)
Y_data_val = val_target_large

# Training rows first, then validation rows; the fold list below relies on
# this order. pd.concat replaces DataFrame.append / Series.append (removed in
# pandas 2.0) and ignore_index=True replaces the separate reset_index calls.
X_data_all = pd.concat([X_data, X_data_val], ignore_index=True)
Y_data_all = pd.concat([Y_data, Y_data_val], ignore_index=True)

# -1 marks rows that are never used for testing; 0 marks members of the single
# test fold, so GridSearchCV scores each candidate on the validation rows.
fold_list = [-1] * X_data.shape[0] + [0] * X_data_val.shape[0]

test_fold = np.array(fold_list)
ps = PredefinedSplit(test_fold)

## Numeric Features

In [4]:
# Columns to be standardised: per-respondent veracity ratings, every column
# whose name contains 'new', and the crowd-level summary statistics.
resp_var = [col for col in X_data.columns if 'resp_veracity_' in col]
new_cols = [col for col in X_data.columns if 'new' in col]

numeric_features = [
    *resp_var,
    *new_cols,
    "crowd_means",
    'crowd_median',
    'crowd_full_range',
    'crowd_IQR_range',
    'crowd_variance',
    'crowd_bayes',
]

### Random Forest

In [5]:
# Random forest on the standardised numeric features. Columns not listed in
# `numeric_features` are dropped by the ColumnTransformer (remainder='drop').
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
])

# Hyper-parameter grid; the `classifier__` prefix routes each entry to the
# pipeline's final step.
parameters = dict(
    classifier__max_depth=[1, 2, 3, 4, 5, 10, 20, None],
    classifier__min_samples_split=[2, 3, 5, 10],
    classifier__min_samples_leaf=[1, 2, 3, 5, 10],
    classifier__n_estimators=[500, 1000, 2000],
)

# `ps` holds a single predefined fold, so each candidate is scored once, on the
# validation rows of X_data_all.
grid_rf = GridSearchCV(clf, param_grid=parameters, scoring='accuracy', cv=ps)

grid_rf.fit(X_data_all, Y_data_all)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scaler',
                                                                                          StandardScaler(copy=True,
                                                                              

In [6]:
# Sanity check: distinct labels the fitted search predicts over train+validation
# (a single value here would mean the model collapsed to one class).
np.unique(grid_rf.predict(X_data_all))

array([False,  True])

In [7]:
# Best accuracy found by the search. `ps` defines a single test fold, so this is
# the accuracy on the validation rows only.
grid_rf.best_score_

0.5700952380952381

In [8]:
# Share of False (non-'FM') targets over train+validation combined, i.e. the
# accuracy of always predicting False on X_data_all. Note this covers all rows,
# whereas grid_rf.best_score_ is measured on the validation rows only.
1 - Y_data_all.mean()

0.5902056074766355

In [9]:
# Preview the first rows of the combined (train + validation) feature matrix.
X_data_all.head()

Unnamed: 0,resp_veracity_0,resp_veracity_1,resp_veracity_2,resp_veracity_3,resp_veracity_4,resp_veracity_5,resp_veracity_6,resp_veracity_7,resp_veracity_8,resp_veracity_9,...,resp_cat_6_f,resp_cat_6_t,resp_cat_7_f,resp_cat_7_t,resp_cat_8_f,resp_cat_8_t,resp_cat_9_f,resp_cat_9_t,crowd_mode_f,crowd_mode_t
0,6.0,4.0,3.0,4.0,5.0,4.0,3.0,3.0,4.0,3.0,...,0,0,0,0,0,0,0,0,0,0
1,4.0,4.0,4.0,4.0,3.0,6.0,4.0,5.0,6.0,5.0,...,0,0,0,0,1,0,0,1,1,0
2,4.0,2.0,4.0,4.0,6.0,7.0,4.0,4.0,5.0,1.0,...,0,1,0,1,0,1,1,0,0,1
3,5.0,5.0,7.0,3.0,2.0,5.0,1.0,4.0,2.0,3.0,...,1,0,0,0,1,0,1,0,0,1
4,5.0,4.0,7.0,5.0,7.0,5.0,5.0,5.0,4.0,4.0,...,0,1,0,1,0,0,0,0,0,1


In [10]:
# Persist the fitted random-forest search. The context manager guarantees the
# file is flushed and closed; the bare open(...) it replaces never closed it.
with open(models + "grid_rf.p", "wb") as model_file:
    pickle.dump(grid_rf, model_file)

## Elastic Net

In [11]:
# Elastic Net
# Logistic regression with an elastic-net penalty (saga solver) on the
# standardised numeric features, tuned over l1_ratio and C.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='saga', penalty='elasticnet')),
])

# l1_ratio=1 is a pure L1 penalty; smaller values mix in more L2.
parameters = dict(
    classifier__l1_ratio=[0.9, .8, .05, .2, .15, 0.1, 1],
    classifier__C=[.1, .5, 1, 5, 10, 20],
)

# Scored on the single predefined validation fold in `ps`.
grid_en = GridSearchCV(clf, param_grid=parameters, scoring='accuracy', cv=ps)

grid_en.fit(X_data_all, Y_data_all)



GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scaler',
                                                                                          StandardScaler(copy=True,
                                                                              

In [12]:
# Persist the fitted elastic-net search. The context manager guarantees the
# file is flushed and closed; the bare open(...) it replaces never closed it.
with open(models + "grid_en.p", "wb") as model_file:
    pickle.dump(grid_en, model_file)

## Simple Neural Network

In [13]:
# Preparing the data for the neural network: standardise the numeric columns.

# 0/1 copy of the boolean training target (not used by the cells visible below).
y_data_bin = np.where(Y_data == True, 1, 0)

# Fit the scaler on the training features only so that no validation
# statistics leak into the transformation.
test = StandardScaler().fit(X_data[numeric_features])

# training data
X_train_stand = X_data.copy()
X_train_stand[numeric_features] = test.transform(X_data[numeric_features])

# validation data, scaled with the training-set mean and variance
X_val_stand = X_data_val.copy()
X_val_stand[numeric_features] = test.transform(X_data_val[numeric_features])



In [14]:
class Net(nn.Module):
    """Fully connected network: emb_dim -> 500 -> 100 -> 25 -> 25 -> 10 -> 1.

    ReLU follows every layer except the last. ``forward`` returns the raw
    output of ``fc6``; no sigmoid or softmax is applied.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Layers are registered as fc1..fc6, in that order, so parameter names
        # and initialisation order match the explicit one-per-line definition.
        widths = (emb_dim, 500, 100, 25, 25, 10, 1)
        for idx, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, "fc{}".format(idx), nn.Linear(n_in, n_out))
        # Registered but never called in forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the output of fc6 for a batch ``x`` whose last dimension is emb_dim."""
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            x = F.relu(hidden(x))
        return self.fc6(x)
    
    
class Net_dropout(nn.Module):
    """Same layer stack as ``Net`` with dropout (p=0.25) after the first three layers.

    Layer sizes are emb_dim -> 500 -> 100 -> 25 -> 25 -> 10 -> 1. ``forward``
    returns the raw output of ``fc6``; no sigmoid or softmax is applied.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Layers are registered as fc1..fc6, in that order, so parameter names
        # and initialisation order match the explicit one-per-line definition.
        widths = (emb_dim, 500, 100, 25, 25, 10, 1)
        for idx, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, "fc{}".format(idx), nn.Linear(n_in, n_out))
        # Registered but never called in forward().
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Return the output of fc6 for a batch ``x`` whose last dimension is emb_dim."""
        # Dropout is applied only after the three widest layers.
        for wide in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(F.relu(wide(x)))
        for narrow in (self.fc4, self.fc5):
            x = F.relu(narrow(x))
        return self.fc6(x)
    



In [15]:
#general model function for testing hyper parameters


def model_train(emb_dim, model, learning_rate, \
                num_epochs, criterion, optimizer, \
                train_loader, val_loader):
    """Train ``model`` and periodically record train/validation accuracy.

    Args:
        emb_dim: Unused here; kept so existing call sites keep working.
        model: Module to train; called as ``model(batch.float())``.
        learning_rate: Unused here (the optimizer already carries its own
            learning rate); kept so existing call sites keep working.
        num_epochs: Number of full passes over ``train_loader``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``
            with labels reshaped to a column of floats.
        optimizer: Optimizer bound to ``model``'s parameters.
        train_loader: Iterable of ``(data, labels)`` batches used for training.
        val_loader: Iterable of batches passed to ``test_model`` for validation.

    Returns:
        Tuple ``(loss_vals, train_est, acc_est)``: per-step loss divided by the
        batch size (plain Python floats), and the train / validation accuracies
        returned by ``test_model`` at each evaluation point.
    """
    loss_vals = []
    acc_est = []
    train_est = []

    for epoch in range(num_epochs):
        for i, (data, labels) in enumerate(train_loader):
            # Re-enable training mode on every step.
            # NOTE(review): presumably test_model switches the model to eval mode - confirm.
            model.train()
            label_batch = torch.reshape(labels, (-1, 1))
            optimizer.zero_grad()
            outputs = model(data.float())
            loss = criterion(outputs, label_batch.float())
            loss.backward()
            optimizer.step()
            # Store a plain float: appending `loss / n` kept a graph-attached
            # tensor alive for every step of the run.
            loss_vals.append(loss.item() / labels.size(0))

            # validate every 1000 iterations (skipping the very first batch)
            if i > 0 and i % 1000 == 0:
                train_acc = test_model(train_loader, model)
                train_est.append(train_acc)
                val_acc = test_model(val_loader, model)
                acc_est.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}],Train Acc:{}, Validation Acc: {}'.format(
                           epoch+1, num_epochs, i+1, len(train_loader), train_acc, val_acc))

    return loss_vals, train_est, acc_est

In [16]:
# Wrap the standardised features and targets for PyTorch and build mini-batch
# loaders. Note: `train_data` / `val_data` now shadow the DataFrames that were
# read from CSV at the top of the notebook.
loader_kwargs = dict(batch_size=10, shuffle=True, num_workers=2)

train_data = data_prep(X_train_stand, Y_data)
trainloader = torch.utils.data.DataLoader(train_data, **loader_kwargs)

val_data = data_prep(X_val_stand, Y_data_val)
valloader = torch.utils.data.DataLoader(val_data, **loader_kwargs)

In [17]:
# Final model: dropout network trained with Adam on a BCE-with-logits loss.
num_epochs = 8          # number of epochs to train
learning_rates = 1e-4
# Input width handed to the network.
# NOTE(review): presumably must equal the number of feature columns emitted by data_prep - confirm.
emb_dim = 61

loss_performance = []
acc_performance = []

model = Net_dropout(emb_dim)

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rates)
# NOTE(review): scheduler.step() is never called in this notebook, so the
# learning rate stays at its initial value for the whole run.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=.1)

loss_vals, train_est, acc_est = model_train(
    emb_dim, model, learning_rates, num_epochs,
    criterion, optimizer, trainloader, valloader,
)

loss_performance.append(loss_vals)
# Despite the list's name, this records accuracy on the *training* loader.
acc_performance.append(test_model(trainloader, model))

Epoch: [1/8], Step: [1001/4300],Train Acc:64.28837209302326, Validation Acc: 57.03809523809524
Epoch: [1/8], Step: [2001/4300],Train Acc:64.92325581395349, Validation Acc: 57.94285714285714
Epoch: [1/8], Step: [3001/4300],Train Acc:66.2, Validation Acc: 59.58095238095238
Epoch: [1/8], Step: [4001/4300],Train Acc:67.34418604651162, Validation Acc: 62.733333333333334
Epoch: [2/8], Step: [1001/4300],Train Acc:69.1, Validation Acc: 62.67619047619048
Epoch: [2/8], Step: [2001/4300],Train Acc:70.58604651162791, Validation Acc: 66.61904761904762
Epoch: [2/8], Step: [3001/4300],Train Acc:70.49767441860465, Validation Acc: 65.85714285714286
Epoch: [2/8], Step: [4001/4300],Train Acc:70.92093023255813, Validation Acc: 66.5904761904762
Epoch: [3/8], Step: [1001/4300],Train Acc:71.04186046511627, Validation Acc: 67.23809523809524
Epoch: [3/8], Step: [2001/4300],Train Acc:71.52558139534884, Validation Acc: 67.60952380952381
Epoch: [3/8], Step: [3001/4300],Train Acc:71.42325581395349, Validation Acc:

In [18]:
# Save only the learned parameters (state_dict), not the pickled module object.
torch.save(model.state_dict(), models + "model.pt")

## Qualified Crowds

## NN for high political knowledge

In [19]:
# Load the high-political-knowledge train/validation frames.
# NOTE: read_pickle executes pickle under the hood - only load files from a trusted source.
pol_knowledge_df_orig = pd.read_pickle(data_pickles + 'pol_knowledge_df_train_orig.p')
pol_knowledge_df_val = pd.read_pickle(data_pickles + 'pol_knowledge_df_val.p')

In [20]:
# Preparing the data: feature matrices for the high-political-knowledge crowds.
X_data_pol_train = feature_creation(pol_knowledge_df_orig)
X_data_pol_val = feature_creation(pol_knowledge_df_val)

# 0/1 copy of the boolean training target (not used by the cells visible below).
y_data_bin = np.where(Y_data == True, 1, 0)

# Fit the scaler on the training features only so that no validation
# statistics leak into the transformation.
test = StandardScaler().fit(X_data_pol_train[numeric_features])

# training data
X_train_stand = X_data_pol_train.copy()
X_train_stand[numeric_features] = test.transform(X_data_pol_train[numeric_features])

# validation data, scaled with the training-set mean and variance
X_val_stand = X_data_pol_val.copy()
X_val_stand[numeric_features] = test.transform(X_data_pol_val[numeric_features])


In [21]:
# Build mini-batch loaders for the high-political-knowledge features.
# NOTE(review): the labels are Y_data / Y_data_val from the general dataset;
# this assumes the pol-knowledge frames have the same rows in the same order - confirm.
loader_kwargs = dict(batch_size=10, shuffle=True, num_workers=2)

train_data = data_prep(X_train_stand, Y_data)
trainloader = torch.utils.data.DataLoader(train_data, **loader_kwargs)

val_data = data_prep(X_val_stand, Y_data_val)
valloader = torch.utils.data.DataLoader(val_data, **loader_kwargs)

In [22]:
# Final model for the high-political-knowledge crowds: plain (no-dropout)
# network trained with Adam on a BCE-with-logits loss.
num_epochs = 15         # number of epochs to train
learning_rates = 1e-4
# Input width handed to the network.
# NOTE(review): presumably must equal the number of feature columns emitted by data_prep - confirm.
emb_dim = 61

loss_performance = []
acc_performance = []

model = Net(emb_dim)

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rates)
# NOTE(review): scheduler.step() is never called in this notebook, so the
# learning rate stays at its initial value for the whole run.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=.1)

loss_vals, train_est, acc_est = model_train(
    emb_dim, model, learning_rates, num_epochs,
    criterion, optimizer, trainloader, valloader,
)

loss_performance.append(loss_vals)
# Despite the list's name, this records accuracy on the *training* loader.
acc_performance.append(test_model(trainloader, model))

Epoch: [1/15], Step: [1001/4300],Train Acc:68.37906976744186, Validation Acc: 61.61904761904762
Epoch: [1/15], Step: [2001/4300],Train Acc:69.42558139534884, Validation Acc: 62.12380952380953
Epoch: [1/15], Step: [3001/4300],Train Acc:70.50232558139535, Validation Acc: 63.20952380952381
Epoch: [1/15], Step: [4001/4300],Train Acc:72.30465116279069, Validation Acc: 64.8
Epoch: [2/15], Step: [1001/4300],Train Acc:72.6, Validation Acc: 65.46666666666667
Epoch: [2/15], Step: [2001/4300],Train Acc:73.37906976744186, Validation Acc: 66.67619047619047
Epoch: [2/15], Step: [3001/4300],Train Acc:73.32790697674419, Validation Acc: 68.47619047619048
Epoch: [2/15], Step: [4001/4300],Train Acc:73.19534883720931, Validation Acc: 68.82857142857142
Epoch: [3/15], Step: [1001/4300],Train Acc:73.8953488372093, Validation Acc: 67.28571428571429
Epoch: [3/15], Step: [2001/4300],Train Acc:74.05813953488372, Validation Acc: 67.64761904761905
Epoch: [3/15], Step: [3001/4300],Train Acc:74.02790697674419, Valid

In [23]:
# Save only the learned parameters (state_dict) of the high-political-knowledge network.
torch.save(model.state_dict(), models + "model_nn_highpol.pt")

### Random forest high political knowledge

In [24]:

# Stack train then validation features so the rows line up with the predefined
# split. pd.concat replaces DataFrame.append (removed in pandas 2.0);
# ignore_index=True replaces the separate reset_index(drop=True).
pol_knowledge_df_all = pd.concat(
    [X_data_pol_train, X_data_pol_val], ignore_index=True
)

In [25]:
# Random forest on the standardised numeric features of the
# high-political-knowledge crowds; same grid as the general random forest.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
])

parameters = dict(
    classifier__max_depth=[1, 2, 3, 4, 5, 10, 20, None],
    classifier__min_samples_split=[2, 3, 5, 10],
    classifier__min_samples_leaf=[1, 2, 3, 5, 10],
    classifier__n_estimators=[500, 1000, 2000],
)

# NOTE(review): reuses `ps` and Y_data_all from the general dataset; this
# assumes pol_knowledge_df_all has the same rows in the same order - confirm.
grid_rf_pol = GridSearchCV(clf, param_grid=parameters, scoring='accuracy', cv=ps)

grid_rf_pol.fit(pol_knowledge_df_all, Y_data_all)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scaler',
                                                                                          StandardScaler(copy=True,
                                                                              

In [26]:
# Persist the fitted high-political-knowledge search. The context manager
# guarantees the file is flushed and closed; the bare open(...) it replaces
# never closed it.
with open(models + "grid_rf_pol.p", "wb") as model_file:
    pickle.dump(grid_rf_pol, model_file)

In [27]:
# Sanity check for the model fitted in this section: distinct labels predicted
# by grid_rf_pol on the pol-knowledge features. The previous version re-checked
# the earlier grid_rf / X_data_all pair, which looks like a copy-paste slip.
np.unique(grid_rf_pol.predict(pol_knowledge_df_all))

array([False,  True])