Author of code: William Godel

Date: 07/02

Purpose: to test performance of models on alternative data distributions

## Data IN: 

test_data_98.csv

test_data_90.csv

test_data_75.csv

test_data_10.csv


test_data_98_highpol.csv

test_data_90_highpol.csv

test_data_75_highpol.csv

test_data_10_highpol.csv

grid_rf.p

model_nn_highpol.pt

model_large.pt

## Data OUT:

no data out, just statistics


Machine: my laptop or iMac


In [2]:
%load_ext autoreload
%autoreload 

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from collections import Counter
laptop = True
import pickle

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from functions import count_mode, bayes_probs, bayes_binary
from ml_functions import feature_creation

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F

#crowd_size, feature_transform

from path import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Read the four prepared test splits; each frame keeps the numeric suffix of
# the CSV it was read from.
test_data_98, test_data_90, test_data_75, test_data_10 = [
    pd.read_csv(prepared_data + file_name)
    for file_name in (
        "test_data_98.csv",
        "test_data_90.csv",
        "test_data_75.csv",
        "test_data_10.csv",
    )
]

# Boolean target per split: True where the 'mode' column equals 'FM'.
(test_data_98_target,
 test_data_90_target,
 test_data_75_target,
 test_data_10_target) = [
    split['mode'] == 'FM'
    for split in (test_data_98, test_data_90, test_data_75, test_data_10)
]

In [4]:
# Feature matrices built with feature_creation's default crowd settings.
X_test_data_98, X_test_data_90, X_test_data_75, X_test_data_10 = [
    feature_creation(split)
    for split in (test_data_98, test_data_90, test_data_75, test_data_10)
]

# The same four splits, this time requesting crowd_num = 25
# (presumably the crowd size -- confirm in ml_functions.feature_creation).
(X_test_data_98_large,
 X_test_data_90_large,
 X_test_data_75_large,
 X_test_data_10_large) = [
    feature_creation(split, crowd_num = 25)
    for split in (test_data_98, test_data_90, test_data_75, test_data_10)
]

In [5]:
# Columns that are passed through StandardScaler before the neural network
# sees them: every 'resp_veracity_*' column, every column containing 'new',
# and the six crowd summary statistics.
resp_var = [col for col in X_test_data_98.columns if 'resp_veracity_' in col]
new_cols = [col for col in X_test_data_98.columns if 'new' in col]

numeric_features = resp_var + new_cols + [
    "crowd_means", 'crowd_median', 'crowd_full_range',
    'crowd_IQR_range', 'crowd_variance', 'crowd_bayes',
]



In [8]:
def conf_eval(preds, truth):
    """
    Compare predictions with the true labels and summarise the outcome.

    @param preds: array of predicted labels, same length as truth
    @param truth: array of true labels, either boolean or 0/1

    Returns a 5-tuple (accuracy, True_pos, False_pos, True_neg, False_neg):
      accuracy  - fraction of entries where preds equals truth
      True_pos  - actual positives that were predicted correctly
      False_pos - actual positives that were predicted incorrectly
      True_neg  - actual negatives that were predicted correctly
      False_neg - actual negatives that were predicted incorrectly

    NOTE: the third and fifth slots are grouped by the *actual* class, so
    they hold what is usually called false negatives and false positives
    respectively. conf_perc relies on this grouping.
    """
    # Labels may be boolean or 0/1; choose the matching pair of class values.
    if True in np.unique(truth):
        pos_label, neg_label = True, False
    else:
        pos_label, neg_label = 1, 0

    is_pos = truth == pos_label
    is_neg = truth == neg_label

    accuracy = np.sum(preds == truth) / truth.size

    # Within the actual positives: how many predictions agree / disagree.
    pos_preds, pos_truth = preds[is_pos], truth[is_pos]
    True_pos = np.sum(pos_preds == pos_truth)
    False_pos = np.sum(pos_preds != pos_truth)

    # Within the actual negatives: how many predictions agree / disagree.
    neg_preds, neg_truth = preds[is_neg], truth[is_neg]
    True_neg = np.sum(neg_preds == neg_truth)
    False_neg = np.sum(neg_preds != neg_truth)

    return accuracy, True_pos, False_pos, True_neg, False_neg
    
    
def conf_perc(a_list):
    """
    Turn the four counts of a stored results list into within-class rates.

    @param a_list: list shaped like [name, accuracy, c2, c3, c4, c5]; c2 and
                   c3 are normalised by their sum, c4 and c5 by the sum of
                   everything from index 4 onwards

    Prints the header "TP FP TN FN" and returns the four ratios as a tuple.
    """
    pos_total = sum(a_list[2:4])
    neg_total = sum(a_list[4:])
    print("TP", "FP", "TN", "FN")

    rates = []
    for index, denom in ((2, pos_total), (3, pos_total),
                         (4, neg_total), (5, neg_total)):
        rates.append(a_list[index] / denom)
    return tuple(rates)

    

### Random Forest
### standard crowd
### size 10

In [9]:
# One (initially empty) results dictionary per test split, filled in below
# with entries keyed by model name.
(performance_dic_98,
 performance_dic_90,
 performance_dic_75,
 performance_dic_10) = ({} for _ in range(4))

In [14]:
# Load the fitted random-forest grid search.
# NOTE: unpickling executes arbitrary code -- only load model files that come
# from a trusted source.
with open(models + 'grid_rf.p', "rb") as model_file:
    grid_rf = pickle.load(model_file)

algo = grid_rf

algo_name = "randomforest_10"

# Score the same model on every split. Each stored entry has the layout
# [name, accuracy, True_pos, False_pos, True_neg, False_neg], which is the
# layout conf_perc indexes into -- the name must therefore come FIRST for
# all four splits (including the 10 split).
for features, target, performance_dic in (
    (X_test_data_98, test_data_98_target, performance_dic_98),
    (X_test_data_90, test_data_90_target, performance_dic_90),
    (X_test_data_75, test_data_75_target, performance_dic_75),
    (X_test_data_10, test_data_10_target, performance_dic_10),
):
    all_results = [algo_name]
    results = list(conf_eval(algo.predict(features), target))
    all_results.extend(results)
    performance_dic[algo_name] = all_results


In [15]:
# Within-class rates for the random forest on the test_data_98 split
# (conf_perc prints its header, the returned tuple is the cell output).
conf_perc(performance_dic_98['randomforest_10'])

TP FP TN FN


(0.38421052631578945,
 0.6157894736842106,
 0.7682062298603652,
 0.2317937701396348)

In [16]:
# Raw stored entry: model name, accuracy, then the four counts from conf_eval.
performance_dic_98['randomforest_10']

['randomforest_10', 0.7605263157894737, 73, 117, 7152, 2158]

## Qualified Crowds

### loading data

In [17]:
# Read the four "highpol" test splits (variables use the suffix _highpk).
(test_data_98_highpk,
 test_data_90_highpk,
 test_data_75_highpk,
 test_data_10_highpk) = [
    pd.read_csv(prepared_data + file_name)
    for file_name in (
        'test_data_98_highpol.csv',
        "test_data_90_highpol.csv",
        "test_data_75_highpol.csv",
        "test_data_10_highpol.csv",
    )
]

# Feature matrices for those splits, using feature_creation's defaults.
(X_test_data_98_highpk,
 X_test_data_90_highpk,
 X_test_data_75_highpk,
 X_test_data_10_highpk) = [
    feature_creation(split)
    for split in (test_data_98_highpk, test_data_90_highpk,
                  test_data_75_highpk, test_data_10_highpk)
]

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """
    Fully connected network: emb_dim -> 500 -> 100 -> 25 -> 25 -> 10 -> 1.

    ReLU follows each of the first five linear layers; the last layer's
    output is returned as-is (callers in this file apply torch.sigmoid to
    it). The layers are stored as attributes fc1 .. fc6; those names are
    what load_state_dict matches against, so they must not change.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Consecutive pairs of widths give the (in, out) size of fc1 .. fc6.
        widths = (emb_dim, 500, 100, 25, 25, 10, 1)
        for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{idx}", nn.Linear(n_in, n_out))
        # Kept as an attribute, but not used by forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            hidden = F.relu(layer(hidden))
        return self.fc6(hidden)
    
    
from torch.utils.data import Dataset

class data_loader(Dataset):
    """
    Minimal torch Dataset pairing a feature table with its targets.
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: feature table; its `.values` attribute is stored
                          (e.g. a pandas DataFrame)
        @param target_list: targets, indexable and of the same length

        """
        features = data_list.values
        self.data_list = features
        self.target_list = target_list
        assert (len(features) == len(target_list))

    def __len__(self):
        """Number of rows in the stored feature array."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]; returns [features, target] for
        that key (a slice such as dataset[:] returns whole arrays).
        """
        return [self.data_list[key], self.target_list[key]]


In [21]:

    
emb_dim = 61
net_large = Net(emb_dim)
net_large.load_state_dict(torch.load(models + "model_nn_highpol.pt"))
net_large.eval()

# One label for all four splits, so the first element of every stored entry
# equals the dictionary key it is stored under.
nn_name = "nn_10_pk"

# NOTE(review): targets come from the test_data_XX frames while the features
# come from the matching *_highpol files -- this assumes both files list the
# same items in the same order; confirm.
for data, target, performance_dic in (
    (X_test_data_98_highpk, test_data_98_target, performance_dic_98),
    (X_test_data_90_highpk, test_data_90_target, performance_dic_90),
    (X_test_data_75_highpk, test_data_75_target, performance_dic_75),
    (X_test_data_10_highpk, test_data_10_target, performance_dic_10),
):
    # Preparing the data: boolean target -> 0/1 array.
    y_data_bin_large = np.where(target, 1, 0)
    nn_list_large = [nn_name]

    # Standardise the numeric columns.
    # NOTE(review): the scaler is fit on the split being evaluated, not on
    # training-set statistics -- confirm this matches how the model was trained.
    test = StandardScaler()
    test.fit(data[numeric_features])
    X_train_stand_large = data.copy()
    X_train_stand_large[numeric_features] = test.transform(data[numeric_features])

    # Despite the "train" in the names, this is evaluation data.
    train_dat_nn_large = data_loader(X_train_stand_large, y_data_bin_large)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        nn_preds = torch.sigmoid(net_large(torch.tensor(train_dat_nn_large[:][0]).float()))
    # Threshold the probabilities at 0.5 and flatten to a 1-D 0/1 array.
    nn_preds = np.where(nn_preds > .5, 1, 0).reshape(-1,)
    all_results = list(conf_eval(nn_preds, y_data_bin_large))

    nn_list_large.extend(all_results)
    performance_dic[nn_name] = nn_list_large



In [22]:
# Raw stored entry for the high-pol network on the 10 split.
performance_dic_10['nn_10_pk']

['nn_10_pk', 0.4709473684210526, 3656, 4894, 818, 132]

In [23]:
# Everything recorded so far for the 90 split, keyed by model name.
performance_dic_90

{'randomforest_10': ['randomforest_10',
  0.7349194652068639,
  359,
  590,
  6622,
  1928],
 'nn_10_pk': ['nn_25', 0.7534477313401411, 486, 463, 6671, 1879]}

### Neural Network
### Large crowd
### size 25


In [24]:
# Columns to standardise for the large-crowd feature matrices: every
# 'resp_veracity_*' column, every column containing 'new', and the six crowd
# summary statistics.
resp_var = [col for col in X_test_data_98_large.columns if 'resp_veracity_' in col]
new_cols = [col for col in X_test_data_98_large.columns if 'new' in col]

numeric_features_large = resp_var + new_cols + [
    "crowd_means", 'crowd_median', 'crowd_full_range',
    'crowd_IQR_range', 'crowd_variance', 'crowd_bayes',
]

In [25]:

emb_dim = 121
net_large = Net(emb_dim)
net_large.load_state_dict(torch.load(models + "model_large.pt"))
net_large.eval()

# Score the large-crowd network on each split in turn (98, 90, 75, 10); the
# result list for a split is stored under 'nn_25' in that split's dictionary.
for data, target, performance_dic in (
    (X_test_data_98_large, test_data_98_target, performance_dic_98),
    (X_test_data_90_large, test_data_90_target, performance_dic_90),
    (X_test_data_75_large, test_data_75_target, performance_dic_75),
    (X_test_data_10_large, test_data_10_target, performance_dic_10),
):
    # Boolean target -> 0/1 array.
    y_data_bin_large = np.where(target == True, 1, 0)
    nn_list_large = ["nn_25"]

    # Standardise the numeric columns with a scaler fit on this split.
    test = StandardScaler().fit(data[numeric_features_large])
    X_train_stand_large = data.copy()
    X_train_stand_large[numeric_features_large] = test.transform(data[numeric_features_large])

    # Wrap features/targets, then run the whole split through the network.
    train_dat_nn_large = data_loader(X_train_stand_large, y_data_bin_large)
    split_features = torch.tensor(train_dat_nn_large[:][0]).float()
    nn_preds = torch.sigmoid(net_large(split_features))
    # Threshold at 0.5 and flatten to a 1-D 0/1 array.
    nn_preds = np.where(nn_preds > .5, 1, 0).reshape(-1,)
    all_results = list(conf_eval(nn_preds, y_data_bin_large))

    nn_list_large.extend(all_results)
    performance_dic['nn_25'] = nn_list_large



In [26]:
# Large-crowd network, 10 split: raw stored entry, then within-class rates.
print(performance_dic_10['nn_25'])
conf_perc(performance_dic_10['nn_25'])

['nn_25', 0.4708421052631579, 3644, 4906, 829, 121]
TP FP TN FN


(0.42619883040935674,
 0.5738011695906433,
 0.8726315789473684,
 0.12736842105263158)

In [27]:
# Large-crowd network, 75 split: raw stored entry, then within-class rates.
print(performance_dic_75['nn_25'])
conf_perc(performance_dic_75['nn_25'])

['nn_25', 0.7433684210526316, 1249, 1126, 5813, 1312]
TP FP TN FN


(0.5258947368421053,
 0.47410526315789475,
 0.8158596491228071,
 0.184140350877193)

In [28]:
# Large-crowd network, 90 split: raw stored entry, then within-class rates.
print(performance_dic_90['nn_25'])
conf_perc(performance_dic_90['nn_25'])

['nn_25', 0.77260764290978, 527, 422, 6812, 1738]
TP FP TN FN


(0.5553213909378293,
 0.4446786090621707,
 0.7967251461988304,
 0.20327485380116958)

In [29]:
# Large-crowd network, 98 split: raw stored entry, then within-class rates.
print(performance_dic_98['nn_25'])
conf_perc(performance_dic_98['nn_25'])

['nn_25', 0.7830526315789473, 101, 89, 7338, 1972]
TP FP TN FN


(0.531578947368421,
 0.46842105263157896,
 0.7881847475832439,
 0.21181525241675617)

In [30]:
# High-pol network, 10 split: raw stored entry, then within-class rates.
print(performance_dic_10['nn_10_pk'])
conf_perc(performance_dic_10['nn_10_pk'])

['nn_10_pk', 0.4709473684210526, 3656, 4894, 818, 132]
TP FP TN FN


(0.42760233918128654,
 0.5723976608187135,
 0.8610526315789474,
 0.13894736842105262)

In [31]:
# High-pol network, 75 split: raw stored entry, then within-class rates.
print(performance_dic_75['nn_10_pk'])
conf_perc(performance_dic_75['nn_10_pk'])

['nn_10_pk', 0.7293684210526316, 1234, 1141, 5695, 1430]
TP FP TN FN


(0.519578947368421,
 0.48042105263157897,
 0.7992982456140351,
 0.2007017543859649)

In [32]:
# High-pol network, 90 split: raw stored entry, then within-class rates.
print(performance_dic_90['nn_10_pk'])
conf_perc(performance_dic_90['nn_10_pk'])

['nn_25', 0.7534477313401411, 486, 463, 6671, 1879]
TP FP TN FN


(0.512118018967334,
 0.48788198103266595,
 0.780233918128655,
 0.21976608187134503)

In [33]:
# High-pol network, 98 split: raw stored entry, then within-class rates.
print(performance_dic_98['nn_10_pk'])
conf_perc(performance_dic_98['nn_10_pk'])

['nn_10_pk', 0.765578947368421, 101, 89, 7172, 2138]
TP FP TN FN


(0.531578947368421,
 0.46842105263157896,
 0.7703544575725026,
 0.22964554242749732)