In [42]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import glob
from sklearn import datasets
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn import mixture
import matplotlib.colors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.inspection import permutation_importance
from sklearn.impute import SimpleImputer
import pyodbc
from scipy import stats
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import kruskal
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import RadiusNeighborsClassifier


One Hot Encoding and Normalization


In [43]:
# function to z-score normalize numeric columns and one-hot encode non-numeric columns of a Pandas DataFrame

# inputs: df (Pandas DataFrame), escape (columns that don't need to be normalized or one-hot encoded)
# output: normalized Pandas DataFrame

def norm_one_hot(df, escape=()):
    """Z-score numeric columns and one-hot encode every other column.

    Each column of ``df`` is handled in its original order:

    * columns named in ``escape`` are copied through unchanged;
    * columns whose dtype is one of int16/32/64 or float16/32/64 are
      standardized as ``(x - mean) / std`` (pandas' default sample std);
      when the std is 0 or undefined (e.g. a single row) the column is
      replaced by ``x * 0`` instead of dividing by it;
    * all remaining columns are expanded with ``pd.get_dummies`` using
      ``drop_first=True``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    escape : iterable of column labels, optional
        Columns that must be neither normalized nor one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        The processed columns concatenated side by side. A frame without
        columns yields an empty frame that keeps ``df``'s index.

    Notes
    -----
    Dummy columns are named after the category value only (no field-name
    prefix), so two categorical fields sharing a value produce duplicate
    column labels in the result.
    """
    # pd.concat raises ValueError when given nothing to concatenate.
    if len(df.columns) == 0:
        return pd.DataFrame(index=df.index)

    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    data_list = []
    for field in df.columns:
        column = df[field]
        if field in escape:
            field_df = column
        elif column.dtype in numeric_dtypes:
            std = column.std()
            # std is NaN when it cannot be estimated (fewer than two values);
            # treat that like zero variance rather than producing NaN output.
            if std == 0 or pd.isna(std):
                field_df = column * 0
            else:
                field_df = (column - column.mean()) / std  # normalize column
        else:
            field_df = pd.get_dummies(column, drop_first=True)  # one-hot embedding

        data_list.append(field_df)

    # concatenate the per-column results into one frame
    return pd.concat(data_list, axis=1)

Data Imputation 

In [53]:
# Load the filtered tariff modelling data; `df` feeds the feature and target cells below.
df = pd.read_csv('tariff_model_data_FILTERED.csv')

In [55]:
# Build the feature matrix: remove 'mfn' (used as the target Y in the modelling cell)
# together with 'Crop ID' and 'Unnamed: 0', then normalize / one-hot encode every
# column except the pass-through ones listed here.
passthrough_columns = [
    'democracy level', 'production quantity', 'edible', 'modified', 'fresh', 'frozen',
    'shelled', 'dried', 'chilled', 'seed', 'ground', 'preserved',
]
X = norm_one_hot(df.drop(columns=['mfn', 'Crop ID', 'Unnamed: 0']), passthrough_columns)
col_names = X.columns.tolist()

In [64]:
# Fit a linear-regression baseline that predicts 'mfn' from the encoded feature
# matrix X and evaluate it on a 20% hold-out split (fixed seed for repeatability).
Y = df['mfn']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)


print(model)
# LinearRegression.score returns R^2 (the coefficient of determination),
# which is not a correlation coefficient.
print('R^2 (coefficient of determination):', model.score(X_test, Y_test))

# mean absolute error
print('Mean absolute error in predictions:', metrics.mean_absolute_error(Y_test, Y_pred))

actual = list(Y_test)
predicted = list(Y_pred)


print('Actual results:', np.array(Y_test))
print('Model-Generated Results:', np.array(Y_pred))

# Check the accuracy of the model: a prediction counts as accurate when its
# absolute difference from the actual value is below `moe` (same units as 'mfn',
# which the messages below label as %).

moe = 5
within_margin = np.abs(np.asarray(actual) - np.asarray(predicted)) < moe
num_accurate = int(within_margin.sum())
num_inaccurate = int(within_margin.size - num_accurate)

print('Accuracy Results: Found with a Margin of Error of ±' + str(moe) + '%. \n')
print('Number of predicted tariffs within ' + str(moe) + '% of actual:', num_accurate)
print('Number of predicted tariffs outside ' + str(moe) + '% of actual:', num_inaccurate)

print('Prediction Accuracy of Model:', round(num_accurate/(num_accurate+num_inaccurate),4))


LinearRegression()
Correlation Coefficient: 0.4647388093016639
Mean abolute error in predictions: 5.553586165970603
Actual results: [ 3.20000005  2.5         2.5        ... 40.          0.
 24.5       ]
Model-Generated Results: [ 3.84478053  4.54487042  7.76609016 ... 17.54856089  0.9594329
  1.37940056]
Accuracy Results: Found with a Margin of Error of 土5%. 

Number of predicted tariffs within 5% of actual: 840
Number of predicted tariffs outside 5% of actual: 513
Prediction Accuracy of Model: 0.6208


In [27]:
# Summarise the distribution of mfn rates: an exact-value frequency table
# (rates rounded to one decimal) plus counts per 5-wide bucket.

df2 = pd.read_csv('../Raw_Data/wits_tariff_2018.csv')
mfn_rates = sorted(df2['mfn_rate'])

# rounded rate -> number of occurrences
freq_dict = {}
for mfn in mfn_rates:
    rounded = round(mfn, 1)
    freq_dict[rounded] = freq_dict.get(rounded, 0) + 1

# Buckets 0..14 each cover a 5-wide range (bucket k holds rates in [5k, 5k+5));
# everything from bucket 15 upward is pooled under '15+'.
age_bins = {bucket: 0 for bucket in range(15)}
age_bins['15+'] = 0

for val in mfn_rates:
    age_bin = val // 5
    age_bins['15+' if age_bin > 14 else age_bin] += 1
print('age bins:', age_bins)


age bins: {0: 263450, 1: 189203, 2: 96791, 3: 38438, 4: 56080, 5: 22042, 6: 9809, 7: 4492, 8: 3333, 9: 3723, 10: 763, 11: 149, 12: 418, 13: 39, 14: 182, '15+': 771}


In [41]:
# Average mfn rate per polyarchy-score band.
# Band k (0-9) collects the tariff rows of countries whose v2x_polyarchy score
# falls in [k/10, (k+1)/10); a score of exactly 1.0 is placed in band 9.
# NOTE(review): assumes scores lie in [0, 1] — confirm against polyarchy_data.csv.
tariff = pd.read_csv('../Raw_Data/wits_tariff_2018.csv')
polyarchy = pd.read_csv('polyarchy_data.csv')

# band number -> list of mfn rates (replaced by the band mean further down)
polyarchy_bins = {band: [] for band in range(10)}


# Label table for the ten bands; not referenced elsewhere in this cell.
bins = {
        '0.01-0.1': [],
        '0.11-0.2': [],
        '0.21-0.3': [],
        '0.31-0.4': [],
        '0.41-0.5': [],
        '0.51-0.6': [],
        '0.61-0.7': [],
        '0.71-0.8': [],
        '0.81-0.9': [],
        '0.91-1.0': [],
    }

# country code -> polyarchy score (a later row wins if a code repeats)
p_dict = dict(zip(polyarchy['country_text_id'], polyarchy['v2x_polyarchy']))

for index, (country, mfn) in enumerate(zip(tariff['iso3code'], tariff['mfn_rate'])):
    if index % 100000 == 0:
        print(index)  # coarse progress indicator
    score = p_dict.get(country)
    # Skip tariff rows whose country has no (or a missing) polyarchy score.
    if score is None or pd.isna(score):
        continue
    # Derive the band arithmetically instead of reading the first decimal digit
    # of str(score), which mis-bins 1.0 and fails on non-'0.x' representations.
    num = min(max(int(score * 10), 0), 9)
    polyarchy_bins[num].append(mfn)

# Collapse each band to its mean rate; a band with no rows becomes NaN
# rather than raising ZeroDivisionError.
polyarchy_bins = {
    band: (sum(rates) / len(rates) if rates else float('nan'))
    for band, rates in polyarchy_bins.items()
}

print(polyarchy_bins)
                

0
100000
200000
300000
400000
500000
600000
{0: 7.049446243843255, 1: 8.321319097385196, 2: 9.640762712663422, 3: 11.989473327465635, 4: 7.9513759605110055, 5: 8.191305188783032, 6: 8.486187867841618, 7: 6.163457859264203, 8: 5.184417311718916, 9: 4.888306904028574}
