# Data Preprocessing

In [169]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
import tensorflow as tf
import keras_tuner as kt

In [170]:
# TODO: Read in from DB
# build both CSV locations inside the Resources folder, then load each file
conflict_path, stocks_path = (
    os.path.join('Resources', file_name)
    for file_name in ('cleaned_conflict_1.csv', 'joined_stocks.csv')
)

conflict_df, stocks_df = (
    pd.read_csv(csv_path) for csv_path in (conflict_path, stocks_path)
)

In [171]:
# parse the date columns of both frames into datetimes
stocks_df['Date'] = pd.to_datetime(stocks_df['Date'])
for date_col in ('start_date', 'start_date2'):
    conflict_df[date_col] = pd.to_datetime(conflict_df[date_col])

In [172]:
# Trying merge: outer join of conflicts and stocks on the conflict's
# start_date2 and the stock Date
combined_df = pd.merge(
    conflict_df,
    stocks_df,
    how = 'outer',
    left_on = 'start_date2',
    right_on = 'Date',
)

In [173]:
# gather conflict information that is within the stock availability window.
sorted_stock_dates = stocks_df.Date.sort_values()
# sort_values() keeps the original row labels, so `sorted_stock_dates[0]` would
# return the date stored under label 0 rather than the earliest date; use
# positional access for both ends of the window.
# NOTE(review): the window is tested against start_date, while the nearest-date
# lookup further down uses start_date2 — confirm this is intended.
available_conflicts = (
    (conflict_df.start_date >= sorted_stock_dates.iloc[0])
    & (conflict_df.start_date <= sorted_stock_dates.iloc[-1])
)

In [174]:
# show the conflicts selected by the availability mask
conflict_df.loc[available_conflicts]

Unnamed: 0,location,side_a,side_a_2nd,side_b,side_b_2nd,incompatibility,territory_name,cumulative_intensity,type_of_conflict,start_date,start_date2,ep_end,region
0,India,Government of India,,GNLA,,1,Garoland,0,3,1997-05-29,2012-11-15,1,3
1,India,Government of India,,GNLA,,1,Garoland,0,3,1997-05-29,2014-07-01,1,3
6,Sudan,Government of Sudan,,Republic of South Sudan,,1,Abyei,0,3,2011-05-01,2011-05-19,1,4
7,South Sudan,Government of South Sudan,,"SSDM/A, SSLM/A",,2,,0,3,2011-08-20,2011-08-20,0,4
8,South Sudan,Government of South Sudan,,SSLM/A,,2,,0,3,2011-08-20,2011-08-20,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,Ethiopia,Government of Ethiopia,,SLM,,1,Sidamaland,1,3,1981-03-21,1983-03-16,1,4
2491,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2009-08-29,1,3
2492,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2014-12-22,0,3
2493,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2014-12-22,1,3


In [175]:
# create a function to identify the nearest date to get relevant stock rows
def find_nearest_date(date, stock_dates, row_list, not_used_index):
    """Record the position of the stock row whose index entry is nearest to `date`.

    Parameters
    ----------
    date
        Value to look up in ``stock_dates.index``.
        (assumes the same type as the index entries, e.g. a Timestamp — TODO confirm)
    stock_dates
        Object whose ``.index`` is searched. The nearest-match lookup requires
        that index to be unique and monotonic.
    row_list : list
        Receives the integer position of the nearest entry (mutated in place).
    not_used_index
        Collection of positions not matched so far; the found position is
        removed from it when present (mutated in place).

    Returns
    -------
    tuple
        ``(row_list, not_used_index)`` — the same objects that were passed in.

    Raises
    ------
    KeyError
        If no nearest entry can be found for `date`.
    """
    # Index.get_loc(..., method='nearest') was removed in pandas 2.0;
    # get_indexer performs the same nearest-match lookup for a list of targets.
    index = int(stock_dates.index.get_indexer([date], method = 'nearest')[0])
    if index == -1:
        # get_indexer signals "no match" with -1; fail loudly instead of letting
        # -1 be used later as a valid position (the last row)
        raise KeyError(date)
    row_list.append(index)
    if index in not_used_index:
        not_used_index.remove(index)
    return row_list, not_used_index

In [176]:
# index the stock data by Date, then order it by that index so it can be searched
stock_dates_df = stocks_df.set_index(keys = 'Date')
stock_dates_df = stock_dates_df.sort_index(axis = 0)

In [177]:
# new frame holding the in-window conflicts plus one placeholder column per
# stock column; -999 is a default value for easy identification
available_conflicts_df = (
    conflict_df.loc[available_conflicts]
    .copy()
    .assign(**dict.fromkeys(stock_dates_df.columns, -999))
)

In [178]:
# create lists for storing stock row indices
stock_rows = []
double_count_index = list(range(stock_dates_df.shape[0])) # used to avoid double counting when merging data later

# look up the nearest stock row for every conflict's start_date2.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for index, date in available_conflicts_df.start_date2.items():
    stock_rows, double_count_index = \
    find_nearest_date(date, stock_dates_df, stock_rows, double_count_index)

In [179]:
# overwrite the placeholder stock columns with the values of the matched stock
# rows (taken by position, in the order the conflicts were looked up)
available_conflicts_df[stock_dates_df.columns] = stock_dates_df.take(stock_rows).to_numpy()

In [182]:
# display the in-window conflicts together with their matched stock columns
available_conflicts_df

Unnamed: 0,location,side_a,side_a_2nd,side_b,side_b_2nd,incompatibility,territory_name,cumulative_intensity,type_of_conflict,start_date,start_date2,ep_end,region,S&P_500,NASDAQ,Dow_Jones
0,India,Government of India,,GNLA,,1,Garoland,0,3,1997-05-29,2012-11-15,1,3,1359.88,2853.13,12588.31
1,India,Government of India,,GNLA,,1,Garoland,0,3,1997-05-29,2014-07-01,1,3,1985.44,4485.93,17068.26
6,Sudan,Government of Sudan,,Republic of South Sudan,,1,Abyei,0,3,2011-05-01,2011-05-19,1,4,1333.27,2803.32,12512.04
7,South Sudan,Government of South Sudan,,"SSDM/A, SSLM/A",,2,,0,3,2011-08-20,2011-08-20,0,4,1123.53,2341.84,10817.65
8,South Sudan,Government of South Sudan,,SSLM/A,,2,,0,3,2011-08-20,2011-08-20,0,4,1123.53,2341.84,10817.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,Ethiopia,Government of Ethiopia,,SLM,,1,Sidamaland,1,3,1981-03-21,1983-03-16,1,4,149.90,265.01,1117.74
2491,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2009-08-29,1,3,1028.93,2028.77,9544.20
2492,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2014-12-22,0,3,2070.65,4765.38,17804.80
2493,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2014-12-22,1,3,2070.65,4765.38,17804.80


In [185]:
# write the selected conflict and stock columns to the geo conflict CSV
available_conflicts_df.to_csv(
    'Resources/geo_conflict_data.csv',
    columns = [
        'location',
        'side_a',
        'side_b',
        'territory_name',
        'start_date',
        'start_date2',
        'region',
        'type_of_conflict',
        'S&P_500',
        'NASDAQ',
        'Dow_Jones',
    ],
)

In [12]:
# create a function to bin the categorical data
def create_bins(df, col, thresh):
    """Return a copy of ``df[col]`` keeping only its most frequent categories.

    The first `thresh` entries of ``value_counts()`` (most frequent first) are
    kept as-is; every other value is replaced with the string 'Other'.
    `df` itself is not modified.
    """
    column = df[col]
    top_categories = column.value_counts().index[:thresh]
    is_top_category = column.isin(top_categories)
    return column.where(is_top_category, 'Other')

In [13]:
# list the columns whose dtype is object
available_conflicts_df.dtypes.loc[lambda kinds: kinds == 'object'].index

Index(['location', 'side_a', 'side_a_2nd', 'side_b', 'side_b_2nd',
       'territory_name', 'region'],
      dtype='object')

In [14]:
# get string columns for encoding
string_cols = available_conflicts_df.dtypes[available_conflicts_df.dtypes == 'object'].index
ignore_cols = ['side_a_2nd', 'side_b_2nd', 'region'] # can handle later if necessary
string_cols = string_cols.drop(labels = ignore_cols)

# bin the categorical data
names_to_keep = 25 # number of bins = names_to_keep + 1
for col in string_cols:
    available_conflicts_df[col] = create_bins(available_conflicts_df, col, names_to_keep)

# store the conflict region strings as a series
conflict_regions = available_conflicts_df.region.copy()

# each region entry is split on commas and every piece is converted with int();
# parse each entry once into a set of region numbers
region_sets = [
    {int(region) for region in regions_string.split(',')}
    for regions_string in conflict_regions
]

# hand encode region columns 1,2,3,4,5 before encoding the rest.
# Each column is built by label (1 when the row lists that region, else 0), so
# the result does not depend on the region columns being the last five columns
# of the frame. Region numbers outside 1-5 do not set any column.
for region_number in range(1, 6):
    available_conflicts_df['region_' + str(region_number)] = [
        int(region_number in row_regions) for row_regions in region_sets
    ]

# encode the data
binned_df = pd.get_dummies(available_conflicts_df, columns = string_cols)

# add a Date column to binned_df for merging
binned_df['Date'] = binned_df.start_date2

In [15]:
# outer-merge the encoded conflicts with the stock rows listed in
# double_count_index, matching on the date and the three stock columns
combined_df = pd.merge(
    binned_df,
    stock_dates_df.iloc[double_count_index],
    how = 'outer',
    on = ['Date', 'S&P_500', 'NASDAQ', 'Dow_Jones'],
)

# create year, month, day columns from each row's Date
for date_part in ('year', 'month', 'day'):
    combined_df[date_part] = combined_df.Date.apply(
        lambda date, part = date_part: getattr(date, part)
    )

In [16]:
# keep only the rows where all three stock columns are present
combined_df = combined_df.dropna(subset = ['S&P_500', 'NASDAQ', 'Dow_Jones'])

In [17]:
# remove the two start-date columns, then replace every remaining NaN with 0
combined_df = (
    combined_df
    .drop(columns = ['start_date', 'start_date2'])
    .fillna(0)
)

In [18]:
# save the combined frame (without its index) into the Resources folder
output_path = os.path.join('Resources', 'combined_data_1.csv')
combined_df.to_csv(path_or_buf = output_path, index = False)

# Starting ML

In [36]:
# drop unnecessary columns from data
# 'Date' is added to ignore_cols only once, so re-running this cell does not
# keep appending duplicate entries to the shared list
if 'Date' not in ignore_cols:
    ignore_cols.append('Date')
data = combined_df.drop(columns = ignore_cols).drop_duplicates()
data

Unnamed: 0,incompatibility,cumulative_intensity,type_of_conflict,ep_end,S&P_500,NASDAQ,Dow_Jones,region_1,region_2,region_3,...,territory_name_Novorossiya,territory_name_Other,territory_name_Punjab/Khalistan,territory_name_Rojava Kurdistan,territory_name_Serb,territory_name_Southern Lebanon,territory_name_Tripura,year,month,day
0,1.0,0.0,3.0,1.0,1359.88,2853.13,12588.31,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2012,11,15
1,1.0,0.0,3.0,1.0,1985.44,4485.93,17068.26,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2014,7,1
2,1.0,0.0,3.0,1.0,1333.27,2803.32,12512.04,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2011,5,19
3,2.0,0.0,3.0,0.0,1123.53,2341.84,10817.65,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2011,8,20
5,2.0,1.0,3.0,0.0,1123.53,2341.84,10817.65,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2011,8,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2600,0.0,0.0,0.0,0.0,2423.41,6140.42,21349.63,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,6,30
2601,0.0,0.0,0.0,0.0,2472.54,6387.75,21580.07,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,7,21
2602,0.0,0.0,0.0,0.0,2472.10,6374.68,21830.31,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,7,28
2603,0.0,0.0,0.0,0.0,2441.32,6256.56,21858.32,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,8,11


In [37]:
# the three stock columns are the targets; every other column is a feature
y_cols = ['S&P_500', 'NASDAQ', 'Dow_Jones']
y = data.loc[:, y_cols]
X = data.drop(y_cols, axis = 1)

In [129]:
# split into train and test sets: train_size of .8, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size = .8,
    random_state = 42,
)

In [130]:
# scale data: fit the scaler on the training features only, then apply the
# same fitted transform to both splits
X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Extra-Trees (Extremely Randomized Trees) Regressor

In [131]:
# create the regressor and train it in one step (fit returns the estimator)
# n_jobs = -1 means use all the processors on your PC to compute the decision trees in parallel
erf = ExtraTreesRegressor(
    n_estimators = 100,
    random_state = 42,
    n_jobs = -1,
).fit(X_train_scaled, y_train)

# make predictions on the scaled test features
y_pred = erf.predict(X_test_scaled)

# convert predictions into a dataframe, rounded to 2 decimals and labelled
# with the same index and columns as y_test
y_pred_df = pd.DataFrame(
    data = np.round(y_pred, 2),
    columns = y_test.columns,
    index = y_test.index,
)

In [167]:
# root-mean-squared error for each target column.
# The square root is taken explicitly instead of passing `squared=False`, a
# keyword that newer scikit-learn releases deprecate and then remove; the
# per-output values are the same.
np.sqrt(mean_squared_error(y_test, y_pred_df, multioutput='raw_values'))

array([ 21.60269483,  67.66537027, 190.34830699])

In [158]:
# R^2 of the predictions against the test targets
r2_score(y_true = y_test, y_pred = y_pred_df)

0.9984808010779482

In [160]:
# largest absolute percentage error per target column
((y_test - y_pred_df).abs() / y_test * 100).max()

S&P_500      16.042319
NASDAQ       14.768135
Dow_Jones    14.155855
dtype: float64

In [143]:
# pair each importance with its feature name and list them, largest first
ranked_features = sorted(zip(erf.feature_importances_, X.columns), reverse=True)
for importance, feature in ranked_features:
    print(feature, importance)

year 0.9762558107982837
side_b_IS 0.015193504604093666
month 0.006506574869404977
day 0.0005466295662906452
type_of_conflict 0.0004031323561902178
incompatibility 0.00017557817001122864
region_4 0.00010819653614431413
side_a_Government of Pakistan 9.262190531018133e-05
location_Pakistan 6.252336691408223e-05
region_3 5.178372981824363e-05
side_a_Government of Syria 3.547526118189102e-05
side_a_Government of South Sudan 3.166504460121676e-05
location_South Sudan 2.9816249858706496e-05
location_Indonesia 2.921557358251708e-05
side_b_Other 2.7264980498826277e-05
location_Ukraine 2.1924731389811493e-05
territory_name_Other 2.061322837676647e-05
location_Rwanda 1.9114512828917417e-05
location_Syria 1.9052005472560278e-05
side_a_Government of Ukraine 1.8724276219903482e-05
side_a_Government of Bosnia-Herzegovina 1.8145508160693683e-05
region_2 1.753983777542195e-05
location_Other 1.6618389280391487e-05
region_1 1.6177577468585877e-05
cumulative_intensity 1.58670982384373e-05
side_a_Governmen

## Neural Network