In [1]:
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2

In [2]:
# TODO: Read in from DB
# Until then, both source tables are loaded from CSV files under Resources/.
conflict_path = os.path.join('Resources', 'cleaned_conflict_1.csv')
stocks_path = os.path.join('Resources', 'joined_stocks.csv')

conflict_df, stocks_df = map(pd.read_csv, (conflict_path, stocks_path))

In [3]:
# Parse the date columns (read from CSV) into datetimes so they can be
# compared and matched against each other.
stocks_df['Date'] = pd.to_datetime(stocks_df['Date'])
for date_col in ('start_date', 'start_date2'):
    conflict_df[date_col] = pd.to_datetime(conflict_df[date_col])

In [4]:
# Trying merge: exact-date outer join of conflicts (start_date2) with stocks (Date).
combined_df = pd.merge(
    conflict_df,
    stocks_df,
    how='outer',
    left_on='start_date2',
    right_on='Date',
)

In [5]:
# Gather conflict information that is within the stock availability window.
# sort_values() keeps the original index labels, so the earliest and latest
# dates must be read by position (.iloc); a plain [0] would look up the row
# labelled 0, which is only the earliest date if the CSV was already sorted.
# NOTE(review): the window is tested on start_date, while the nearest-date
# lookup further down uses start_date2 — confirm this is intended.
sorted_stock_dates = stocks_df.Date.sort_values()
available_conflicts = (
    (conflict_df.start_date >= sorted_stock_dates.iloc[0])
    & (conflict_df.start_date <= sorted_stock_dates.iloc[-1])
)

In [6]:
# Preview the conflicts selected by the availability mask.
conflict_df.loc[available_conflicts]

Unnamed: 0,location,side_a,side_a_2nd,side_b,side_b_2nd,incompatibility,territory_name,cumulative_intensity,type_of_conflict,start_date,start_date2,ep_end,region
0,India,Government of India,,GNLA,,1,Garoland,0,3,1997-05-29,2012-11-15,1,3
1,India,Government of India,,GNLA,,1,Garoland,0,3,1997-05-29,2014-07-01,1,3
6,Sudan,Government of Sudan,,Republic of South Sudan,,1,Abyei,0,3,2011-05-01,2011-05-19,1,4
7,South Sudan,Government of South Sudan,,"SSDM/A, SSLM/A",,2,,0,3,2011-08-20,2011-08-20,0,4
8,South Sudan,Government of South Sudan,,SSLM/A,,2,,0,3,2011-08-20,2011-08-20,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,Ethiopia,Government of Ethiopia,,SLM,,1,Sidamaland,1,3,1981-03-21,1983-03-16,1,4
2491,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2009-08-29,1,3
2492,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2014-12-22,0,3
2493,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2014-12-22,1,3


In [7]:
# create a function to identify the nearest date to get relevant stock rows
def find_nearest_date(date, stock_dates, row_list, used_index):
    """Record the position of the entry in ``stock_dates.index`` nearest to ``date``.

    Parameters
    ----------
    date
        Value to look up; must be comparable with ``stock_dates.index``.
    stock_dates
        Object whose ``.index`` is searched (the notebook passes a DataFrame
        indexed by date). pandas requires the index to be unique for a
        ``method='nearest'`` lookup.
    row_list : list
        Receives the matched integer position on every call (mutated in place).
    used_index : list
        Receives the matched position only the first time it is seen
        (mutated in place).

    Returns
    -------
    tuple
        ``(row_list, used_index)`` — the same list objects that were passed in.

    Raises
    ------
    KeyError
        If the index yields no match for ``date``.
    """
    # Index.get_loc(..., method='nearest') was removed in pandas 2.0;
    # get_indexer is the supported equivalent and also works on older versions.
    position = int(stock_dates.index.get_indexer([date], method='nearest')[0])
    if position == -1:
        # get_indexer signals "no match" with -1; keep get_loc's KeyError contract.
        raise KeyError(date)
    row_list.append(position)
    if position not in used_index:
        used_index.append(position)
    return row_list, used_index

In [8]:
# Build the lookup frame: stocks keyed by Date, with the index in sorted
# order so nearest-date searches can run against it.
stock_dates_df = (
    stocks_df
    .set_index('Date')
    .sort_index()
)

In [9]:
# Work on an independent copy of the in-window conflicts so that adding the
# stock columns does not touch conflict_df.
available_conflicts_df = conflict_df.loc[available_conflicts].copy()
# Pre-fill every stock column with -999, a default value for easy identification.
available_conflicts_df[stock_dates_df.columns] = -999

In [10]:
# create lists for storing stock row indices
stock_rows = []
double_count_index = [] # used to avoid double counting when merging data later

# For each conflict's start_date2, record the position of the nearest stock row.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for index, date in available_conflicts_df.start_date2.items():
    stock_rows, double_count_index = find_nearest_date(
        date, stock_dates_df, stock_rows, double_count_index
    )

In [11]:
# Overwrite the placeholder stock columns with the values of the matched
# stock rows (stock_rows holds one position per conflict row, in order).
matched_stock_values = stock_dates_df.iloc[stock_rows].values
available_conflicts_df[stock_dates_df.columns] = matched_stock_values

In [12]:
# Display the conflicts together with their matched stock columns.
available_conflicts_df

Unnamed: 0,location,side_a,side_a_2nd,side_b,side_b_2nd,incompatibility,territory_name,cumulative_intensity,type_of_conflict,start_date,start_date2,ep_end,region,S&P_500,NASDAQ,Dow_Jones
0,India,Government of India,,GNLA,,1,Garoland,0,3,1997-05-29,2012-11-15,1,3,1359.88,2853.13,12588.31
1,India,Government of India,,GNLA,,1,Garoland,0,3,1997-05-29,2014-07-01,1,3,1985.44,4485.93,17068.26
6,Sudan,Government of Sudan,,Republic of South Sudan,,1,Abyei,0,3,2011-05-01,2011-05-19,1,4,1333.27,2803.32,12512.04
7,South Sudan,Government of South Sudan,,"SSDM/A, SSLM/A",,2,,0,3,2011-08-20,2011-08-20,0,4,1123.53,2341.84,10817.65
8,South Sudan,Government of South Sudan,,SSLM/A,,2,,0,3,2011-08-20,2011-08-20,0,4,1123.53,2341.84,10817.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,Ethiopia,Government of Ethiopia,,SLM,,1,Sidamaland,1,3,1981-03-21,1983-03-16,1,4,149.90,265.01,1117.74
2491,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2009-08-29,1,3,1028.93,2028.77,9544.20
2492,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2014-12-22,0,3,2070.65,4765.38,17804.80
2493,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2014-12-22,1,3,2070.65,4765.38,17804.80


In [13]:
# Number of in-window conflict rows per location.
available_conflicts_df['location'].value_counts()

India                    121
Turkey                    44
Somalia                   32
Russia (Soviet Union)     32
Algeria                   30
                        ... 
Suriname                   1
Togo                       1
South Yemen                1
Panama                     1
Ecuador, Peru              1
Name: location, Length: 89, dtype: int64

In [14]:
# Third quartile of the per-location row counts.
frequencies = available_conflicts_df['location'].value_counts()
frequencies.describe()['75%']

10.0

In [15]:
# create a function to bin the categorical data
def create_bins(df, col, thresh):
    """Return a copy of ``df[col]`` with infrequent values replaced by 'Other'.

    The first ``thresh`` entries of ``df[col].value_counts()`` are kept
    unchanged; every other value in the column becomes the string 'Other'.
    ``df`` itself is not modified.
    """
    column_values = df[col].copy()
    # value_counts() is ordered by descending frequency, so the leading slice
    # holds the categories to keep.
    categories_to_keep = column_values.value_counts()[:thresh].index

    def _keep_or_other(value):
        if value in categories_to_keep:
            return value
        return 'Other'

    return column_values.apply(_keep_or_other)

In [16]:
# get string columns for encoding
# Object-dtype columns are the candidates; the columns listed in ignore_cols
# are excluded from them.
ignore_cols = ['side_a_2nd', 'side_b_2nd'] # can handle later if necessary
string_cols = available_conflicts_df.dtypes[available_conflicts_df.dtypes == 'object'].index
# Drop via ignore_cols itself (not a repeated literal) so the list has one source of truth.
string_cols = string_cols.drop(labels = ignore_cols)

In [17]:
# Bin the categorical data: in each string column keep the most frequent
# names and collapse the rest into 'Other'.
names_to_keep = 25 # number of bins = names_to_keep + 1
binned_columns = {
    col: create_bins(available_conflicts_df, col, names_to_keep)
    for col in string_cols
}
for col, binned_values in binned_columns.items():
    available_conflicts_df[col] = binned_values

In [23]:
# One-hot encode the binned string columns. The result is only displayed
# here; it is not assigned to a variable.
pd.get_dummies(data=available_conflicts_df, columns=string_cols)

Unnamed: 0,side_a_2nd,side_b_2nd,incompatibility,cumulative_intensity,type_of_conflict,start_date,start_date2,ep_end,S&P_500,NASDAQ,...,territory_name_Tripura,region_1,"region_1, 2, 3, 5","region_1, 3","region_1, 3, 5","region_1, 5",region_2,region_3,region_4,region_5
0,,,1,0,3,1997-05-29,2012-11-15,1,1359.88,2853.13,...,0,0,0,0,0,0,0,1,0,0
1,,,1,0,3,1997-05-29,2014-07-01,1,1985.44,4485.93,...,0,0,0,0,0,0,0,1,0,0
6,,,1,0,3,2011-05-01,2011-05-19,1,1333.27,2803.32,...,0,0,0,0,0,0,0,0,1,0
7,,,2,0,3,2011-08-20,2011-08-20,0,1123.53,2341.84,...,0,0,0,0,0,0,0,0,1,0
8,,,2,0,3,2011-08-20,2011-08-20,0,1123.53,2341.84,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,,,1,1,3,1981-03-21,1983-03-16,1,149.90,265.01,...,0,0,0,0,0,0,0,0,1,0
2491,,,1,0,3,2009-08-27,2009-08-29,1,1028.93,2028.77,...,0,0,0,0,0,0,0,1,0,0
2492,,,1,0,3,2009-08-27,2014-12-22,0,2070.65,4765.38,...,0,0,0,0,0,0,0,1,0,0
2493,,,1,0,3,2009-08-27,2014-12-22,1,2070.65,4765.38,...,0,0,0,0,0,0,0,1,0,0
