In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# TODO: Read in from DB
conflict_path = os.path.join('Resources', 'cleaned_conflict_1.csv')
stocks_path = os.path.join('Resources', 'joined_stocks.csv')

conflict_df = pd.read_csv(conflict_path)
stocks_df = pd.read_csv(stocks_path)

In [3]:
# convert strings to datetimes
stocks_df['Date'] = pd.to_datetime(stocks_df['Date'])
conflict_df['start_date'] = pd.to_datetime(conflict_df['start_date'])
conflict_df['start_date2'] = pd.to_datetime(conflict_df['start_date2'])

In [4]:
# Trying merge
combined_df = conflict_df.merge(stocks_df, how = 'outer', left_on='start_date2', right_on = 'Date')

In [5]:
# gather conflict information that is within the stock availability window.
sorted_stock_dates = stocks_df.Date.sort_values()
available_conflicts = (conflict_df.start_date >= sorted_stock_dates[0]) & (conflict_df.start_date <= sorted_stock_dates.iloc[-1] )

In [6]:
conflict_df[available_conflicts]

Unnamed: 0,location,side_a,side_a_2nd,side_b,side_b_2nd,incompatibility,territory_name,cumulative_intensity,type_of_conflict,start_date,start_date2,ep_end,region
0,India,Government of India,,GNLA,,1,Garoland,0,3,1997-05-29,2012-11-15,1,3
1,India,Government of India,,GNLA,,1,Garoland,0,3,1997-05-29,2014-07-01,1,3
6,Sudan,Government of Sudan,,Republic of South Sudan,,1,Abyei,0,3,2011-05-01,2011-05-19,1,4
7,South Sudan,Government of South Sudan,,"SSDM/A, SSLM/A",,2,,0,3,2011-08-20,2011-08-20,0,4
8,South Sudan,Government of South Sudan,,SSLM/A,,2,,0,3,2011-08-20,2011-08-20,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,Ethiopia,Government of Ethiopia,,SLM,,1,Sidamaland,1,3,1981-03-21,1983-03-16,1,4
2491,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2009-08-29,1,3
2492,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2014-12-22,0,3
2493,Myanmar (Burma),Government of Myanmar (Burma),,MNDAA,,1,Kokang,0,3,2009-08-27,2014-12-22,1,3


In [7]:
# create a function to identify the nearest date to get relevant stock rows
def find_nearest_date(date, stock_dates, row_list, not_used_index):
    index = stock_dates.index.get_loc(date, method = 'nearest')
    row_list.append(index)
    if index in not_used_index:
        not_used_index.remove(index)
    return row_list, not_used_index

In [8]:
# create stock_dates_df which has ordered dates as the index for searching
stock_dates_df = stocks_df.set_index('Date').sort_index()

In [9]:
# create a new DF to append stock data to
available_conflicts_df = conflict_df[available_conflicts].copy()
available_conflicts_df[stock_dates_df.columns] = -999 # default value for easy identification

In [10]:
# create lists for storing stock row indices
stock_rows = []
double_count_index = list(range(stock_dates_df.shape[0])) # used to avoid double counting when merging data later

for index, date in available_conflicts_df.start_date2.iteritems():
    stock_rows, double_count_index = \
    find_nearest_date(date, stock_dates_df, stock_rows, double_count_index)

In [11]:
# insert stock data to available_conflicts_df
available_conflicts_df[stock_dates_df.columns] = stock_dates_df.iloc[stock_rows].values

In [12]:
# create a function to bin the categorical data
def create_bins(df, col, thresh):
    output = df[col].copy()
    frequencies = output.value_counts()
    categories_to_keep = frequencies[:thresh].index
    output = output.apply(lambda x: x if x in categories_to_keep else 'Other')
    return output

In [13]:
available_conflicts_df.dtypes[available_conflicts_df.dtypes == 'object'].index

Index(['location', 'side_a', 'side_a_2nd', 'side_b', 'side_b_2nd',
       'territory_name', 'region'],
      dtype='object')

In [14]:
# get string columns for encoding
string_cols = available_conflicts_df.dtypes[available_conflicts_df.dtypes == 'object'].index
ignore_cols = ['side_a_2nd', 'side_b_2nd', 'region'] # can handle later if necessary
string_cols = string_cols.drop(labels = ignore_cols)

# bin the categorical data 
names_to_keep = 25 # number of bins = names_to_keep + 1
for col in string_cols:
    available_conflicts_df[col] = create_bins(available_conflicts_df, col, names_to_keep)

# hand encode region columns 1,2,3,4,5 before encoding the rest
available_conflicts_df['region_1'] = 0
available_conflicts_df['region_2'] = 0
available_conflicts_df['region_3'] = 0
available_conflicts_df['region_4'] = 0
available_conflicts_df['region_5'] = 0

# store the conflict region strings as a series
conflict_regions = available_conflicts_df.region.copy()

# loop over the conflict_region strings
for row, regions_string in enumerate(conflict_regions):
    
    # split the strings
    regions = regions_string.split(',')
    # extract the row
    row_data = available_conflicts_df.iloc[row,-5:].copy()
    for region in regions:
        row_data['region_' + str(int(region))] = 1
    available_conflicts_df.iloc[row, -5:] = row_data
available_conflicts_df
# encode the data
binned_df = pd.get_dummies(available_conflicts_df, columns = string_cols)

# add a Date column to binned_df for merging
binned_df['Date'] = binned_df.start_date2

In [15]:
# merge on date and stock data
combined_df = binned_df.merge(
    stock_dates_df.iloc[double_count_index],
    on=['Date', 'S&P_500','NASDAQ','Dow_Jones'],
    how = 'outer'
)

# create year, month, day columns
combined_df['year'] = combined_df.Date.apply(lambda date: date.year)
combined_df['month'] = combined_df.Date.apply(lambda date: date.month)
combined_df['day'] = combined_df.Date.apply(lambda date: date.day)

In [16]:
# drop rows that have missing stock data
combined_df = combined_df[combined_df['S&P_500'].notna()]
combined_df = combined_df[combined_df['NASDAQ'].notna()]
combined_df = combined_df[combined_df['Dow_Jones'].notna()]

In [20]:
# drop start_date and start_date2
combined_df = combined_df.drop(columns = ['start_date', 'start_date2'])

# fill in NaNs
combined_df = combined_df.fillna(0)

KeyError: "['start_date' 'start_date2'] not found in axis"

In [18]:
# output csv file
output_path = os.path.join('Resources', 'combined_data_1.csv')
combined_df.to_csv(output_path, index = False)

In [19]:
combined_df

Unnamed: 0,side_a_2nd,side_b_2nd,incompatibility,cumulative_intensity,type_of_conflict,ep_end,region,S&P_500,NASDAQ,Dow_Jones,...,territory_name_Other,territory_name_Punjab/Khalistan,territory_name_Rojava Kurdistan,territory_name_Serb,territory_name_Southern Lebanon,territory_name_Tripura,Date,year,month,day
0,,,1.0,0.0,3.0,1.0,3,1359.88,2853.13,12588.31,...,1.0,0.0,0.0,0.0,0.0,0.0,2012-11-15,2012,11,15
1,,,1.0,0.0,3.0,1.0,3,1985.44,4485.93,17068.26,...,1.0,0.0,0.0,0.0,0.0,0.0,2014-07-01,2014,7,1
2,,,1.0,0.0,3.0,1.0,4,1333.27,2803.32,12512.04,...,1.0,0.0,0.0,0.0,0.0,0.0,2011-05-19,2011,5,19
3,,,2.0,0.0,3.0,0.0,4,1123.53,2341.84,10817.65,...,1.0,0.0,0.0,0.0,0.0,0.0,2011-08-20,2011,8,20
4,,,2.0,0.0,3.0,0.0,4,1123.53,2341.84,10817.65,...,1.0,0.0,0.0,0.0,0.0,0.0,2011-08-20,2011,8,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2600,,,,,,,,2423.41,6140.42,21349.63,...,,,,,,,2017-06-30,2017,6,30
2601,,,,,,,,2472.54,6387.75,21580.07,...,,,,,,,2017-07-21,2017,7,21
2602,,,,,,,,2472.10,6374.68,21830.31,...,,,,,,,2017-07-28,2017,7,28
2603,,,,,,,,2441.32,6256.56,21858.32,...,,,,,,,2017-08-11,2017,8,11
