In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
%matplotlib inline

In [2]:
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
from plotly import tools

In [3]:
pd.set_option('display.max_columns', 50)

In [4]:
data=pd.read_csv('ACLED_data_India.csv')

In [5]:
# load codes description data
inter_codes=pd.read_csv('Inter_codes.csv')
geo_precision_codes=pd.read_csv('geo_precision_code.csv')
time_precision_codes=pd.read_csv('time_precision_code.csv')

In [6]:
# Data cleaning: 1. drop non-important features
#                2. fix null values
#                3. for columns source and source scale split the combined strings                    
#                3. process source, source_scale, actors, date features to create new features

In [7]:
# Count unique values for every column
unique_count=pd.Series()
for column in data.columns:
    unique_count[column]=data[column].unique().size
# drop columns with unique value 1
for column in unique_count.index:
    if unique_count[column]==1:
        data.drop(columns=column,inplace=True)
# drop other non-relevent features
features_to_drop=['country','data_id','event_id_no_cnty','timestamp','year','interaction']
Data1=data.drop(columns=features_to_drop)
Data1['event_date_formatted']=pd.to_datetime(Data1['event_date'])
Data1.drop(columns='event_date',inplace=True)

In [8]:
Data1.columns

Index(['event_id_cnty', 'time_precision', 'event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'admin1', 'admin2', 'admin3', 'location', 'latitude', 'longitude',
       'geo_precision', 'source', 'source_scale', 'notes', 'fatalities',
       'event_date_formatted'],
      dtype='object')

In [9]:
# Process source and source-scale, actors features
Data1.source_scale.value_counts()

National                     23839
Subnational                  18882
Regional                       892
Subnational-National           345
National-Regional               88
International                   81
Other                           80
Subnational-Regional            79
Subnational-Other                4
Subnational-International        3
National-International           2
National; National               1
National-Other                   1
Other-National                   1
Subnational- National            1
National-Subnational             1
Regional-Other                   1
regional                         1
Name: source_scale, dtype: int64

In [10]:
Data1.replace('regional','Regional',inplace=True) 

In [11]:
#Need splitting Columns: source_scale 
SplitNeeded_for_list= ['Subnational-National','National-Regional',
'Subnational-Regional','Subnational-Other','Subnational-International',
'National-International','National-Other','National-Subnational',
'Subnational- National','Regional-Other','Other-National']

In [12]:
def Source_Split(datacut):
    SourceScaleSplit=pd.DataFrame(datacut.source_scale.str.split(pat="-"))
    SourceSplit=pd.DataFrame(datacut.source.str.split(pat=";"))
    FinalSplit=pd.concat([SourceScaleSplit,SourceSplit],axis=1)
    datacut=datacut.drop(columns=['source','source_scale'])
    datacut=datacut.join(FinalSplit)
    return datacut

In [13]:
for SplitNeeded in SplitNeeded_for_list:
    datacut=Data1[Data1.source_scale==SplitNeeded]    
    Data1.drop(index=datacut.index,inplace=True)
    Data1=Data1.append(Source_Split(datacut))


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [14]:
Data1.replace(' National','National',inplace=True) 

In [15]:
datacut=Data1[Data1.source_scale=='National; National']
SourceScaleSplit=pd.DataFrame(datacut.source_scale.str.split(pat="; "))
SourceSplit=pd.DataFrame(datacut.source.str.split(pat=";"))
FinalSplit=pd.concat([SourceScaleSplit,SourceSplit],axis=1)
datacut=datacut.drop(columns=['source','source_scale'])
datacut=datacut.join(FinalSplit)
Data1.drop(index=datacut.index,inplace=True)
Data1=Data1.append(datacut)

In [16]:
# Check for null values and fix the columns
Null_list=Data1.isnull().sum()
Null_list[Null_list>0]

actor2           31780
admin2              11
admin3            1594
assoc_actor_1    16120
assoc_actor_2    40911
source_scale        77
dtype: int64

In [17]:
Source_missing_scale=Data1['source'][Data1.source_scale.isnull()].unique()
Source_missing_scale_filllist=pd.Series()
for source_name in Source_missing_scale:
    Source_missing_scale_filllist[source_name]=Data1[Data1['source']==source_name]['source_scale'].dropna().unique()
Source_missing_scale_filllist

Telegraph (India)                                           [National]
Sangai Express (India)                         [Subnational, National]
Times of India                                 [National, Subnational]
Asian News International           [Regional, International, National]
Chandigarh Tribune                             [Subnational, National]
Indian Express                       [National, Subnational, Regional]
Pioneer (India)                                [National, Subnational]
Hindustan Times (India)                        [National, Subnational]
Pioneer (India); Times of India                                     []
dtype: object

In [18]:
Index_of_NaN=Data1.loc[Data1['source']=='Telegraph (India)'][Data1.source_scale.isnull()].index
Data1.loc[Index_of_NaN,'source_scale']='National'
Index=Data1.loc[Data1['source']=='Sangai Express (India)'][Data1.source_scale=='National'].index
Data1.loc[Index,'source_scale']='Subnational'
Index_of_NaN=Data1.loc[Data1['source']=='Sangai Express (India)'][Data1.source_scale.isnull()].index
Data1.loc[Index_of_NaN,'source_scale']='Subnational'
Index=Data1.loc[Data1['source']=='Times of India'][Data1.source_scale=='Subnational'].index
Data1.loc[Index,'source_scale']='National'
Index_of_NaN=Data1.loc[Data1['source']=='Times of India'][Data1.source_scale.isnull()].index
Data1.loc[Index_of_NaN,'source_scale']='National'
Index=Data1.loc[Data1['source']=='Pioneer (India); Times of India'].index
Data1.loc[Index,'source_scale']='National-National'
datacut=Data1[Data1.source_scale=='National-National']    
Data1.drop(index=datacut.index,inplace=True)
Data1=Data1.append(Source_Split(datacut))
Index=Data1.loc[Data1['source']=='Pioneer (India)'][Data1.source_scale=='Subnational'].index
Data1.loc[Index,'source_scale']='National'
Index_of_NaN=Data1.loc[Data1['source']=='Pioneer (India)'][Data1.source_scale.isnull()].index
Data1.loc[Index_of_NaN,'source_scale']='National'
Index=Data1.loc[Data1['source']=='Hindustan Times (India)'][Data1.source_scale=='Subnational'].index
Data1.loc[Index,'source_scale']='National'
Index_of_NaN=Data1.loc[Data1['source']=='Hindustan Times (India)'][Data1.source_scale.isnull()].index
Data1.loc[Index_of_NaN,'source_scale']='National'
Index=Data1.loc[Data1['source']=='Indian Express'][Data1.source_scale=='Regional'].index
Data1.loc[Index,'source_scale']='National'
Index=Data1.loc[Data1['source']=='Indian Express'][Data1.source_scale=='Subnational'].index
Data1.loc[Index,'source_scale']='National'
Index_of_NaN=Data1.loc[Data1['source']=='Indian Express'][Data1.source_scale.isnull()].index
Data1.loc[Index_of_NaN,'source_scale']='National'
Source_scale_main=Data1.loc[Data1['source']=='Chandigarh Tribune'].source_scale.value_counts().idxmax()
Index=Data1.loc[Data1['source']=='Chandigarh Tribune'].index
Data1.loc[Index,'source_scale']=Source_scale_main
Source_scale_main=Data1.loc[Data1['source']=='Asian News International'].source_scale.value_counts().idxmax()
Index=Data1.loc[Data1['source']=='Asian News International'].index
Data1.loc[Index,'source_scale']=Source_scale_main


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



In [19]:
Actors=pd.Series(index=Data1.index)
Actor2_null_imask=Data1.actor2.notnull()
AssoActor1_null_imask=Data1.assoc_actor_1.notnull()
AssoActor2_null_imask=Data1.assoc_actor_2.notnull()
for idx in Data1.index:
    if Actor2_null_imask[idx] and AssoActor1_null_imask[idx] and AssoActor2_null_imask[idx]:
        Actors[idx]=[Data1.loc[idx,'actor1'],Data1.loc[idx,'actor2'],Data1.loc[idx,'assoc_actor_1'],Data1.loc[idx,'assoc_actor_2']]
    elif Actor2_null_imask[idx] and AssoActor1_null_imask[idx] and not AssoActor2_null_imask[idx]:
        Actors[idx]=[Data1.loc[idx,'actor1'],Data1.loc[idx,'actor2'],Data1.loc[idx,'assoc_actor_1']]
    elif Actor2_null_imask[idx] and not AssoActor1_null_imask[idx] and AssoActor2_null_imask[idx]:
        Actors[idx]=[Data1.loc[idx,'actor1'],Data1.loc[idx,'actor2'],Data1.loc[idx,'assoc_actor_2']]
    elif not Actor2_null_imask[idx] and AssoActor1_null_imask[idx] and AssoActor2_null_imask[idx]:
        Actors[idx]=[Data1.loc[idx,'actor1'],Data1.loc[idx,'assoc_actor_1'],Data1.loc[idx,'assoc_actor_2']]
    elif Actor2_null_imask[idx] and not AssoActor1_null_imask[idx] and not AssoActor2_null_imask[idx]:
        Actors[idx]=[Data1.loc[idx,'actor1'],Data1.loc[idx,'actor2']]
    elif not Actor2_null_imask[idx] and not AssoActor1_null_imask[idx] and AssoActor2_null_imask[idx]:
        Actors[idx]=[Data1.loc[idx,'actor1'],Data1.loc[idx,'assoc_actor_2']]
    elif not Actor2_null_imask[idx] and AssoActor1_null_imask[idx] and not AssoActor2_null_imask[idx]:
        Actors[idx]=[Data1.loc[idx,'actor1'],Data1.loc[idx,'assoc_actor_1']]
    else:
        Actors[idx]=Data1.loc[idx,'actor1']
Data1=Data1.join(pd.DataFrame(Actors,columns=['Actors']))
Data1.drop(columns=['actor1','actor2','assoc_actor_1', 'assoc_actor_2'],inplace=True)

In [20]:
Administration=pd.Series(index=Data1.index)
Admin2_null_imask=Data1.admin2.notnull()
Admin3_null_imask=Data1.admin3.notnull()
for idx in Data1.index:
    if Admin2_null_imask[idx] and Admin3_null_imask[idx]:
        Administration[idx]=[Data1.loc[idx,'admin1'],Data1.loc[idx,'admin2'],Data1.loc[idx,'admin3']]
    elif Admin2_null_imask[idx] and not Admin3_null_imask[idx]:
        Administration[idx]=[Data1.loc[idx,'admin1'],Data1.loc[idx,'admin2']]    
    elif not Admin2_null_imask[idx] and Admin3_null_imask[idx]:
        Administration[idx]=[Data1.loc[idx,'admin1'],Data1.loc[idx,'admin3']]    
    else:
        Administration[idx]=Data1.loc[idx,'admin1']
Data1=Data1.join(pd.DataFrame(Administration,columns=['Administration_levels']))
Data1.drop(columns=['admin2','admin3'],inplace=True)

In [21]:
# New feature created 
#Number of sources
def No_of_source(source):
    if type(source)==list:
        return len(source)
    else:
        return 1
Data1['No_of_sources']=Data1['source'].apply(No_of_source)
#Number of actors
def No_of_Actors(actors):
    if type(actors)==list:
        return len(actors)
    else:
        return 1
Data1['No_of_actors']=Data1['Actors'].apply(No_of_Actors)
# Month number 1 to 37
def month_of(date_formatted):
    return date_formatted.month+12*(date_formatted.year-2016)
Data1['month']=Data1.event_date_formatted.apply(month_of)

In [22]:
# Month-wise event distribution
Month_distribution=pd.DataFrame(Data1.month.value_counts())
Month_distribution['Normalize']=Data1.month.value_counts(normalize=True)

In [23]:
# Normalized Frequency data for states over all months
State_All=Data1.admin1.value_counts(normalize=True)
State_All=pd.DataFrame(State_All).rename(columns={'admin1':'Normalized_Freq'})
State_month=pd.DataFrame(data=None,index=State_All.index)
for m in Month_distribution.index.sort_values():
    Data_m=Data1[Data1.month==m]
    State_month[m]=Data1.admin1[Data1.month==m].value_counts(normalize=True)
State_month.fillna(0,inplace=True)

In [24]:
Statewise_fatalities=pd.DataFrame(Data1.groupby(['admin1']).fatalities.sum())
Statewise_fatalities['Normalize']=pd.DataFrame(Data1.groupby(['admin1']).fatalities.sum()/Data1.fatalities.sum())
del Statewise_fatalities.index.name
State_monthwise_fatalities=pd.DataFrame(data=None,index=State_All.index)
for m in Month_distribution.index.sort_values():
    Data_m=Data1[Data1.month==m]
    State_monthwise_fatalities[m]=Data1[Data1.month==m].groupby(['admin1']).fatalities.sum()
    State_monthwise_fatalities.fillna(0,inplace=True)

In [25]:
def get_Mapdata_slider2(input_data):
    trace=[]
    for m in Month_distribution.index.sort_values():
             trace.append(go.Bar(
                            x=input_data.index,
                            y=input_data[m]
                                )
                          )
    return [trace]
Slider_data2=get_Mapdata_slider2(State_month)

In [28]:
steps = []
for i in range(Month_distribution.index.sort_values().max()):
    step = dict(
                method = 'restyle',  
                args = ['visible', [False] * Month_distribution.index.sort_values().max()],
                label= 'Month ' + str(i+1)
                )
    step['args'][1][i] = True # Toggle i'th trace to "visible"
    steps.append(step)
    sliders = [dict(
                    active = 0,
                    currentvalue = {"prefix": ""},
                    pad = {"t": Month_distribution.index.sort_values().max()},
                    steps = steps,
                    name= 'Month',
                    y= 1.5
                    )]
layout = dict(sliders=sliders,
              title='Plot#1: Monthwise Event Frequency',
              autosize=True,
              yaxis=go.layout.YAxis(
                                    title='Normalized Frequency',
                                    automargin=True,
                                   ),
              xaxis=go.layout.XAxis(
                                    title='States',
                                    automargin=True,
                                   ),
             )
fig = dict(data=Slider_data2[0], layout=layout)
iplot(fig, filename='Slider 2')

In [29]:
Slider_data2=get_Mapdata_slider2(State_monthwise_fatalities)
steps = []
for i in range(Month_distribution.index.sort_values().max()):
    step = dict(
                method = 'restyle',  
                args = ['visible', [False] * Month_distribution.index.sort_values().max()],
                label= 'Month ' + str(i+1)
                )
    step['args'][1][i] = True # Toggle i'th trace to "visible"
    steps.append(step)
    sliders = [dict(
                    active = 0,
                    currentvalue = {"prefix": ""},
                    pad = {"t": Month_distribution.index.sort_values().max()},
                    steps = steps,
                    name= 'Month',
                    y= 1.5
                    )]
layout = dict(sliders=sliders,
              title='Plot#2: Monthwise Fatalities',
              autosize=True,
              yaxis=go.layout.YAxis(
                                    title='Count',
                                    automargin=True,
                                   ),
              xaxis=go.layout.XAxis(
                                    title='States',
                                    automargin=True,
                                   ),
             )
fig = dict(data=Slider_data2[0], layout=layout)
iplot(fig, filename='Slider 2')