In [5]:
import pandas as pd
import os

def get_gtd_df(path='globalterrorismdb_0522dist.xlsx'):
    """Load the Global Terrorism Database (GTD) Excel export into a DataFrame.

    The workbook is not shipped with this notebook; it can be retrieved by
    registering with https://www.start.umd.edu/gtd/.

    Args:
        path: location of the GTD workbook. Defaults to
            'globalterrorismdb_0522dist.xlsx' in the current working
            directory, which is the file this notebook was written against.

    Returns:
        The DataFrame that ``pd.read_excel`` produces for that workbook,
        read with pandas' default options.
    """
    return pd.read_excel(path)


def clean_gt_df(df, max_nulls=20_000):
    """Drop every column that has more than ``max_nulls`` missing values.

    Args:
        df: DataFrame to clean, e.g. the output of get_gtd_df(). It is not
            modified; a new DataFrame is returned.
        max_nulls: largest number of null (NaN/None) values a column may
            contain and still be kept. Defaults to 20_000, the cutoff this
            notebook uses for the GTD export.

    Returns:
        A new DataFrame holding only the columns whose null count is
        <= ``max_nulls``, in their original order.
    """
    # One vectorised pass over the frame instead of a Python loop per column.
    null_counts = df.isna().sum()
    sparse_cols = null_counts[null_counts > max_nulls].index
    return df.drop(columns=sparse_cols)

def get_and_clean_gtd():
    """Build the labelled GTD frame and write it to 'maximum_gtd_df.csv'.

    Steps, in order:
      1. load the raw workbook with get_gtd_df() and drop sparse columns
         with clean_gt_df();
      2. keep only the groups (``gname``) that have more than 100 rows;
      3. rename the GTD columns to shorter names;
      4. drop rows whose day is 0, which cannot form a calendar date;
      5. build a ``date`` column from year/month/day and make it the index;
      6. drop rows whose ``gname`` is 'Unknown';
      7. write the result to 'maximum_gtd_df.csv' (index included).

    Returns:
        The cleaned DataFrame, indexed by ``date``.

    NOTE(review): an identical definition of this function appears again
    later in this notebook (cell In [25]); the later one shadows this one.

    NOTE(review): renaming 'country_txt' -> 'country' and 'region_txt' ->
    'region' yields duplicate column labels whenever the numeric
    'country'/'region' code columns are still present (the df.info() output
    in this notebook lists both labels twice). The schema is left unchanged
    here; consider renaming the code columns (e.g. 'country_n', 'region_n').
    """
    df = clean_gt_df(get_gtd_df())

    # Keep only groups with more than 100 recorded events. The group size is
    # measured here, before the day == 0 and 'Unknown' rows are removed.
    df = df.groupby('gname').filter(lambda grp: len(grp) > 100)

    # Rename columns.
    df = df.rename(columns={
        'eventid': 'event_id',
        'iyear': 'year',
        'imonth': 'month',
        'iday': 'day',
        'country_txt': 'country',
        'region_txt': 'region',
        'attacktype1': 'attacktype_n',
        'attacktype1_txt': 'attack_type',
        'targtype1': 'targtype_n',
        'targtype1_txt': 'targtype',
        'targsubtype1': 'targsubtype_n',
        'targsubtype1_txt': 'targsubtype',
        'corp': 'target_id',
        'target1': 'target_description',
        'natlty1_txt': 'nationality',
        'weaptype1_txt': 'weaptype',
        'weapsubtype1_txt': 'weapdesc',
    })

    # Remove impossible dates (day == 0). The explicit .copy() makes the
    # filtered frame independent, so adding the 'date' column below does not
    # raise SettingWithCopyWarning.
    df = df[df.day != 0].copy()

    # Create the datetime column from the year/month/day parts.
    df['date'] = pd.to_datetime(df.year.astype(str) + '/' + df.month.astype(str) + '/' + df.day.astype(str))

    # Set the index.
    df = df.set_index('date')

    # Drop rows (not columns) where the group is recorded as 'Unknown'.
    df = df[df.gname != 'Unknown']

    # Persist the cleaned frame; the 'date' index is written as the first column.
    df.to_csv('maximum_gtd_df.csv')
    return df

def get_maximum_df(use_cache=True):
    """Return the cleaned GTD frame, reading the CSV cache when it exists.

    Rows without an attributed group ('Unknown' gname) are removed by
    get_and_clean_gtd(), which builds the frame when no cache is used.

    Args:
        use_cache: when True (default) and 'maximum_gtd_df.csv' exists in
            the current working directory, load that file instead of
            rebuilding the frame from the raw workbook.

    Returns:
        A DataFrame indexed by ``date`` in both branches.

    Raises:
        ValueError: from ``pd.read_csv`` if the cached file has no 'date'
            column (i.e. it was not written with the date index).
    """
    # The literal previously contained embedded quote characters
    # ("'maximum_gtd_df.csv'"), so the isfile() check could never succeed and
    # the cache was never used.
    filename = 'maximum_gtd_df.csv'
    if use_cache and os.path.isfile(filename):
        # Return a DataFrame (not ``.values``) so both branches hand back the
        # same type, and restore the 'date' column as a DatetimeIndex.
        # NOTE(review): if the cached frame had duplicate column labels,
        # read_csv will presumably load them with suffixes such as
        # 'country.1' — confirm downstream code copes with that.
        return pd.read_csv(filename, index_col='date', parse_dates=['date'])

    # No usable cache: rebuild from the raw data.
    df = get_and_clean_gtd()
    # get_and_clean_gtd() currently writes this file too; the write is kept
    # here so the cache does not silently depend on that side effect.
    df.to_csv(filename)
    return df


def get_perpetrator_df():
    """Return the smaller, fully populated frame used for the initial model.

    Reads 'final_df.csv' from the current working directory, keeps only the
    groups (``gname``) with more than 300 rows, renames the GTD columns,
    drops rows whose day is 0, indexes the frame by a ``date`` built from
    year/month/day, and finally drops every row that still contains a null.

    Returns:
        A DataFrame indexed by ``date`` with no null values.
    """
    # Obtain the data and keep only groups with more than 300 recorded events.
    df = pd.read_csv('final_df.csv')
    df = df.groupby('gname').filter(lambda grp: len(grp) > 300)

    # Rename columns.
    df = df.rename(columns={
        'eventid': 'event_id',
        'iyear': 'year',
        'imonth': 'month',
        'iday': 'day',
        'country_txt': 'country',
        'region_txt': 'region',
        'attacktype1': 'attacktype_n',
        'attacktype1_txt': 'attack_type',
        'targtype1': 'targtype_n',
        'targtype1_txt': 'targtype',
        'targsubtype1': 'targsubtype_n',
        'targsubtype1_txt': 'targsubtype',
        'corp': 'target_id',
        'target1': 'target_description',
        'natlty1_txt': 'nationality',
        'weaptype1_txt': 'weaptype',
        'weapsubtype1_txt': 'weapdesc',
    })

    # Remove impossible dates (day == 0). The explicit .copy() makes the
    # filtered frame independent, so adding the 'date' column below does not
    # raise SettingWithCopyWarning.
    df = df[df.day != 0].copy()

    # Create the datetime column from the year/month/day parts.
    df['date'] = pd.to_datetime(df.year.astype(str) + '/' + df.month.astype(str) + '/' + df.day.astype(str))

    # Index by date, then drop any row that still has a missing value.
    df = df.set_index('date')
    df = df.dropna()

    return df

# TODO: How is the tree making decisions? Print the tree diagram --
# this is a highly interpretable model.
# TODO: Consider using KNN to impute nulls.


In [6]:
# df = get_and_clean_gtd()

In [7]:
# df.head()

In [8]:

# Step-by-step version of the first half of get_and_clean_gtd(), kept so the
# intermediate frame can be inspected: load, drop sparse columns, keep groups
# with more than 100 rows, then rename the GTD columns.
column_names = {
    'eventid': 'event_id',
    'iyear': 'year',
    'imonth': 'month',
    'iday': 'day',
    'country_txt': 'country',
    'region_txt': 'region',
    'attacktype1': 'attacktype_n',
    'attacktype1_txt': 'attack_type',
    'targtype1': 'targtype_n',
    'targtype1_txt': 'targtype',
    'targsubtype1': 'targsubtype_n',
    'targsubtype1_txt': 'targsubtype',
    'corp': 'target_id',
    'target1': 'target_description',
    'natlty1_txt': 'nationality',
    'weaptype1_txt': 'weaptype',
    'weapsubtype1_txt': 'weapdesc',
}
df = clean_gt_df(get_gtd_df())
df = df.groupby('gname').filter(lambda grp: len(grp) > 100)
df = df.rename(columns=column_names)
df.head()


Unnamed: 0,event_id,year,month,day,extended,country,country.1,region,region.1,provstate,...,weaptype,nkill,nwound,property,ishostkid,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY
2,197001000001,1970,1,0,0,160,Philippines,5,Southeast Asia,Tarlac,...,Unknown,1.0,0.0,0,0.0,PGIS,-9,-9,1,1
3,197001000002,1970,1,0,0,78,Greece,8,Western Europe,Attica,...,Explosives,,,1,0.0,PGIS,-9,-9,1,1
4,197001000003,1970,1,0,0,101,Japan,4,East Asia,Fukouka,...,Incendiary,,,1,0.0,PGIS,-9,-9,1,1
7,197001020002,1970,1,2,0,217,United States,1,North America,California,...,Explosives,0.0,0.0,1,0.0,Hewitt Project,-9,-9,0,-9
11,197001060001,1970,1,6,0,217,United States,1,North America,Colorado,...,Incendiary,0.0,0.0,1,0.0,Hewitt Project,-9,-9,0,-9


In [9]:
# Remove impossible dates (day == 0). The explicit .copy() makes the filtered
# frame independent, so adding the 'date' column below does not raise
# SettingWithCopyWarning.
df = df[df.day != 0].copy()
# Create the datetime column from the year/month/day parts.
df['date'] = pd.to_datetime(df.year.astype(str) + '/' + df.month.astype(str) + '/' + df.day.astype(str))
# Use the date as the index.
df = df.set_index('date')
df.head()


Unnamed: 0_level_0,event_id,year,month,day,extended,country,country,region,region,provstate,...,weaptype,nkill,nwound,property,ishostkid,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-01-02,197001020002,1970,1,2,0,217,United States,1,North America,California,...,Explosives,0.0,0.0,1,0.0,Hewitt Project,-9,-9,0,-9
1970-01-06,197001060001,1970,1,6,0,217,United States,1,North America,Colorado,...,Incendiary,0.0,0.0,1,0.0,Hewitt Project,-9,-9,0,-9
1970-01-08,197001080001,1970,1,8,0,98,Italy,8,Western Europe,Lazio,...,Firearms,0.0,0.0,0,0.0,Hijacking DB,-9,-9,1,1
1970-01-09,197001090001,1970,1,9,0,217,United States,1,North America,Michigan,...,Incendiary,0.0,0.0,1,0.0,Hewitt Project,-9,-9,0,-9
1970-01-19,197001190004,1970,1,19,0,217,United States,1,North America,New Jersey,...,Incendiary,0.0,0.0,1,0.0,Hewitt Project,-9,-9,0,-9


In [10]:
# Summarise column dtypes and non-null counts of the date-indexed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 186932 entries, 1970-01-02 to 2020-12-31
Data columns (total 45 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   event_id            186932 non-null  int64  
 1   year                186932 non-null  int64  
 2   month               186932 non-null  int64  
 3   day                 186932 non-null  int64  
 4   extended            186932 non-null  int64  
 5   country             186932 non-null  int64  
 6   country             186932 non-null  object 
 7   region              186932 non-null  int64  
 8   region              186932 non-null  object 
 9   provstate           186932 non-null  object 
 10  city                186516 non-null  object 
 11  latitude            183043 non-null  float64
 12  longitude           183042 non-null  float64
 13  specificity         186932 non-null  float64
 14  vicinity            186932 non-null  int64  
 15  crit1             

In [11]:
# Write the intermediate frame to disk.
# NOTE(review): index=False does not write the index, so if df is still
# indexed by 'date' (as set in cell In [9]) the CSV keeps only the separate
# year/month/day columns — confirm dropping the date index is intended.
df.to_csv('int_maximum_gtd_df.csv', index=False)

In [18]:
# Reload the intermediate CSV (this rebinds df) and count the rows whose
# group name is missing.
df = pd.read_csv('int_maximum_gtd_df.csv')
df.gname.isna().sum()

0

In [19]:
# Row count of the reloaded frame.
len(df)

186932

In [20]:
# Number of events per group, most frequent first.
df.gname.value_counts()

Unknown                                                          91581
Taliban                                                          11973
Islamic State of Iraq and the Levant (ISIL)                       7254
Shining Path (SL)                                                 4541
Al-Shabaab                                                        4412
                                                                 ...  
United Popular Action Movement                                     109
Cooperative for the Development of the Congo (CODECO) militia      107
Right-wing extremists                                              105
National Socialist Council of Nagaland-Isak-Muivah (NSCN-IM)       105
Bangladesh Nationalist Party (BNP)                                 101
Name: gname, Length: 139, dtype: int64

In [21]:
# Drop the rows (not columns) whose gname is 'Unknown', then report how many
# rows remain.
df = df[df.gname!='Unknown']
len(df)

95351

In [23]:
# Number of events per group, most frequent first, for the current df.
df.gname.value_counts()

Taliban                                                          11973
Islamic State of Iraq and the Levant (ISIL)                       7254
Shining Path (SL)                                                 4541
Al-Shabaab                                                        4412
New People's Army (NPA)                                           3375
                                                                 ...  
United Popular Action Movement                                     109
Cooperative for the Development of the Congo (CODECO) militia      107
Right-wing extremists                                              105
National Socialist Council of Nagaland-Isak-Muivah (NSCN-IM)       105
Bangladesh Nationalist Party (BNP)                                 101
Name: gname, Length: 138, dtype: int64

In [24]:
# Sanity check: value_counts() skips nulls, so this total equals len(df) only
# when no gname is missing.
df.gname.value_counts().sum()

95351

In [25]:
def get_and_clean_gtd():
    """Build the labelled GTD frame and write it to 'maximum_gtd_df.csv'.

    NOTE(review): this cell re-defines get_and_clean_gtd, which is already
    defined in the first cell of this notebook; once this cell runs, this
    definition is the one in effect.

    Steps, in order:
      1. load the raw workbook with get_gtd_df() and drop sparse columns
         with clean_gt_df();
      2. keep only the groups (``gname``) that have more than 100 rows;
      3. rename the GTD columns to shorter names;
      4. drop rows whose day is 0, which cannot form a calendar date;
      5. build a ``date`` column from year/month/day and make it the index;
      6. drop rows whose ``gname`` is 'Unknown';
      7. write the result to 'maximum_gtd_df.csv' (index included).

    Returns:
        The cleaned DataFrame, indexed by ``date``.

    NOTE(review): renaming 'country_txt' -> 'country' and 'region_txt' ->
    'region' yields duplicate column labels whenever the numeric
    'country'/'region' code columns are still present (the df.info() output
    in this notebook lists both labels twice). The schema is left unchanged
    here; consider renaming the code columns (e.g. 'country_n', 'region_n').
    """
    df = clean_gt_df(get_gtd_df())

    # Keep only groups with more than 100 recorded events. The group size is
    # measured here, before the day == 0 and 'Unknown' rows are removed.
    df = df.groupby('gname').filter(lambda grp: len(grp) > 100)

    # Rename columns.
    df = df.rename(columns={
        'eventid': 'event_id',
        'iyear': 'year',
        'imonth': 'month',
        'iday': 'day',
        'country_txt': 'country',
        'region_txt': 'region',
        'attacktype1': 'attacktype_n',
        'attacktype1_txt': 'attack_type',
        'targtype1': 'targtype_n',
        'targtype1_txt': 'targtype',
        'targsubtype1': 'targsubtype_n',
        'targsubtype1_txt': 'targsubtype',
        'corp': 'target_id',
        'target1': 'target_description',
        'natlty1_txt': 'nationality',
        'weaptype1_txt': 'weaptype',
        'weapsubtype1_txt': 'weapdesc',
    })

    # Remove impossible dates (day == 0). The explicit .copy() makes the
    # filtered frame independent, so adding the 'date' column below does not
    # raise SettingWithCopyWarning.
    df = df[df.day != 0].copy()

    # Create the datetime column from the year/month/day parts.
    df['date'] = pd.to_datetime(df.year.astype(str) + '/' + df.month.astype(str) + '/' + df.day.astype(str))

    # Set the index.
    df = df.set_index('date')

    # Drop rows (not columns) where the group is recorded as 'Unknown'.
    df = df[df.gname != 'Unknown']

    # Persist the cleaned frame; the 'date' index is written as the first column.
    df.to_csv('maximum_gtd_df.csv')
    return df

In [26]:
# Run the full pipeline (this also rewrites 'maximum_gtd_df.csv') and inspect
# the resulting columns.
df = get_and_clean_gtd()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 95351 entries, 1970-01-06 to 2020-12-31
Data columns (total 45 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   event_id            95351 non-null  int64  
 1   year                95351 non-null  int64  
 2   month               95351 non-null  int64  
 3   day                 95351 non-null  int64  
 4   extended            95351 non-null  int64  
 5   country             95351 non-null  int64  
 6   country             95351 non-null  object 
 7   region              95351 non-null  int64  
 8   region              95351 non-null  object 
 9   provstate           95351 non-null  object 
 10  city                95263 non-null  object 
 11  latitude            92764 non-null  float64
 12  longitude           92764 non-null  float64
 13  specificity         95351 non-null  float64
 14  vicinity            95351 non-null  int64  
 15  crit1               95351 non-null  

In [16]:
# df[df.gname == '']

In [17]:
# # drop columns where gname is null
# df = df.dropna(subset=['gname'])
# df.to_csv('maximum_gtd_df.csv')