In [1]:
import pandas as pd

def get_gtd_df(path='globalterrorismdb_0522dist.xlsx'):
    """Load the Global Terrorism Database (GTD) Excel export into a DataFrame.

    The Excel file is not shipped with this notebook; it can be retrieved by
    registering at https://www.start.umd.edu/gtd/.

    Args:
        path: Location of the GTD Excel file. Defaults to
            'globalterrorismdb_0522dist.xlsx' in the working directory.

    Returns:
        The DataFrame parsed by `pd.read_excel` with its default options
        (135 columns for the 0522 distribution file).
    """
    return pd.read_excel(path)

In [2]:
# Load the raw GTD export into `df` and print its schema summary
# (row count, column dtypes, memory usage).
df = get_gtd_df()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209706 entries, 0 to 209705
Columns: 135 entries, eventid to related
dtypes: datetime64[ns](1), float64(54), int64(23), object(57)
memory usage: 216.0+ MB


In [3]:
def clean_gt_df(df, max_nulls=20_000):
    """Drop every column that has more than `max_nulls` missing values.

    Args:
        df: DataFrame to clean, e.g. the result of get_gtd_df(). It is not
            modified; a new frame is returned.
        max_nulls: Largest number of null values a column may contain and
            still be kept. The default of 20,000 is a little under 10% of
            the 209,706 rows in the GTD export used in this notebook.

    Returns:
        A new DataFrame containing only the columns with at most `max_nulls`
        nulls (45 of the original 135 columns for that GTD export).
    """
    # One vectorized null count for the whole frame instead of a per-column loop.
    null_counts = df.isna().sum()
    sparse_cols = null_counts[null_counts > max_nulls].index
    return df.drop(columns=sparse_cols)

In [4]:
# Rebind `df` to the cleaned frame (sparse columns removed) and re-check the schema.
# NOTE: this overwrites the raw frame; re-run the load cell to get it back.
df = clean_gt_df(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209706 entries, 0 to 209705
Data columns (total 45 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   eventid           209706 non-null  int64  
 1   iyear             209706 non-null  int64  
 2   imonth            209706 non-null  int64  
 3   iday              209706 non-null  int64  
 4   extended          209706 non-null  int64  
 5   country           209706 non-null  int64  
 6   country_txt       209706 non-null  object 
 7   region            209706 non-null  int64  
 8   region_txt        209706 non-null  object 
 9   provstate         209706 non-null  object 
 10  city              209280 non-null  object 
 11  latitude          205015 non-null  float64
 12  longitude         205014 non-null  float64
 13  specificity       209705 non-null  float64
 14  vicinity          209706 non-null  int64  
 15  crit1             209706 non-null  int64  
 16  crit2             20

In [5]:
def get_and_clean_gtd():
    """Load the GTD Excel export and drop its sparsely populated columns.

    Convenience wrapper that feeds the result of get_gtd_df() straight into
    clean_gt_df().

    Returns:
        The cleaned DataFrame produced by clean_gt_df().
    """
    return clean_gt_df(get_gtd_df())

In [8]:
# Load and clean in one step via the combined helper, then inspect the schema.
# NOTE: this reads the Excel file from disk again and rebinds `df`.
df = get_and_clean_gtd()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209706 entries, 0 to 209705
Data columns (total 45 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   eventid           209706 non-null  int64  
 1   iyear             209706 non-null  int64  
 2   imonth            209706 non-null  int64  
 3   iday              209706 non-null  int64  
 4   extended          209706 non-null  int64  
 5   country           209706 non-null  int64  
 6   country_txt       209706 non-null  object 
 7   region            209706 non-null  int64  
 8   region_txt        209706 non-null  object 
 9   provstate         209706 non-null  object 
 10  city              209280 non-null  object 
 11  latitude          205015 non-null  float64
 12  longitude         205014 non-null  float64
 13  specificity       209705 non-null  float64
 14  vicinity          209706 non-null  int64  
 15  crit1             209706 non-null  int64  
 16  crit2             20

### further cleaning/preprocessing tasks:
- write df to CSV once it looks the way we want, so later runs can load the saved local file instead of re-creating the df each time
- dropping columns/rows: 
    - filter by date: 2000-17
        - day/month/year into a DTG column, convert to correct type
    - filter by location?: maybe pick Iraq or Afghanistan to narrow the focus for our MVP
    

In [11]:
import pandas

def from_codys_csv(path='final_df.csv'):
    """Load the previously exported, pre-filtered GTD data from a CSV file.

    Args:
        path: CSV file to read. Defaults to 'final_df.csv' in the working
            directory.

    Returns:
        The DataFrame parsed by `pd.read_csv` with its default options.
        Columns are returned exactly as stored in the file, including any
        'Unnamed: N' columns (presumably index columns saved on export --
        confirm); dropping them is left to the caller.
    """
    return pd.read_csv(path)

# Replace `df` with the frame loaded from the CSV and inspect its schema.
df = from_codys_csv()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62724 entries, 0 to 62723
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        62724 non-null  int64  
 1   Unnamed: 0.1      62724 non-null  int64  
 2   eventid           62724 non-null  int64  
 3   iyear             62724 non-null  int64  
 4   imonth            62724 non-null  int64  
 5   iday              62724 non-null  int64  
 6   country_txt       62724 non-null  object 
 7   region_txt        62724 non-null  object 
 8   provstate         62724 non-null  object 
 9   city              62552 non-null  object 
 10  latitude          62344 non-null  float64
 11  longitude         62344 non-null  float64
 12  success           62724 non-null  int64  
 13  suicide           62724 non-null  int64  
 14  attacktype1       62724 non-null  int64  
 15  attacktype1_txt   62724 non-null  object 
 16  targtype1         62724 non-null  int64 

In [12]:
# Peek at the first row of the loaded frame.
df.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,eventid,iyear,imonth,iday,country_txt,region_txt,provstate,city,...,claimed,weaptype1_txt,weapsubtype1_txt,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property
0,0,71653,200101010004,2001,1,1,turkey,Middle East & North Africa,Istanbul,Istanbul,...,0.0,Explosives,Pipe Bomb,0.0,0.0,0.0,10.0,0.0,0.0,1


In [15]:
# List all column labels currently in `df`.
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'eventid', 'iyear', 'imonth', 'iday',
       'country_txt', 'region_txt', 'provstate', 'city', 'latitude',
       'longitude', 'success', 'suicide', 'attacktype1', 'attacktype1_txt',
       'targtype1', 'targtype1_txt', 'targsubtype1', 'targsubtype1_txt',
       'corp1', 'target1', 'natlty1_txt', 'gname', 'claimed', 'weaptype1_txt',
       'weapsubtype1_txt', 'nkill', 'nkillus', 'nkillter', 'nwound',
       'nwoundus', 'nwoundte', 'property'],
      dtype='object')

In [16]:
# Remove the 'Unnamed: *' columns that came along with the CSV
# (presumably index columns saved on export -- confirm).
# errors='ignore' keeps this cell safe to re-run: without it, a second run
# raises KeyError because the columns are already gone from `df`.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
df.head(1)

Unnamed: 0,eventid,iyear,imonth,iday,country_txt,region_txt,provstate,city,latitude,longitude,...,claimed,weaptype1_txt,weapsubtype1_txt,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property
0,200101010004,2001,1,1,turkey,Middle East & North Africa,Istanbul,Istanbul,41.106178,28.689863,...,0.0,Explosives,Pipe Bomb,0.0,0.0,0.0,10.0,0.0,0.0,1


In [17]:
# Count missing values per column.
df.isna().sum()

eventid                0
iyear                  0
imonth                 0
iday                   0
country_txt            0
region_txt             0
provstate              0
city                 172
latitude             380
longitude            380
success                0
suicide                0
attacktype1            0
attacktype1_txt        0
targtype1              0
targtype1_txt          0
targsubtype1        4185
targsubtype1_txt    4185
corp1               5889
target1               85
natlty1_txt          902
gname                  0
claimed                0
weaptype1_txt          0
weapsubtype1_txt    4719
nkill               2506
nkillus              264
nkillter            1608
nwound              4338
nwoundus             412
nwoundte            2567
property               0
dtype: int64

In [18]:
# Show the first five rows.
df.head(5)

Unnamed: 0,eventid,iyear,imonth,iday,country_txt,region_txt,provstate,city,latitude,longitude,...,claimed,weaptype1_txt,weapsubtype1_txt,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property
0,200101010004,2001,1,1,turkey,Middle East & North Africa,Istanbul,Istanbul,41.106178,28.689863,...,0.0,Explosives,Pipe Bomb,0.0,0.0,0.0,10.0,0.0,0.0,1
1,200101030001,2001,1,3,turkey,Middle East & North Africa,Istanbul,Istanbul,41.106178,28.689863,...,0.0,Explosives,Suicide (carried bodily by human being),3.0,0.0,1.0,7.0,0.0,0.0,1
2,200101060005,2001,1,6,somalia,Sub-Saharan Africa,Bakool,Tayeeglow,4.010857,44.513226,...,0.0,Firearms,Automatic or Semi-Automatic Rifle,,0.0,0.0,,0.0,,1
3,200101070003,2001,1,7,turkey,Middle East & North Africa,Istanbul,Istanbul,41.106178,28.689863,...,0.0,Firearms,Unknown Gun Type,0.0,0.0,0.0,1.0,0.0,0.0,1
4,200101070007,2001,1,7,afghanistan,South Asia,Bamyan,Yakawlang,34.733333,66.966667,...,0.0,Firearms,Unknown Gun Type,150.0,0.0,0.0,,0.0,0.0,-9
