In [67]:
from bs4 import BeautifulSoup
import requests 
import re
import pandas as pd

## First part - get Data: Basic functions that will be used 
Functions include:
getPagesToCrawl - a function that creates a list of pages that should be used in order to achieve the full data.
getTableDataFromPage - from a single page we get back a table that stores the data
cleanHtmlCode - the first step that we take in order to clean our data, we remove the HTML from it and leave the info.
createDf - gets 8 lists that contains data and return a df

In [82]:
def getPagesToCrawl():
    baseUrl="https://www.start.umd.edu/gtd/search/Results.aspx?page="
    amountPages = "&count=2000" 
    pageList = []
    
    for currentPageNumber in range(1,102):
        pageList.append(baseUrl + str(currentPageNumber) +amountPages)
        
    return pageList

In [83]:
def getTableDataFromPage(pageUrl):
    user_agent = {'User-agent': 'Mozilla/5.0'}
    response = requests.get(pageUrl,headers=user_agent)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find('table')
    return table

In [84]:
def cleanHtmlCode(tableData):
    GTD_ID = []
    DATE = []
    COUNTRY = []
    CITY = [] 
    PERPETRATOR_GROUP = []
    FATALITIES = []
    INJURED = []
    TARGET_TYPE = []

    listRows = []
    for tr in tableData.findAll("tr",attrs={}):
        for td in tr:
            if(td != "\n"):
                listRows.append(td.text)


    # A lot of data that we don't need (The name of the columns (8)) 
    for i in range(8):
        listRows.pop(0)
    
    for i in range(len(listRows)):
        iMod = i%8
        if(iMod == 0):
            GTD_ID.append(listRows[i])
        elif(iMod == 1):
            DATE.append(listRows[i])
        elif(iMod == 2):
            COUNTRY.append(listRows[i])
        elif(iMod == 3):
            CITY.append(listRows[i])
        elif(iMod == 4):
            PERPETRATOR_GROUP.append(listRows[i])
        elif(iMod == 5):
            FATALITIES.append(listRows[i])
        elif(iMod == 6):
            INJURED.append(listRows[i])
        elif(iMod == 7):
            TARGET_TYPE.append(listRows[i])
       
    
    return GTD_ID,DATE,COUNTRY,CITY,PERPETRATOR_GROUP,FATALITIES,INJURED,TARGET_TYPE


In [85]:
def createDf(GTD_ID,DATE,COUNTRY,CITY,PERPETRATOR_GROUP,FATALITIES,INJURED,TARGET_TYPE):
    df = pd.DataFrame({"GTD_ID":GTD_ID,
                   "DATE":DATE,
                   "COUNTRY":COUNTRY,
                   "CITY":CITY,
                   "PERPETRATOR_GROUP":PERPETRATOR_GROUP,
                   "FATALITIES":FATALITIES
                   ,"INJURED":INJURED,
                   "TARGET_TYPE":TARGET_TYPE,
                   })
    return df

## First part - get Data: Flow

In [86]:
# 8 Lists that will store the data.
GTD_ID = []
DATE = []
COUNTRY = []
CITY = [] 
PERPETRATOR_GROUP = []
FATALITIES = []
INJURED = []
TARGET_TYPE = []

In [None]:
# A page list which contains the proper urls.
pageList = getPagesToCrawl()

for page in pageList:
    tableData = getTableDataFromPage(page)
    
    GTD_ID_temp,DATE_temp,COUNTRY_temp,CITY_temp,PERPETRATOR_GROUP_temp,FATALITIES_temp,INJURED_temp,TARGET_TYPE_temp = cleanHtmlCode(tableData)
    
    GTD_ID.extend(GTD_ID_temp)
    DATE.extend(DATE_temp)
    COUNTRY.extend(COUNTRY_temp)
    CITY.extend(CITY_temp)
    PERPETRATOR_GROUP.extend(PERPETRATOR_GROUP_temp)
    FATALITIES.extend(FATALITIES_temp)
    INJURED.extend(INJURED_temp)
    TARGET_TYPE.extend(TARGET_TYPE_temp)
    

df = pd.DataFrame({"GTD_ID":GTD_ID,
                   "DATE":DATE,
                   "COUNTRY":COUNTRY,
                   "CITY":CITY,
                   "PERPETRATOR_GROUP":PERPETRATOR_GROUP,
                   "FATALITIES":FATALITIES
                   ,"INJURED":INJURED,
                   "TARGET_TYPE":TARGET_TYPE,
                   }) 

df.to_csv('GTD_Data_Frame.csv')

In [72]:
pd.read_csv('myDataFrame.csv', header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,,GTD_ID,DATE,COUNTRY,CITY,PERPETRATOR_GROUP,FATALITIES,INJURED,TARGET_TYPE
1,0.0,201912310033,2019-12-31,China,Hong Kong,Unknown,0,0,Government (General)
2,1.0,201912310032,2019-12-31,India,Bagiot Dora,Unknown,0,1,Private Citizens & Property
3,2.0,201912310031,2019-12-31,Sudan,El Geneina,Unknown,2,0,"Government (General),Police"
4,3.0,201912310030,2019-12-31,Sudan,El Geneina,Unknown,2,1,Police
...,...,...,...,...,...,...,...,...,...
1996,1995.0,201909220009,2019-09-22,India,Titagarh,Unknown,0,1,Private Citizens & Property
1997,1996.0,201909220008,2019-09-22,Nigeria,Otan Ife,Fulani extremists,0,2,Transportation
1998,1997.0,201909220007,2019-09-22,Philippines,Kayaga,Unknown,2,2,"Government (General),Private Citizens & Property"
1999,1998.0,201909220006,2019-09-22,Colombia,Sardinata,Revolutionary Armed Forces of Colombia (FARC) ...,0,16,"Police,Private Citizens & Property"
