### Data Cleaning & Interactive Folium Maps for Final Phases: Dhaka Tribune

In [1]:
#!pip install -U spacy
#!python -m spacy download en
#!pip install geopy

In [2]:
# for manipulating dataframes
import pandas as pd
import numpy as np
# for natural language processing: named entity recognition
import spacy
from collections import Counter
import en_core_web_sm
# Load the small English spaCy pipeline once; `nlp` is reused for every headline below.
nlp = en_core_web_sm.load()
# for visualizations
%matplotlib inline

# Geocoder used to turn place names into coordinates.
# NOTE(review): "jjj" looks like a placeholder user agent; Nominatim's usage policy
# presumably expects a string identifying this application -- confirm and replace.
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="jjj")

In [3]:
# Read the scraped Dhaka Tribune accident reports and parse the accident date
# column into proper datetimes so later cells can use date arithmetic on it.
df = pd.read_csv("../input/dhakatribunecomplete/Dhaka Tribune Complete Data.csv")
df['Accident date'] = df['Accident date'].pipe(pd.to_datetime)
df

Unnamed: 0.1,Unnamed: 0,Newspaper Name,Accident date,Header,News title,Deaths,Injured
0,0,Dhaka Tribune,2021-06-26,Two labourers killed in Dhaka road accident,2 killed in Dhaka road crash,2.0,2
1,1,Dhaka Tribune,2021-06-23,Three killed in Muktagacha road accident,Three killed in Muktagacha road...,3.0,0
2,2,Dhaka Tribune,2021-06-19,Truck-pickup van collision leaves 3 dead in Na...,Truck-pickup van collision leaves 3...,3.0,0
3,3,Dhaka Tribune,2021-06-19,"Couple, infant grandson killed in Bogra bus-au...",Bogra road crash kills 3,3.0,0
4,4,Dhaka Tribune,2021-06-19,Policeman killed in Dhaka road accident,Cop killed in Dhaka road crash,1.0,0
...,...,...,...,...,...,...,...
771,836,Dhaka Tribune,2016-08-04,Two motorcyclists killed in Manikganj road acc...,Two motorcyclists killed in Manikganj...,2.0,0
772,837,Dhaka Tribune,2016-07-16,Four die in Manikganj road accident,Four die in Manikganj road,4.0,0
773,838,Dhaka Tribune,2016-07-15,BNP leader hurt in Dinajpur road crash,BNP leader hurt in Dinajpur road...,0.0,1
774,839,Dhaka Tribune,2016-06-25,Foreigners among 25 injured in Ctg road accident,Foreigners among 25 injured in Ctg road...,0.0,25


In [4]:
%%time

# For every headline, collect the entities spaCy tags as GPE (geo-political
# entity), rank them by frequency and keep only the alphabetic characters of
# the stringified ranking. Several places in one headline therefore end up
# concatenated (e.g. "RangpurFeni"); the text-cleaning cell untangles those.
col_value = []

for header in df['Header']:
    doc = nlp(str(header))
    gpe_names = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
    ranked = Counter(gpe_names).most_common(10)
    letters_only = ''.join(ch for ch in str(ranked) if ch.isalpha())
    col_value.append(letters_only)

df['Location_appox'] = col_value

CPU times: user 4.92 s, sys: 6.93 ms, total: 4.92 s
Wall time: 4.92 s


In [5]:
# Same assignment as the last line of the previous cell; running it again leaves the column unchanged.
df['Location_appox'] = col_value

### Text Cleaning

In [7]:
# Headlines without a recognised place produce an empty string: treat as missing.
# Then map abbreviations and concatenated multi-place strings onto a single
# canonical place name (exact-value matches only, no regex).
location_fixes = {
    'RangpurFeni': 'Rangpur',
    'Ctg': 'Chittagong',
    'Cox': "Cox's Bazar",
    'JessoreDinajpurNilphamari': 'Jessore',
    'ComillaKhulna': 'Cumilla',
    'DhakaGabtoli': 'Dhaka',
    'BograJhenaidah': 'Bogra',
    'ElderlyDhaka': 'Dhaka',
}
df['Location_appox'] = df['Location_appox'].replace('', np.nan).replace(location_fixes)

# Entities that spaCy tagged as GPE but that are not usable locations inside
# Bangladesh (person names, organisations, the country itself, foreign places).
X = ['Elderly', 'Ahona', 'Rasel', 'Wasa', 'Bangladesh', 'India', 'Utah', "California", "Macedonia", "Nepal", 'SaudiArabia', "NewYork", "Uganda", "Oman", "Egypt", "Canada", "Mozambique",
     'BangladeshUSA', 'Australia']

# np.nan (lower case) is used because the np.NaN alias was removed in NumPy 2.0.
df.loc[df['Location_appox'].isin(X), 'Location_appox'] = np.nan

In [8]:
# Number of missing values per column after cleaning.
df.isna().sum()

Unnamed: 0          0
Newspaper Name      0
Accident date       0
Header              0
News title          0
Deaths             14
Injured             0
Location_appox    488
dtype: int64

In [9]:
# Distinct approximate locations (the missing-value marker included) to probe the geocoder with.
a = df['Location_appox'].unique().tolist()

# Filled by loc_errors() with one [latitude, longitude] pair per resolved location.
coordinates = list()

def loc_errors(a):
    """Geocode every name in ``a`` and return the ones that could not be resolved.

    Each name is looked up as "<name>, Bangladesh" through the module-level
    ``geolocator``. For every successful lookup a ``[latitude, longitude]``
    pair is appended to the module-level ``coordinates`` list.

    Parameters
    ----------
    a : iterable
        Candidate location names. Non-string entries (such as NaN) are not
        sent to the geocoder and are reported as errors.

    Returns
    -------
    list
        Entries of ``a`` that were not strings, that the geocoder could not
        find, or whose lookup raised an exception, in input order.
    """
    list_of_error_loc = []
    for loc in a:
        # Missing values cannot be concatenated with the country suffix;
        # report them explicitly instead of relying on a swallowed TypeError.
        if not isinstance(loc, str):
            list_of_error_loc.append(loc)
            continue
        try:
            location = geolocator.geocode(loc + ", Bangladesh")
        except Exception:
            # Best-effort probe: any failure of the lookup itself (network,
            # service error, timeout) just marks this location as unresolved.
            list_of_error_loc.append(loc)
            continue
        # geocode() returned no match for this query.
        if location is None:
            list_of_error_loc.append(loc)
            continue
        coordinates.append([location.latitude, location.longitude])
    return list_of_error_loc

# Probe the geocoder with every distinct location (timed) and show the entries that could not be resolved.
%time list_of_error_loc = loc_errors(a)
list_of_error_loc

CPU times: user 122 ms, sys: 10.2 ms, total: 132 ms
Wall time: 25.3 s


[nan]

In [10]:
# Keep only the columns needed for mapping and discard rows where any of them is missing.
df = df.loc[:, ['Accident date', 'Header', 'Location_appox']].dropna()

In [11]:
# Qualify every place name with the country so the geocoder does not match
# same-named places abroad. The suffix is only added where it is not already
# present, so re-running this cell does not append it a second time.
suffix = ', Bangladesh'
needs_suffix = ~df['Location_appox'].str.endswith(suffix, na=False)
df.loc[needs_suffix, 'Location_appox'] = df.loc[needs_suffix, 'Location_appox'] + suffix
df = df.reset_index(drop = True)
df

Unnamed: 0,Accident date,Header,Location_appox
0,2021-06-26,Two labourers killed in Dhaka road accident,"Dhaka, Bangladesh"
1,2021-06-19,Truck-pickup van collision leaves 3 dead in Na...,"Narsingdi, Bangladesh"
2,2021-06-19,Policeman killed in Dhaka road accident,"Dhaka, Bangladesh"
3,2021-06-18,3 killed in bus-car collision in Comilla,"Comilla, Bangladesh"
4,2021-06-16,"6 die in road accidents in Dhaka, Gazipur","Dhaka, Bangladesh"
...,...,...,...
283,2016-08-15,Road accident kills 1 in Natore,"Natore, Bangladesh"
284,2016-08-04,Two motorcyclists killed in Manikganj road acc...,"Manikganj, Bangladesh"
285,2016-07-16,Four die in Manikganj road accident,"Manikganj, Bangladesh"
286,2016-07-15,BNP leader hurt in Dinajpur road crash,"Dinajpur, Bangladesh"


### Using GeoPy to retrieve the coordinates of the locations

In [12]:
%%time

# Resolve every row's place name to an address and coordinates.
# Each distinct place is sent to the geocoder only once and the answer is
# reused for the remaining rows; requests are spaced at least one second apart.
from geopy.extra.rate_limiter import RateLimiter

geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

geocoded = {}      # place name -> geocoder result (None when nothing was found)
loc_bengali = []
lat = []
lon = []

for place in df.Location_appox.values:
    print(place)
    if place not in geocoded:
        geocoded[place] = geocode(place)
    location = geocoded[place]
    if location is None:
        # No match: keep the row aligned with df and mark it as missing.
        # The mapping cell drops rows with missing coordinates.
        loc_bengali.append(None)
        lat.append(np.nan)
        lon.append(np.nan)
        continue
    loc_bengali.append(location.address)
    lat.append(location.latitude)
    lon.append(location.longitude)

Dhaka, Bangladesh
Narsingdi, Bangladesh
Dhaka, Bangladesh
Comilla, Bangladesh
Dhaka, Bangladesh
Brahmanbaria, Bangladesh
Gulshan, Bangladesh
Chittagong, Bangladesh
Rajshahi, Bangladesh
Dhaka, Bangladesh
Comilla, Bangladesh
Comilla, Bangladesh
Dhaka, Bangladesh
Brahmanbaria, Bangladesh
Chittagong, Bangladesh
Narsingdi, Bangladesh
Bogra, Bangladesh
Chittagong, Bangladesh
Jessore, Bangladesh
Ashulia, Bangladesh
Narail, Bangladesh
Chapainawabganj, Bangladesh
Chittagong, Bangladesh
Jhenaidah, Bangladesh
Teknaf, Bangladesh
Dhaka, Bangladesh
Narayanganj, Bangladesh
Dhaka, Bangladesh
Dhaka, Bangladesh
Dhaka, Bangladesh
Jhenaidah, Bangladesh
Dhaka, Bangladesh
Dhaka, Bangladesh
Habiganj, Bangladesh
Chandpur, Bangladesh
Bogra, Bangladesh
Chapainawabganj, Bangladesh
Chapainawabganj, Bangladesh
Dhaka, Bangladesh
Kushtia, Bangladesh
Chandpur, Bangladesh
Dhaka, Bangladesh
Dhaka, Bangladesh
Khulna, Bangladesh
Dhaka, Bangladesh
Comilla, Bangladesh
Dhaka, Bangladesh
Magura, Bangladesh
Habiganj, Banglade

In [13]:
# Attach the geocoder's address and coordinates to the frame, one value per row.
df = df.assign(location=loc_bengali, Latitude=lat, Longitude=lon)

In [14]:
# Inspect the frame with the geocoded address and coordinates attached.
df

Unnamed: 0,Accident date,Header,Location_appox,location,Latitude,Longitude
0,2021-06-26,Two labourers killed in Dhaka road accident,"Dhaka, Bangladesh","ঢাকা, Chanpara Bazar, ঢাকা জেলা, ঢাকা বিভাগ, 1...",23.810651,90.412647
1,2021-06-19,Truck-pickup van collision leaves 3 dead in Na...,"Narsingdi, Bangladesh","নরসিংদী, নরসিংদী জেলা, ঢাকা বিভাগ, 1602, বাংলাদেশ",23.915645,90.698196
2,2021-06-19,Policeman killed in Dhaka road accident,"Dhaka, Bangladesh","ঢাকা, Chanpara Bazar, ঢাকা জেলা, ঢাকা বিভাগ, 1...",23.810651,90.412647
3,2021-06-18,3 killed in bus-car collision in Comilla,"Comilla, Bangladesh","কুমিল্লা, কুমিল্লা জেলা, চট্টগ্রাম বিভাগ, 3500...",23.461061,91.180875
4,2021-06-16,"6 die in road accidents in Dhaka, Gazipur","Dhaka, Bangladesh","ঢাকা, Chanpara Bazar, ঢাকা জেলা, ঢাকা বিভাগ, 1...",23.810651,90.412647
...,...,...,...,...,...,...
283,2016-08-15,Road accident kills 1 in Natore,"Natore, Bangladesh","chakrampur,Natore,Bangladesh, N/A, N502, নাটোর...",24.411130,88.997356
284,2016-08-04,Two motorcyclists killed in Manikganj road acc...,"Manikganj, Bangladesh","মানিকগঞ্জ জেলা, ঢাকা বিভাগ, বাংলাদেশ",23.871163,89.998783
285,2016-07-16,Four die in Manikganj road accident,"Manikganj, Bangladesh","মানিকগঞ্জ জেলা, ঢাকা বিভাগ, বাংলাদেশ",23.871163,89.998783
286,2016-07-15,BNP leader hurt in Dinajpur road crash,"Dinajpur, Bangladesh","দিনাজপুর, দিনাজপুর জেলা, 5216, বাংলাদেশ",25.626071,88.634623


In [15]:
# Persist the geocoded data to the working directory, without the row index.
df.to_csv('Dhaka_Tribune.csv', index = False)

In [16]:
# Work on a copy so the mapping cells below do not modify df.
df_new = df.copy()

In [17]:
import folium
from folium.plugins import *
from folium import plugins

In [18]:
# Keep only rows that have complete data and non-zero coordinates.
# The explicit copy makes df_new an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df_new = df_new.dropna()
df_new = df_new[(df_new['Latitude'] != 0) & (df_new['Longitude'] != 0)].copy()

def map(data=None):
    """Build an interactive folium map of the geocoded accident reports.

    The map is centred on [23.6850, 90.3563] and contains a heat map of all
    report coordinates plus a marker cluster with one marker per report,
    whose popup shows the approximate location name.

    Note: the name shadows Python's built-in ``map`` for the rest of the
    notebook; it is kept unchanged so existing calls continue to work.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with ``Latitude``, ``Longitude`` and ``Location_appox`` columns.
        Defaults to the module-level ``df_new``.

    Returns
    -------
    folium.Map
        The assembled map.
    """
    if data is None:
        data = df_new

    map_bd = folium.Map(location=[23.6850, 90.3563], tiles="cartodbpositron", zoom_start=7)

    # Heat layer built from every [latitude, longitude] pair.
    heat_data = data[['Latitude', 'Longitude']].values.tolist()
    plugins.HeatMap(heat_data).add_to(map_bd)

    # Second base layer, added after the heat layer.
    # NOTE(review): no LayerControl is added, so the layers cannot be toggled
    # from the rendered map -- confirm whether that is intended.
    folium.TileLayer('cartodbdark_matter').add_to(map_bd)

    # One clustered marker per report; add_to() already attaches the cluster
    # to the map, so no further add_child() call is needed.
    incident = plugins.MarkerCluster().add_to(map_bd)
    for lat, lng, label in zip(data.Latitude, data.Longitude, data.Location_appox):
        folium.Marker(
            location=[lat, lng],
            icon=None,
            popup=label,
        ).add_to(incident)

    return map_bd

# Build the map and render it as the cell output.
map()

In [19]:
# Derive calendar columns from the accident date with pandas' vectorised
# datetime accessors (relies on 'Accident date' having been parsed to datetimes
# when the data was loaded).
import datetime  # retained from the original cell; not needed by the code below

accident_date = df_new['Accident date']

df_new["Weight"] = accident_date.dt.month.astype(float)      # month number as a float
df_new['year'] = accident_date.dt.year
df_new['month'] = accident_date.dt.month
df_new['months_in_full'] = accident_date.dt.month_name()     # English names regardless of the process locale
df_new['month_year'] = accident_date.dt.strftime('%m %Y')    # e.g. "06 2021"

df_new

Unnamed: 0,Accident date,Header,Location_appox,location,Latitude,Longitude,Weight,year,month,months_in_full,month_year
0,2021-06-26,Two labourers killed in Dhaka road accident,"Dhaka, Bangladesh","ঢাকা, Chanpara Bazar, ঢাকা জেলা, ঢাকা বিভাগ, 1...",23.810651,90.412647,6.0,2021,6,June,06 2021
1,2021-06-19,Truck-pickup van collision leaves 3 dead in Na...,"Narsingdi, Bangladesh","নরসিংদী, নরসিংদী জেলা, ঢাকা বিভাগ, 1602, বাংলাদেশ",23.915645,90.698196,6.0,2021,6,June,06 2021
2,2021-06-19,Policeman killed in Dhaka road accident,"Dhaka, Bangladesh","ঢাকা, Chanpara Bazar, ঢাকা জেলা, ঢাকা বিভাগ, 1...",23.810651,90.412647,6.0,2021,6,June,06 2021
3,2021-06-18,3 killed in bus-car collision in Comilla,"Comilla, Bangladesh","কুমিল্লা, কুমিল্লা জেলা, চট্টগ্রাম বিভাগ, 3500...",23.461061,91.180875,6.0,2021,6,June,06 2021
4,2021-06-16,"6 die in road accidents in Dhaka, Gazipur","Dhaka, Bangladesh","ঢাকা, Chanpara Bazar, ঢাকা জেলা, ঢাকা বিভাগ, 1...",23.810651,90.412647,6.0,2021,6,June,06 2021
...,...,...,...,...,...,...,...,...,...,...,...
283,2016-08-15,Road accident kills 1 in Natore,"Natore, Bangladesh","chakrampur,Natore,Bangladesh, N/A, N502, নাটোর...",24.411130,88.997356,8.0,2016,8,August,08 2016
284,2016-08-04,Two motorcyclists killed in Manikganj road acc...,"Manikganj, Bangladesh","মানিকগঞ্জ জেলা, ঢাকা বিভাগ, বাংলাদেশ",23.871163,89.998783,8.0,2016,8,August,08 2016
285,2016-07-16,Four die in Manikganj road accident,"Manikganj, Bangladesh","মানিকগঞ্জ জেলা, ঢাকা বিভাগ, বাংলাদেশ",23.871163,89.998783,7.0,2016,7,July,07 2016
286,2016-07-15,BNP leader hurt in Dinajpur road crash,"Dinajpur, Bangladesh","দিনাজপুর, দিনাজপুর জেলা, 5216, বাংলাদেশ",25.626071,88.634623,7.0,2016,7,July,07 2016


In [20]:
import folium
from folium.plugins import *
from folium import plugins

In [21]:
# Time-lapse heat map: one frame per calendar month, in chronological order.
df_new['indexx'] = df_new['months_in_full'] + '/' + df_new['year'].astype(str)
df_new['conta'] = 1   # each row counts as one report

# Build the frame labels and the frame data in the same loop so that label i
# always describes frame i. Grouping on (year, month) sorts numerically, which
# gives chronological order.
lista_index = []
weight_list = []
for _, month_rows in df_new.groupby(['year', 'month'], sort=True):
    lista_index.append(month_rows['indexx'].iloc[0])
    # [latitude, longitude, number of reports at that point] for this month
    weight_list.append(
        month_rows.groupby(['Latitude', 'Longitude'])['conta'].sum().reset_index().values.tolist()
    )

# NOTE(review): the "stamen toner" tile set may not be available in recent
# folium releases -- confirm against the installed version.
base_map = folium.Map(location=[23.6850, 90.3563], tiles="stamen toner", zoom_start=6)

# Every colour needs its own gradient stop: a repeated key in a dict literal
# silently keeps only the last value.
HeatMapWithTime(
    weight_list,
    radius=20,
    index=lista_index,
    gradient={0.1: 'blue', 0.3: 'green', 0.5: 'yellow', 0.95: 'orange', 1: 'red'},
    auto_play=True,
    min_opacity=0.5,
    max_opacity=1,
    use_local_extrema=True,
).add_to(base_map)

base_map

### References

* https://medium.com/nerd-for-tech/time-lapse-heat-maps-with-folium-1847f53ec956