In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import statsmodels.api

# Load the three raw inputs: LFB incidents and LFB mobilisations (both ';'-separated)
# and the fire-station coordinates.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk. NOTE(review): the truncated warning shown in this cell's output looks
# like pandas' DtypeWarning (mixed types in a column) - confirm.
df1 = pd.read_csv(r'LFB+Incident+data+from+January+2017.csv', sep=';', low_memory=False)
df2 = pd.read_csv(r'LFB+Mobilisation+data+from+January+2017.csv', sep=';', low_memory=False)
df3 = pd.read_csv(r'CoordoneesFireStations.csv')

# Attach the incident attributes to every mobilisation (inner join on the incident id).
df = df2.merge(df1, on='IncidentNumber')

# Attach the coordinates of the deploying station. Columns present on both sides keep
# their name for the station file and receive a '_' suffix for the incident data.
df = df3.merge(df, on='DeployedFromStation_Name', suffixes=('', '_'))

# Keep only the columns used by the rest of the notebook. The explicit .copy() makes the
# result an independent frame, so the column assignments performed in later cells do not
# trigger SettingWithCopyWarning.
df = df[['IncidentNumber', 'CalYear', 'IncidentGroup', 'PropertyCategory', 'AddressQualifier', 'IncGeo_BoroughName',
         'Easting_rounded', 'Northing_rounded', 'IncidentStationGround', 'FirstPumpArriving_DeployedFromStation',
         'DateAndTimeMobilised', 'DateAndTimeMobile', 'DateAndTimeArrived', 'DeployedFromStation_Name', 'DeployedFromLocation',
         'PumpOrder', 'DelayCode_Description', 'Latitude', 'Longitude']].copy()

df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,IncidentNumber,CalYear,IncidentGroup,PropertyCategory,AddressQualifier,IncGeo_BoroughName,Easting_rounded,Northing_rounded,IncidentStationGround,FirstPumpArriving_DeployedFromStation,DateAndTimeMobilised,DateAndTimeMobile,DateAndTimeArrived,DeployedFromStation_Name,DeployedFromLocation,PumpOrder,DelayCode_Description,Latitude,Longitude
0,000006-01012017,2017,Fire,Non Residential,Correct incident location,BARKING AND DAGENHAM,544650,184550,Barking,Barking,01/01/2017 00:07,01/01/2017 00:09,01/01/2017 00:12,Barking,Home Station,1,,51.529581,0.090057
1,000285-01012017,2017,Fire,Dwelling,Correct incident location,BARKING AND DAGENHAM,546150,182450,Barking,Barking,01/01/2017 13:33,01/01/2017 13:35,01/01/2017 13:37,Barking,Home Station,1,,51.529581,0.090057
2,000520-02012017,2017,Fire,Road Vehicle,In street close to gazetteer location,BARKING AND DAGENHAM,544250,184750,Barking,Barking,02/01/2017 02:45,02/01/2017 02:50,02/01/2017 02:50,Barking,Home Station,1,,51.529581,0.090057
3,000521-02012017,2017,False Alarm,Road Vehicle,In street outside gazetteer location,REDBRIDGE,544150,184850,Barking,Barking,02/01/2017 02:45,02/01/2017 02:47,02/01/2017 02:51,Barking,Home Station,1,,51.529581,0.090057
4,000545-02012017,2017,Fire,Outdoor Structure,On land associated with building,BARKING AND DAGENHAM,546850,183850,Barking,Barking,02/01/2017 04:46,02/01/2017 04:47,02/01/2017 04:51,Barking,,1,,51.529581,0.090057


In [4]:
# Build the target variable 'ResponseTimeMinute': minutes elapsed between mobilisation
# and arrival on scene.
#
# The raw timestamps are day-first strings (e.g. '02/01/2017 02:45' belongs to incident
# 000520-02012017, i.e. 2 January 2017). pandas parses ambiguous dates month-first by
# default, which would read that value as 1 February, so dayfirst=True is required.
# DateAndTimeMobile is parsed here as well so that all three timestamp columns share the
# same interpretation; calling pd.to_datetime on them again later is a no-op.
for timestamp_col in ('DateAndTimeMobilised', 'DateAndTimeMobile', 'DateAndTimeArrived'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], dayfirst=True)

df['ResponseTimeMinute'] = (df['DateAndTimeArrived'] - df['DateAndTimeMobilised']).dt.total_seconds() / 60

In [5]:
# Clean the target variable.
#
# Diagnostics recorded by the original analysis:
#   - negative response times (ResponseTimeMinute < 0): 128 incidents (0.02 % of the data)
#   - aberrant response times (ResponseTimeMinute > 60, i.e. more than 1 h):
#     679 incidents (0.11 % of the data)
#
# Keep only the rows with 0 < ResponseTimeMinute < 60. Both bounds are strict, so
# responses of exactly 0 or exactly 60 minutes are removed too, and rows whose response
# time is missing are dropped because NaN fails both comparisons.
valid_response = (df['ResponseTimeMinute'] > 0) & (df['ResponseTimeMinute'] < 60)

# .copy() detaches the filtered rows from the unfiltered frame, so the new columns
# assigned to df in later cells do not raise SettingWithCopyWarning.
df = df[valid_response].copy()

In [6]:
# Calendar features taken from the mobilisation timestamp: hour of day, weekday
# (pandas convention: Monday=0 ... Sunday=6) and month number.
mobilised_at = pd.to_datetime(df['DateAndTimeMobilised'])
df['HourMobilised'] = mobilised_at.dt.hour
df['WeekdayMobilised'] = mobilised_at.dt.weekday
df['MonthMobilised'] = mobilised_at.dt.month

# 'Time_preparation': minutes between the mobilisation order and the crew being mobile.
# NOTE(review): if DateAndTimeMobile still holds raw strings at this point, it is parsed
# with pandas' default settings - keep that consistent with how DateAndTimeMobilised
# was parsed.
df['DateAndTimeMobile'] = pd.to_datetime(df['DateAndTimeMobile'])
df["Time_Mobilised"] = mobilised_at
preparation_delta = df['DateAndTimeMobile'] - df['DateAndTimeMobilised']
df["Time_preparation"] = preparation_delta.dt.total_seconds().div(60)

In [7]:
# Build the boolean feature 'CityCenter': True when the incident borough belongs to the
# central-borough list below, False otherwise (including boroughs missing from both lists
# and missing values). Names are upper-case, like the IncGeo_BoroughName values.
CityCenter = ['ISLINGTON', 'WESTMINSTER', 'HARINGEY', 'SOUTHWARK', 'TOWER HAMLETS', 'HACKNEY', 'NEWHAM', 'LAMBETH', 'CAMDEN',
               'WANDSWORTH', 'LEWISHAM', 'CITY OF LONDON', 'KENSINGTON AND CHELSEA', 'HAMMERSMITH AND FULHAM']

# Remaining boroughs, listed for reference only; this cell does not use the list.
Peripherie = ['GREENWICH', 'BARKING AND DAGENHAM', 'EALING', 'BRENT', 'REDBRIDGE', 'CROYDON', 'HARROW', 'ENFIELD', 'BROMLEY',
              'BEXLEY', 'HAVERING', 'MERTON', 'HOUNSLOW', 'BARNET', 'HILLINGDON', 'SUTTON', 'WALTHAM FOREST', 
              'KINGSTON UPON THAMES', 'RICHMOND UPON THAMES']

# Vectorised membership test; replaces a per-row Python lambda with the same result.
df['CityCenter'] = df['IncGeo_BoroughName'].isin(CityCenter)

In [8]:
# Look for missing values: NaN count per column before any cleaning.
# Only the last expression of a cell is displayed automatically, so this intermediate
# result has to be printed explicitly to be visible.
print(df.isna().sum())

# DelayCode_Description has too many missing values to be usable, so it is removed.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns='DelayCode_Description', errors='ignore')

# Drop the few remaining rows that contain a missing value in any column.
df = df.dropna(axis=0, how='any')

# Final check: every count below should now be 0.
df.isna().sum()

IncidentNumber                           0
CalYear                                  0
IncidentGroup                            0
PropertyCategory                         0
AddressQualifier                         0
IncGeo_BoroughName                       0
Easting_rounded                          0
Northing_rounded                         0
IncidentStationGround                    0
FirstPumpArriving_DeployedFromStation    0
DateAndTimeMobilised                     0
DateAndTimeMobile                        0
DateAndTimeArrived                       0
DeployedFromStation_Name                 0
DeployedFromLocation                     0
PumpOrder                                0
Latitude                                 0
Longitude                                0
ResponseTimeMinute                       0
HourMobilised                            0
WeekdayMobilised                         0
MonthMobilised                           0
Time_Mobilised                           0
Time_prepar

In [9]:
# Export the preprocessed DataFrame as a ';'-separated CSV file, without the index column.
df.to_csv(path_or_buf='Preprocessing.csv', index=False, sep=';')