# Read and clean a single file

In [1]:
import os
import sys
import logging
import pandas as pd

from pandas_profiling import ProfileReport


# Project root: the parent of the directory the notebook is launched from.
# If this does not resolve correctly, fall back to os.getcwd() and locate the
# project directory inside that path with str.rfind.
project_dir = os.path.dirname(os.path.abspath('.'))

# Make the local helper modules under src/data importable. The membership
# check keeps sys.path free of duplicate entries when this cell is re-run.
src_data_dir = os.path.join(project_dir, 'src', 'data')
if src_data_dir not in sys.path:
    sys.path.append(src_data_dir)

from data_manipulator import get_input_data

# Storage location used by the cells below: bucket name and the top-level
# folder inside it.
bucket, directory = 'ctovar_espbigdata', 'BigData'

## Read Metadata
-----
General info for each relevant column

In [2]:
# Build the gs:// URI of the metadata file from the bucket and folder names,
# then read it (semicolon-separated, latin1-encoded).
metadata_uri_template = 'gs://{}/{}/data/metadatos-llamadas-urg-y-emer.csv'
file = metadata_uri_template.format(bucket, directory)
df_metadata = pd.read_csv(file, sep=';', encoding='latin1')
df_metadata

Unnamed: 0,NOMBRE,DESCRIPCION
0,FECHA_INCIDENTE,Es la fecha el cual se registra la llamada del...
1,FECHA_INICIO_DESPLAZAMIENTO_MOVIL,Es la fecha el cual se inicia el desplazamient...
2,CODIGO LOCALIDAD,Es el código de las 20 localidades de la ciuda...
3,LOCALIDAD,Es la localidad donde sucede el incidente.
4,EDAD,La edad del paciente.
5,UNIDAD,"La descripción de la edad si es en horas, días..."
6,GENERO,Es la distinción de genero del paciente
7,RED,Es la localización a nivel bogota de la red de...
8,TIPO_INCIDENTE,Es la descripción inicial que tipifica el cent...
9,PRIORIDAD,Es la tipificación según la prioridad del inci...


The final data must contain these, and only these, columns with these exact headers in order to standardize the data. This is a kind of data dictionary, but it must also include the data type (**schema**) of each field (**column**).

## Get raw data

In [3]:
# Show the signature and docstring of the loader. help() is plain Python, so
# this cell also works when the notebook is exported or run as a script,
# unlike the IPython-only `name?` syntax.
help(get_input_data)

[0;31mSignature:[0m
[0mget_input_data[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mbucket[0m[0;34m=[0m[0;34m'esp-big-data'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minitial_directory[0m[0;34m=[0m[0;34m'BigData'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfilename[0m[0;34m=[0m[0;34m'datos-abiertos-agosto-2019.csv'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Read a csv file in a bucket of GCS, the file must use latin1 encoding and the separator is a semicolon (;)

Args:
    bucket (str, optional): Name of the bucket. Defaults to 'esp-big-data'.
    initial_directory (str, optional): project directory. Defaults to 'BigData'.
    filename (str, optional):csv file to read. Defaults to 'datos-abiertos-agosto-2019.csv'.

Returns:
    pandas.dataframe: dataframe with the raw data
[0;31mFile:[0m      ~/ESEIT_BigData/src/data/data_manipulator.py
[0;31mType:[0m      function


In [10]:
# Load the January 2020 file from the configured bucket and folder.
raw_data = get_input_data(
    bucket=bucket,
    initial_directory=directory,
    filename='datos-abiertos-enero_2020.csv',
)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15304 entries, 0 to 15303
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   NUMERO_INCIDENTE     15304 non-null  object 
 1   FECHA_DESPACHO_518   15304 non-null  object 
 2   CODIGO DE LOCALIDAD  15304 non-null  int64  
 3   LOCALIDAD            15304 non-null  object 
 4   EDAD                 15304 non-null  int64  
 5   UNIDAD               15304 non-null  object 
 6   GENERO               15304 non-null  object 
 7   RED                  15304 non-null  object 
 8   TIPO_INCIDENTE       15304 non-null  object 
 9   PRIORIDAD            15304 non-null  int64  
 10  Unnamed: 10          0 non-null      float64
 11  Unnamed: 11          0 non-null      float64
 12  Unnamed: 12          0 non-null      float64
 13  Unnamed: 13          0 non-null      float64
 14  Unnamed: 14          0 non-null      float64
 15  Unnamed: 15          0 non-null     

## Drop "Unnamed" Columns

In [11]:
# The raw file carries empty, header-less columns that pandas labels
# "Unnamed: N" (see the info() output above). Select them by name instead of
# hard-coding positions 10..36, so the cell keeps working when it is re-run
# (nothing left to drop) or when a file has a different number of them.
list_cols_to_drop = [
    column for column in raw_data.columns if str(column).startswith('Unnamed: ')
]
raw_data = raw_data.drop(columns=list_cols_to_drop)
raw_data.head()

Unnamed: 0,NUMERO_INCIDENTE,FECHA_DESPACHO_518,CODIGO DE LOCALIDAD,LOCALIDAD,EDAD,UNIDAD,GENERO,RED,TIPO_INCIDENTE,PRIORIDAD
0,CRU-00000002-20,2020-01-01 00:13:40,19,Ciudad Bolívar,0,SIN_DATO,SIN_DATO,Sur,Heridos,2
1,CRU-00000002-20,2020-01-01 00:13:40,19,Ciudad Bolívar,0,SIN_DATO,SIN_DATO,Sur,Heridos,2
2,CRU-00000005-20,2020-01-01 00:15:14,14,Los Mártires,0,SIN_DATO,SIN_DATO,Norte,Inconsciente/Paro Cardiorrespiratorio,1
3,CRU-00000024-20,2020-01-01 00:20:50,11,Suba,58,Años,MASCULINO,Norte,Inconsciente/Paro Cardiorrespiratorio,1
4,CRU-00000036-20,2020-01-01 00:26:05,10,Engativá,0,SIN_DATO,SIN_DATO,Norte,Inconsciente/Paro Cardiorrespiratorio,1


It seems that this file already contains the required fields, but some of them need to be renamed.

### Rename Columns

In [18]:
# Headers must not contain white space, to avoid errors later in the pipeline.
# A fixed list of steps like this one is sometimes called an ETL "recipe".
# rename() returns a new frame; reassigning it (rather than inplace=True)
# keeps the cell safe to re-run and allows method chaining.
raw_data = raw_data.rename(
    columns={
        'CODIGO DE LOCALIDAD': 'CODIGO_LOCALIDAD',
    },
)
raw_data.head()

Unnamed: 0,NUMERO_INCIDENTE,FECHA_DESPACHO_518,CODIGO_LOCALIDAD,LOCALIDAD,EDAD,UNIDAD,GENERO,RED,TIPO_INCIDENTE,PRIORIDAD
0,CRU-00000002-20,2020-01-01 00:13:40,19,Ciudad Bolívar,0,SIN_DATO,SIN_DATO,Sur,Heridos,2
1,CRU-00000002-20,2020-01-01 00:13:40,19,Ciudad Bolívar,0,SIN_DATO,SIN_DATO,Sur,Heridos,2
2,CRU-00000005-20,2020-01-01 00:15:14,14,Los Mártires,0,SIN_DATO,SIN_DATO,Norte,Inconsciente/Paro Cardiorrespiratorio,1
3,CRU-00000024-20,2020-01-01 00:20:50,11,Suba,58,Años,MASCULINO,Norte,Inconsciente/Paro Cardiorrespiratorio,1
4,CRU-00000036-20,2020-01-01 00:26:05,10,Engativá,0,SIN_DATO,SIN_DATO,Norte,Inconsciente/Paro Cardiorrespiratorio,1


### Check values per field
--------
1. LOCALIDAD
2. GENERO

In [12]:
# Tally each distinct LOCALIDAD value; dropna=False keeps missing values in
# the count.
localidad_counts = raw_data['LOCALIDAD'].value_counts(dropna=False)
localidad_counts

 Kennedy              2116
 Engativá             1473
 Suba                 1465
 Bosa                 1072
 Ciudad Bolívar       1022
 Puente Aranda         942
 Fontibón              812
Usaquén                786
 San Cristóbal         746
Rafael Uribe Uribe     737
Usme                   678
Tunjuelito             550
Santa Fe               541
 Teusaquillo           527
 Barrios Unidos        506
Chapinero              498
 Los Mártires          446
 Antonio Nariño        320
 La Candelaria          64
Sumapaz                  3
Name: LOCALIDAD, dtype: int64

It seems that some values have a white space at the beginning, and probably at the end too. Let's clean them all.

In [13]:
# Preview the counts after trimming leading/trailing white space.
# Series.str.strip() applies Python's str.strip() method to every value and,
# unlike apply(lambda x: x.strip()), leaves missing values (NaN) untouched
# instead of raising AttributeError on them.
raw_data['LOCALIDAD'].str.strip().value_counts()

Kennedy               2116
Engativá              1473
Suba                  1465
Bosa                  1072
Ciudad Bolívar        1022
Puente Aranda          942
Fontibón               812
Usaquén                786
San Cristóbal          746
Rafael Uribe Uribe     737
Usme                   678
Tunjuelito             550
Santa Fe               541
Teusaquillo            527
Barrios Unidos         506
Chapinero              498
Los Mártires           446
Antonio Nariño         320
La Candelaria           64
Sumapaz                  3
Name: LOCALIDAD, dtype: int64

In [14]:
# Persist the trimmed values by assigning the result back to the same column.
# .str.strip() is NaN-safe, whereas apply(lambda x: x.strip()) fails on
# missing values.
raw_data['LOCALIDAD'] = raw_data['LOCALIDAD'].str.strip()

In [15]:
# Quick look at the first five rows after cleaning LOCALIDAD.
raw_data.head(n=5)

Unnamed: 0,NUMERO_INCIDENTE,FECHA_DESPACHO_518,CODIGO DE LOCALIDAD,LOCALIDAD,EDAD,UNIDAD,GENERO,RED,TIPO_INCIDENTE,PRIORIDAD
0,CRU-00000002-20,2020-01-01 00:13:40,19,Ciudad Bolívar,0,SIN_DATO,SIN_DATO,Sur,Heridos,2
1,CRU-00000002-20,2020-01-01 00:13:40,19,Ciudad Bolívar,0,SIN_DATO,SIN_DATO,Sur,Heridos,2
2,CRU-00000005-20,2020-01-01 00:15:14,14,Los Mártires,0,SIN_DATO,SIN_DATO,Norte,Inconsciente/Paro Cardiorrespiratorio,1
3,CRU-00000024-20,2020-01-01 00:20:50,11,Suba,58,Años,MASCULINO,Norte,Inconsciente/Paro Cardiorrespiratorio,1
4,CRU-00000036-20,2020-01-01 00:26:05,10,Engativá,0,SIN_DATO,SIN_DATO,Norte,Inconsciente/Paro Cardiorrespiratorio,1


In [16]:
# Same clean-up for GENERO: trim surrounding white space, then inspect the
# distinct values (missing values included). .str.strip() is used because it
# does not fail on NaN.
col = 'GENERO'
raw_data[col] = raw_data[col].str.strip()
raw_data[col].value_counts(dropna=False)

SIN_DATO     7003
MASCULINO    4521
FEMENINO     3780
Name: GENERO, dtype: int64

In [17]:
# Same clean-up for TIPO_INCIDENTE: trim surrounding white space, then inspect
# the distinct values (missing values included). .str.strip() is used because
# it does not fail on NaN.
col = 'TIPO_INCIDENTE'
raw_data[col] = raw_data[col].str.strip()
raw_data[col].value_counts(dropna=False)

Heridos                                  6849
Inconsciente/Paro Cardiorrespiratorio    1520
Dificultad Respiratoria                  1359
Enfermo                                  1045
Convulsiones                              994
Trastorno mental                          661
Dolor torácico                            634
Intento de suicidio                       503
Accidente cerebro vascular                374
Patología Gineco - obstétrica             331
Síntomas gastrointestinales               331
Caída                                     320
Intoxicaciones                            138
Ideas de suicidio                          99
SIN_DATO                                   80
Violencia Sexual                           34
Quemaduras                                 10
Sangrado Vaginal                            8
Maltrato                                    7
Electrocución / Rescate                     7
Name: TIPO_INCIDENTE, dtype: int64

This column may contain null values, a.k.a. NaN (a special floating-point value that pandas uses to mark missing data). Let's replace them first:
* NaN --> SIN_DATO

In [19]:
# Replace missing values with the 'SIN_DATO' marker and store the result;
# fillna() returns a new Series, so without the assignment the replacement
# would only be displayed and then discarded. The column is named explicitly
# so the cell does not depend on a `col` variable left over from another cell.
raw_data['TIPO_INCIDENTE'] = raw_data['TIPO_INCIDENTE'].fillna('SIN_DATO')
raw_data['TIPO_INCIDENTE'].value_counts()

Heridos                                  6849
Inconsciente/Paro Cardiorrespiratorio    1520
Dificultad Respiratoria                  1359
Enfermo                                  1045
Convulsiones                              994
Trastorno mental                          661
Dolor torácico                            634
Intento de suicidio                       503
Accidente cerebro vascular                374
Patología Gineco - obstétrica             331
Síntomas gastrointestinales               331
Caída                                     320
Intoxicaciones                            138
Ideas de suicidio                          99
SIN_DATO                                   80
Violencia Sexual                           34
Quemaduras                                 10
Sangrado Vaginal                            8
Maltrato                                    7
Electrocución / Rescate                     7
Name: TIPO_INCIDENTE, dtype: int64

# Convert date columns to the datetime data type
________________________________________________

In [20]:
# Parse the FECHA_DESPACHO_518 text column into pandas datetimes, then check
# the resulting dtypes.
date_column = "FECHA_DESPACHO_518"
raw_data[date_column] = pd.to_datetime(raw_data[date_column])
raw_data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15304 entries, 0 to 15303
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   NUMERO_INCIDENTE    15304 non-null  object        
 1   FECHA_DESPACHO_518  15304 non-null  datetime64[ns]
 2   CODIGO_LOCALIDAD    15304 non-null  int64         
 3   LOCALIDAD           15304 non-null  object        
 4   EDAD                15304 non-null  int64         
 5   UNIDAD              15304 non-null  object        
 6   GENERO              15304 non-null  object        
 7   RED                 15304 non-null  object        
 8   TIPO_INCIDENTE      15304 non-null  object        
 9   PRIORIDAD           15304 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 1.2+ MB


# Profile of the Data
------
First install pandas-profiling from a terminal:

````
pip install pandas-profiling
````

In [21]:
# Build the profiling report and export it as HTML under <project>/reports.
# The path is assembled with os.path.join, and the folder is created first so
# the export does not depend on it already existing.
reports_dir = os.path.join(project_dir, 'reports')
os.makedirs(reports_dir, exist_ok=True)

profile = ProfileReport(df=raw_data)
profile.to_file(
    output_file=os.path.join(reports_dir, 'profile_raw_data_enero_2020.html')
)

Summarize dataset:   0%|          | 0/23 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

From the initial report of the file we see that there are duplicate rows in the table; we need to remove them.

### Remove duplicates

In [25]:
# Drop fully duplicated rows. drop_duplicates() returns a new frame, which is
# reassigned instead of mutating raw_data in place. Both row counts are
# captured so the number of removed rows can be reported as well.
rows_before = raw_data.shape[0]
raw_data = raw_data.drop_duplicates()
rows_after = raw_data.shape[0]

print('Number of rows before cleaning:', rows_before)
print('Number of rows after cleaning:', rows_after)
print('Duplicate rows removed:', rows_before - rows_after)

Number of rows before cleaning: 15240
Number of rows after cleaning: 15240


### Save the final table
Use `to_csv` to save the table.

In [24]:
# Save the cleaned table locally under <project>/data/processed, using the
# same latin1 / semicolon convention the input file is read with. The folder
# is created first so the write does not fail on a fresh checkout.
processed_dir = os.path.join(project_dir, 'data', 'processed')
os.makedirs(processed_dir, exist_ok=True)

raw_data.to_csv(
    os.path.join(processed_dir, 'clean_llamadas_123_enero_2020.csv'),
    encoding='latin1',
    sep=';',
    index=False,
)