# Read and clean a single file

In [18]:
import os
import sys
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport
from dateutil.parser import parse

# %load_ext autoreload

project_dir = os.path.dirname(os.path.abspath('.')) # get the path of the project, if doesn't work use os.getcwd() instead and use rfind to find the project dir
sys.path.append(os.path.join(project_dir, 'src', 'data'))

from data_manipulator import get_input_data

bucket = 'espbigdata'
directory = 'BigData'


from google.cloud import storage

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
def list_blobs(bucket_name, directory="BigData/Data/process/"):              
    
    
    storage_client= storage.Client()
    blobs=storage_client.list_blobs(
        bucket_name,
        prefix   = directory,
        delimiter='/'
    )
       
    list_files=list()
    
    for blob in blobs:
        print(blob.name)
        if '.csv' in blob.name:
            list_files.append(blob.name)
    return list_files
    

In [49]:
list_files= list_blobs(bucket_name=bucket)

BigData/Data/process/
BigData/Data/process/clean_llamadas_123_agosto_2019.csv
BigData/Data/process/clean_llamadas_123_enero_2020.csv
BigData/Data/process/clean_llamadas_123_julio_2019.csv
BigData/Data/process/clean_llamadas_123_noviembre_2019.csv
BigData/Data/process/clean_llamadas_123_octubre_2019.csv
BigData/Data/process/clean_llamadas_123_septiembre_2019.csv


In [57]:
list_data = list()

for file in list_files:
    print(file.split('/'))
    list_strings = file.split('/')
    data = get_input_data(
    bucket = bucket, 
    initial_directory = directory,
    step = 'process',
    filename = list_strings[-1]
)
list_data.append(data)
print('total Dataframe:{}'.format(len(list_data)))

['BigData', 'Data', 'process', 'clean_llamadas_123_agosto_2019.csv']
['BigData', 'Data', 'process', 'clean_llamadas_123_enero_2020.csv']
['BigData', 'Data', 'process', 'clean_llamadas_123_julio_2019.csv']
['BigData', 'Data', 'process', 'clean_llamadas_123_noviembre_2019.csv']
['BigData', 'Data', 'process', 'clean_llamadas_123_octubre_2019.csv']
['BigData', 'Data', 'process', 'clean_llamadas_123_septiembre_2019.csv']
total Dataframe:1


## Get raw data

In [3]:
 get_input_data(
    bucket = bucket, 
    initial_directory = directory, 
    filename = 'clean_llamadas_123_enero_2020.csv')
print(raw_data.info())

FileNotFoundError: b/espbigdata/o/BigData%2FData%2Fraw%2Fclean_llamadas_123_enero_2020.csv

It seems that this file contains already the fields, but need to rename them

### Rename Columns

In [None]:
# Don't let white spaces between headers to avoid errors during the pipeline
# This is sometimes called a recipe for etl
raw_data.rename(
    columns = {
        'FECHA_INICIO_DESPLAZAMIENTO-MOVIL' : 'FECHA_INICIO_DESPLAZAMIENTO_MOVIL',
        'CODIGO DE LOCALIDAD'               : 'CODIGO_LOCALIDAD',
        'CLASIFICACION FINAL'               : 'CLASIFICACION_FINAL'
    },
    inplace=True  #Npo sobre escribe 
)
raw_data.head()

### Check values per field
--------
1. LOCALIDAD
2. GENERO

In [None]:
raw_data['LOCALIDAD'].value_counts(dropna=False) ## dropna en true no muestra  los campos nullos en false los muestra por defecto viene en true


It seems that in some fields exist a white space at the begining of the values, probably at the end too, lets clean them all

In [None]:
# Let's use the function strip(), this property works for string objects in python that removes whitespaces
raw_data['LOCALIDAD'].apply(
    lambda x: x.strip()        # here we use a lambda function, exclusively for python. Is a one-line function
).value_counts()

In [None]:
raw_data['LOCALIDAD'] = raw_data['LOCALIDAD'].apply(lambda x: x.strip()) # we assign the output of the function apply to the same column

In [None]:
raw_data.head()

In [None]:
col = 'GENERO'
raw_data[col] = raw_data[col].apply(lambda x: x.strip())
raw_data[col].value_counts(dropna=False)   # here we could apply the same technique as before to avoid errors

In [None]:
col = 'TIPO_INCIDENTE'
raw_data[col] = raw_data[col].apply(lambda x: x.strip())
raw_data[col].value_counts(dropna=False)

In [None]:
col = 'CLASIFICACION_FINAL'
# raw_data[col] = raw_data[col].apply(lambda x: x.strip())
raw_data[col].value_counts(dropna=False)

Here this columns contains a null value, aka NaN (this is numeric value that is Null), lets replace them first
* NaN --> SIN_DATO

In [None]:
raw_data[col].fillna('SIN_DATO').value_counts()  #reemplaza null por sin dato

In [None]:
col = 'CLASIFICACION_FINAL'
raw_data[col].fillna('SIN_DATO', inplace=True)
raw_data[col] = raw_data[col].apply(lambda x: x.strip())
raw_data.head()

# Profile of the Data
------
first install pandas profiling, from a terminal

````
pip install pandas-profiling
````

In [None]:
profile = ProfileReport(df=raw_data)
profile.to_file(output_file=project_dir + '/reports/'+'profile_raw_data_julio_2019.html')

Form the initial report of the file we see that are duplicates rows in the table, we need to subtract them

### Remove duplicates

In [None]:
print('Number of rows before cleaning:', raw_data.shape[0] )
raw_data.drop_duplicates(inplace=True)
print('Number of rows after cleaning:', raw_data.shape[0] )

In [None]:
raw_data.head()

In [None]:
raw_data['FECHA_INCIDENTE'] = raw_data['FECHA_INCIDENTE'].apply(lambda _: datetime.datetime.strptime(_,"%Y-%m-%d %H:%M:%S"))
#import  datetime
#raw_data.head()
raw_data.info()

### Save the final table
use to_csv to save the table

In [None]:
# saving in a local directory
raw_data.to_csv('/home/jupyter/BigData/data/processed/' +'clean_llamadas_123_julio_2019.csv', encoding='latin1', sep=';', index=False)
#raw_data.to_csv(project_dir + '/data/processed/' +'clean_llamadas_123_julio_2019.csv', encoding='latin1', sep=';', index=False)