# Clean Data

***Import libraries***

In [147]:
import pandas as pd
import datetime
import ast
import warnings
warnings.filterwarnings("ignore")

### CSV FILE ###

***Read data file***

In [148]:
trips_data = pd.read_csv("../data/raw/csv/combined_csv.csv", sep=",", low_memory=False)

***Delete the null rows***

In [149]:
trips_data.dropna(how="all", inplace=True)

In [150]:
trips_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4144134 entries, 1 to 8288267
Data columns (total 19 columns):
 #   Column               Dtype  
---  ------               -----  
 0   fecha                object 
 1   idTrip               object 
 2   idBike               float64
 3   fleet                float64
 4   trip_minutes         float64
 5   geolocation_unlock   object 
 6   address_unlock       object 
 7   unlock_date          object 
 8   locktype             object 
 9   unlocktype           object 
 10  geolocation_lock     object 
 11  address_lock         object 
 12  lock_date            object 
 13  station_unlock       object 
 14  dock_unlock          float64
 15  unlock_station_name  object 
 16  station_lock         float64
 17  dock_lock            float64
 18  lock_station_name    object 
dtypes: float64(6), object(13)
memory usage: 632.3+ MB


***Modify column contents***

* Fecha

In [151]:
trips_data.rename(columns={'fecha': 'unlock_date1'}, inplace=True)

* idTrip

In [152]:
trips_data['idTrip'] = trips_data['idTrip'].fillna(trips_data['idTrip']).str.slice(stop=9)
trips_data.rename(columns={'idTrip': 'idDriver'}, inplace=True)

* IdBike

In [153]:
trips_data['idBike'] = trips_data['idBike'].astype(int)

* Fleet

In [154]:
trips_data['fleet'] = trips_data['fleet'].astype(int)

* Geolocation_unlock

In [155]:
trips_data['geolocation_unlock'] = trips_data['geolocation_unlock'].apply(ast.literal_eval)
trips_data['latitude_unlock'] = trips_data['geolocation_unlock'].apply(lambda x: x['coordinates'][1])
trips_data['longitude_unlock'] = trips_data['geolocation_unlock'].apply(lambda x: x['coordinates'][0])

In [156]:
trips_data.drop(['geolocation_unlock'], axis=1, inplace=True)

* Unlock_date

In [157]:
trips_data['unlock_date'] = trips_data['unlock_date'].str.split('T').str[-1]
trips_data.rename(columns={'unlock_date': 'unlock_hour'}, inplace=True)

In [158]:
trips_data.rename(columns={'unlock_date1': 'unlock_date'}, inplace=True)

* Geolocation_lock

In [159]:
trips_data['geolocation_lock'] = trips_data['geolocation_lock'].apply(ast.literal_eval)
trips_data['latitude_lock'] = trips_data['geolocation_lock'].apply(lambda x: x['coordinates'][1])
trips_data['longitude_lock'] = trips_data['geolocation_lock'].apply(lambda x: x['coordinates'][0])

In [160]:
trips_data.drop(['geolocation_lock'], axis=1, inplace=True)

* Lock_date

In [161]:
trips_data['lock_date'] = pd.to_datetime(trips_data['lock_date'])
trips_data['lock_date1'] = trips_data['lock_date'].dt.date
trips_data['lock_hour'] = trips_data['lock_date'].dt.time

In [162]:
trips_data.drop(['lock_date'], axis=1, inplace=True)

In [163]:
trips_data.rename(columns={'lock_date1': 'lock_date'}, inplace=True)

* Station_unlock

In [164]:
trips_data['station_unlock'] = pd.to_numeric(trips_data['station_unlock'], errors='coerce')
trips_data['station_unlock'] = trips_data['station_unlock'].fillna(0).astype(int)

* Dock_unlock

In [165]:
trips_data['dock_unlock'] = pd.to_numeric(trips_data['dock_unlock'], errors='coerce')
trips_data['dock_unlock'] = trips_data['dock_unlock'].fillna(0).astype(int)

* Station_lock

In [166]:
trips_data['station_lock'] = pd.to_numeric(trips_data['station_lock'], errors='coerce')
trips_data['station_lock'] = trips_data['station_lock'].fillna(0).astype(int)

* Dock_lock

In [167]:
trips_data['dock_lock'] = pd.to_numeric(trips_data['dock_lock'], errors='coerce')
trips_data['dock_lock'] = trips_data['dock_lock'].fillna(0).astype(int)

***Sort the columns***

In [168]:
trips_data.head().T

Unnamed: 0,1,3,5,7,9
unlock_date,2022-01-01,2022-01-01,2022-01-01,2022-01-01,2022-01-01
idDriver,489978239,06023769T,50856526F,79006741Q,76959007H
idBike,1718,7340,3861,7657,6653
fleet,1,1,1,1,1
trip_minutes,16.28,7.1,0.48,6.62,8.07
address_unlock,,,,,
unlock_hour,00:02:20,00:07:53,00:09:21,00:09:52,00:09:57
locktype,STATION,STATION,STATION,STATION,STATION
unlocktype,STATION,STATION,STATION,STATION,STATION
address_lock,,,,,


In [169]:
trips_data = trips_data.reindex(columns=['idDriver', 'idBike', 'trip_minutes', 'fleet', 'unlock_date', 'unlock_hour', 'latitude_unlock', 'longitude_unlock', 'address_unlock', 'unlocktype', 'station_unlock', 'dock_unlock', 'unlock_station_name','lock_date', 'lock_hour', 'latitude_lock', 'longitude_lock', 'address_lock', 'locktype', 'station_lock', 'dock_lock', 'lock_station_name'])

In [170]:
trips_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4144134 entries, 1 to 8288267
Data columns (total 22 columns):
 #   Column               Dtype  
---  ------               -----  
 0   idDriver             object 
 1   idBike               int32  
 2   trip_minutes         float64
 3   fleet                int32  
 4   unlock_date          object 
 5   unlock_hour          object 
 6   latitude_unlock      float64
 7   longitude_unlock     float64
 8   address_unlock       object 
 9   unlocktype           object 
 10  station_unlock       int32  
 11  dock_unlock          int32  
 12  unlock_station_name  object 
 13  lock_date            object 
 14  lock_hour            object 
 15  latitude_lock        object 
 16  longitude_lock       object 
 17  address_lock         object 
 18  locktype             object 
 19  station_lock         int32  
 20  dock_lock            int32  
 21  lock_station_name    object 
dtypes: float64(3), int32(6), object(13)
memory usage: 632.3+ MB


***Save the clean file***

In [171]:
trips_data.to_csv('../data/processed/clean_data_trips.csv', index=False)

### JSON FILE ###

In [172]:
data_stations = pd.read_json('../data/raw/json/combined_json.json', lines=True)

In [173]:
stations_df = pd.json_normalize(data_stations['stations'].explode())

In [176]:
replicas = 264
data_stations_rep = pd.concat([data_stations]*replicas)

In [192]:
data_stations_rep = data_stations_rep.reset_index(drop=True)

In [193]:
data_stations_final = pd.concat([data_stations_rep, stations_df], axis=1, join='outer')

In [197]:
data_stations_final.head()

Unnamed: 0,_id,activate,name,reservations_count,light,total_bases,free_bases,number,longitude,no_available,address,latitude,dock_bikes,id
0,2022-01-01T00:13:20.603583,1,Puerta del Sol A,0,3,30,0,1a,-3.7018341,1,Puerta del Sol nº 1,40.4172137,0,1
1,2022-01-01T01:13:21.911079,1,Puerta del Sol B,0,3,30,0,1b,-3.701602938060457,1,Puerta del Sol nº 1,40.41731271011562,0,2
2,2022-01-01T02:13:23.718951,1,Miguel Moya,0,0,24,16,2,-3.7058415,0,Calle Miguel Moya nº 1,40.4205886,7,3
3,2022-01-01T03:13:23.902654,1,Plaza Conde Suchil,0,1,18,1,3,-3.7069171,0,Plaza del Conde del Valle de Súchil nº 3,40.4302937,14,4
4,2022-01-01T04:13:26.826536,1,Malasaña,0,1,24,2,4,-3.7025875,0,Calle Manuela Malasaña nº 5,40.4285524,17,5


In [206]:
data_stations_final['_id'] = pd.to_datetime(data_stations_final['_id'])
data_stations_final['date_station'] = data_stations_final['_id'].dt.date
data_stations_final['hour_station'] = data_stations_final['_id'].dt.time

In [207]:
data_stations_final.head()

Unnamed: 0,_id,activate,name,reservations_count,light,total_bases,free_bases,number,longitude,no_available,address,latitude,dock_bikes,id,date_station,hour_station
0,2022-01-01 00:13:20.603583,1,Puerta del Sol A,0,3,30,0,1a,-3.7018341,1,Puerta del Sol nº 1,40.4172137,0,1,2022-01-01,00:13:20.603583
1,2022-01-01 01:13:21.911079,1,Puerta del Sol B,0,3,30,0,1b,-3.701602938060457,1,Puerta del Sol nº 1,40.41731271011562,0,2,2022-01-01,01:13:21.911079
2,2022-01-01 02:13:23.718951,1,Miguel Moya,0,0,24,16,2,-3.7058415,0,Calle Miguel Moya nº 1,40.4205886,7,3,2022-01-01,02:13:23.718951
3,2022-01-01 03:13:23.902654,1,Plaza Conde Suchil,0,1,18,1,3,-3.7069171,0,Plaza del Conde del Valle de Súchil nº 3,40.4302937,14,4,2022-01-01,03:13:23.902654
4,2022-01-01 04:13:26.826536,1,Malasaña,0,1,24,2,4,-3.7025875,0,Calle Manuela Malasaña nº 5,40.4285524,17,5,2022-01-01,04:13:26.826536


In [208]:
trips_data.head()

Unnamed: 0,idDriver,idBike,trip_minutes,fleet,unlock_date,unlock_hour,latitude_unlock,longitude_unlock,address_unlock,unlocktype,station_unlock,dock_unlock,unlock_station_name,lock_date,lock_hour,latitude_lock,longitude_lock,address_lock,locktype,station_lock,dock_lock,lock_station_name
1,489978239,1718,16.28,1,2022-01-01,00:02:20,40.431861,-3.671417,,STATION,200,3,Avenida de los Toreros,2022-01-01,00:18:37,40.419752,-3.688398,,STATION,64,4,Plaza de la Independencia
3,06023769T,7340,7.1,1,2022-01-01,00:07:53,40.416683,-3.689419,,STATION,69,5,Antonio Maura,2022-01-01,00:14:59,40.430952,-3.699346,,STATION,169,17,Manuel Silvela
5,50856526F,3861,0.48,1,2022-01-01,00:09:21,40.409808,-3.688822,,STATION,73,21,Puerta del Ángel Caído,2022-01-01,00:09:50,40.409808,-3.688822,,STATION,73,21,Puerta del Ángel Caído
7,79006741Q,7657,6.62,1,2022-01-01,00:09:52,40.426,-3.665306,,STATION,192,22,Marqués de Zafra,2022-01-01,00:16:29,40.418667,-3.665778,,STATION,190,17,Parque Roma
9,76959007H,6653,8.07,1,2022-01-01,00:09:57,40.396222,-3.698306,,STATION,183,3,Jaime el Conquistador,2022-01-01,00:18:01,40.410156,-3.702502,,STATION,47,24,Jesús y María
