# Clean Data

***Import libraries***

In [115]:
import pandas as pd
import datetime
import ast
import warnings
warnings.filterwarnings("ignore")

### CSV FILE ###

***Read data file***

In [116]:
trips_data = pd.read_csv("../data/raw/csv/combined_csv.csv", sep=",", low_memory=False)

***Delete the null rows***

In [117]:
trips_data.dropna(how="all", inplace=True)

In [118]:
trips_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4144134 entries, 1 to 8288267
Data columns (total 19 columns):
 #   Column               Dtype  
---  ------               -----  
 0   fecha                object 
 1   idTrip               object 
 2   idBike               float64
 3   fleet                float64
 4   trip_minutes         float64
 5   geolocation_unlock   object 
 6   address_unlock       object 
 7   unlock_date          object 
 8   locktype             object 
 9   unlocktype           object 
 10  geolocation_lock     object 
 11  address_lock         object 
 12  lock_date            object 
 13  station_unlock       object 
 14  dock_unlock          float64
 15  unlock_station_name  object 
 16  station_lock         float64
 17  dock_lock            float64
 18  lock_station_name    object 
dtypes: float64(6), object(13)
memory usage: 632.3+ MB


***Modify column contents***

* Fecha

In [119]:
trips_data.rename(columns={'fecha': 'unlock_date1'}, inplace=True)

* idTrip

In [120]:
trips_data['idTrip'] = trips_data['idTrip'].fillna(trips_data['idTrip']).str.slice(stop=9)
trips_data.rename(columns={'idTrip': 'idDriver'}, inplace=True)

* IdBike

In [121]:
trips_data['idBike'] = trips_data['idBike'].astype(int)

* Fleet

In [122]:
trips_data['fleet'] = trips_data['fleet'].astype(int)

* Geolocation_unlock

In [123]:
trips_data['geolocation_unlock'] = trips_data['geolocation_unlock'].apply(ast.literal_eval)
trips_data['latitude_unlock'] = trips_data['geolocation_unlock'].apply(lambda x: x['coordinates'][1])
trips_data['longitude_unlock'] = trips_data['geolocation_unlock'].apply(lambda x: x['coordinates'][0])

In [124]:
trips_data.drop(['geolocation_unlock'], axis=1, inplace=True)

* Unlock_date

In [125]:
trips_data['unlock_date'] = trips_data['unlock_date'].str.split('T').str[-1]
trips_data.rename(columns={'unlock_date': 'unlock_hour'}, inplace=True)

In [126]:
trips_data.rename(columns={'unlock_date1': 'unlock_date'}, inplace=True)

* Geolocation_lock

In [127]:
trips_data['geolocation_lock'] = trips_data['geolocation_lock'].apply(ast.literal_eval)
trips_data['latitude_lock'] = trips_data['geolocation_lock'].apply(lambda x: x['coordinates'][1])
trips_data['longitude_lock'] = trips_data['geolocation_lock'].apply(lambda x: x['coordinates'][0])

In [None]:
trips_data.drop(['geolocation_lock'], axis=1, inplace=True)

* Lock_date

In [None]:
trips_data['lock_date'] = pd.to_datetime(trips_data['lock_date'])
trips_data['lock_date1'] = trips_data['lock_date'].dt.date
trips_data['lock_hour'] = trips_data['lock_date'].dt.time

In [None]:
trips_data.drop(['lock_date'], axis=1, inplace=True)

In [None]:
trips_data.rename(columns={'lock_date1': 'unlock_date'}, inplace=True)

* Station_unlock

In [None]:
trips_data['station_unlock'] = pd.to_numeric(trips_data['station_unlock'], errors='coerce')
trips_data['station_unlock'] = trips_data['station_unlock'].fillna(0).astype(int)

* Dock_unlock

In [None]:
trips_data['dock_unlock'] = pd.to_numeric(trips_data['dock_unlock'], errors='coerce')
trips_data['dock_unlock'] = trips_data['dock_unlock'].fillna(0).astype(int)

* Station_lock

In [None]:
trips_data['station_lock'] = pd.to_numeric(trips_data['station_lock'], errors='coerce')
trips_data['station_lock'] = trips_data['station_lock'].fillna(0).astype(int)

* Dock_lock

In [None]:
trips_data['dock_lock'] = pd.to_numeric(trips_data['dock_lock'], errors='coerce')
trips_data['dock_lock'] = trips_data['dock_lock'].fillna(0).astype(int)

***Sort the columns***

In [None]:
trips_data = trips_data.reindex(columns=['idDriver', 'idBike', 'trip_minutes', 'fleet', 'unlock_date', 'unlock_hour', 'latitude_unlock', 'longitude_unlock', 'address_unlock', 'unlocktype', 'station_unlock', 'dock_unlock', 'unlock_station_name','lock_date', 'lock_hour', 'latitude_lock', 'longitude_lock', 'address_lock', 'locktype', 'station_lock', 'dock_lock', 'lock_station_name'])

In [None]:
trips_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4144134 entries, 1 to 8288267
Data columns (total 22 columns):
 #   Column               Dtype  
---  ------               -----  
 0   idDriver             object 
 1   idBike               int32  
 2   trip_minutes         float64
 3   fleet                int32  
 4   unlock_date          object 
 5   unlock_hour          object 
 6   latitude_unlock      float64
 7   longitude_unlock     float64
 8   address_unlock       object 
 9   unlocktype           object 
 10  station_unlock       int32  
 11  dock_unlock          int32  
 12  unlock_station_name  object 
 13  lock_date            float64
 14  lock_hour            object 
 15  latitude_lock        object 
 16  longitude_lock       object 
 17  address_lock         object 
 18  locktype             object 
 19  station_lock         int32  
 20  dock_lock            int32  
 21  lock_station_name    object 
dtypes: float64(4), int32(6), object(12)
memory usage: 632.3+ MB


***Save the clean file***

In [None]:
trips_data.to_csv('../data/processed/clean_data_trips.csv', index=False)

### JSON FILE ###

In [None]:
data_stations = pd.read_json('../data/raw/json/combined_json.json', lines=True)

In [None]:
stations_df = pd.json_normalize(data_stations['stations'].explode())

In [None]:
data_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8738 entries, 0 to 8737
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   _id       8738 non-null   object
 1   stations  8738 non-null   object
dtypes: object(2)
memory usage: 136.7+ KB


In [None]:
stations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2306832 entries, 0 to 2306831
Data columns (total 13 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   activate            int64 
 1   name                object
 2   reservations_count  int64 
 3   light               int64 
 4   total_bases         int64 
 5   free_bases          int64 
 6   number              object
 7   longitude           object
 8   no_available        int64 
 9   address             object
 10  latitude            object
 11  dock_bikes          int64 
 12  id                  int64 
dtypes: int64(8), object(5)
memory usage: 228.8+ MB


In [None]:
replicas = 264
data_stations_rep = pd.concat([data_stations]*replicas)