# Clean Data

***Import libraries***

In [None]:
import pandas as pd
import datetime
import ast
import warnings
import numpy as np
warnings.filterwarnings("ignore")

### CSV FILE ###

***Read data file***

In [None]:
trips_data = pd.read_csv("../data/raw/csv/combined_csv.csv", sep=",", low_memory=False)

***Delete the null rows***

In [None]:
trips_data.dropna(how="all", inplace=True)

In [None]:
trips_data.info()

***Modify column contents***

* Fecha

In [None]:
trips_data.rename(columns={'fecha': 'unlock_date1'}, inplace=True)

* idTrip

In [None]:
trips_data['idTrip'] = trips_data['idTrip'].fillna(trips_data['idTrip']).str.slice(stop=9)
trips_data.rename(columns={'idTrip': 'idDriver'}, inplace=True)

* IdBike

In [None]:
trips_data['idBike'] = trips_data['idBike'].astype(int)

* Fleet

In [None]:
trips_data['fleet'] = trips_data['fleet'].astype(int)

* Geolocation_unlock

In [None]:
trips_data['geolocation_unlock'] = trips_data['geolocation_unlock'].apply(ast.literal_eval)
trips_data['latitude_unlock'] = trips_data['geolocation_unlock'].apply(lambda x: x['coordinates'][1])
trips_data['longitude_unlock'] = trips_data['geolocation_unlock'].apply(lambda x: x['coordinates'][0])

In [None]:
trips_data['latitude_unlock'] = trips_data['latitude_unlock'].astype(str).str.slice(stop=8)
trips_data['longitude_unlock'] = trips_data['longitude_unlock'].astype(str).str.slice(stop=8)

In [None]:
trips_data.drop(['geolocation_unlock'], axis=1, inplace=True)

* Unlock_date

In [None]:
trips_data['unlock_date'] = trips_data['unlock_date'].str.split('T').str[-1]
trips_data.rename(columns={'unlock_date': 'unlock_hour'}, inplace=True)

In [None]:
trips_data.rename(columns={'unlock_date1': 'unlock_date'}, inplace=True)

* Geolocation_lock

In [None]:
trips_data['geolocation_lock'] = trips_data['geolocation_lock'].apply(ast.literal_eval)
trips_data['latitude_lock'] = trips_data['geolocation_lock'].apply(lambda x: x['coordinates'][1])
trips_data['longitude_lock'] = trips_data['geolocation_lock'].apply(lambda x: x['coordinates'][0])

In [None]:
trips_data['latitude_lock'] = trips_data['latitude_lock'].astype(str).str.slice(stop=8)
trips_data['longitude_lock'] = trips_data['longitude_lock'].astype(str).str.slice(stop=8)

In [None]:
trips_data.drop(['geolocation_lock'], axis=1, inplace=True)

* Lock_date

In [None]:
trips_data['lock_date'] = pd.to_datetime(trips_data['lock_date'])
trips_data['lock_date1'] = trips_data['lock_date'].dt.date
trips_data['lock_hour'] = trips_data['lock_date'].dt.time

In [None]:
trips_data.drop(['lock_date'], axis=1, inplace=True)

In [None]:
trips_data.rename(columns={'lock_date1': 'lock_date'}, inplace=True)

* Station_unlock

In [None]:
trips_data['station_unlock'] = pd.to_numeric(trips_data['station_unlock'], errors='coerce')
trips_data['station_unlock'] = trips_data['station_unlock'].fillna(0).astype(int)

* Dock_unlock

In [None]:
trips_data['dock_unlock'] = pd.to_numeric(trips_data['dock_unlock'], errors='coerce')
trips_data['dock_unlock'] = trips_data['dock_unlock'].fillna(0).astype(int)

* Station_lock

In [None]:
trips_data['station_lock'] = pd.to_numeric(trips_data['station_lock'], errors='coerce')
trips_data['station_lock'] = trips_data['station_lock'].fillna(0).astype(int)

* Dock_lock

In [None]:
trips_data['dock_lock'] = pd.to_numeric(trips_data['dock_lock'], errors='coerce')
trips_data['dock_lock'] = trips_data['dock_lock'].fillna(0).astype(int)

***Sort the columns***

In [None]:
trips_data.head().T

In [None]:
trips_data = trips_data.reindex(columns=['idDriver', 'idBike', 'trip_minutes', 'fleet', 'unlock_date', 'unlock_hour', 'latitude_unlock', 'longitude_unlock', 'address_unlock', 'unlocktype', 'station_unlock', 'dock_unlock', 'unlock_station_name','lock_date', 'lock_hour', 'latitude_lock', 'longitude_lock', 'address_lock', 'locktype', 'station_lock', 'dock_lock', 'lock_station_name'])

In [None]:
trips_data.info()

***Save the clean file***

In [None]:
trips_data.to_csv('../data/processed/clean_data_trips.csv', index=False)

### JSON FILE ###

***Read data file***

In [4]:
data_stations = pd.read_json(r'C:\Users\jlizo\Desktop\BiciMad_4geeks_ML\data\processed\combined_json.json', lines=True)

***Concat the columns***

In [6]:
stations_df = pd.json_normalize(data_stations['stations'].explode()) # To normalize json by 'stations'

In [7]:
replicas = 264 # Pre-calculate with dataset of 'trips' by concat the same values
data_stations_rep = pd.concat([data_stations]*replicas)

In [8]:
data_stations_rep = data_stations_rep.reset_index(drop=True) # Reset index for cleaning dataset

In [9]:
stations_data = pd.concat([data_stations_rep, stations_df], axis=1, join='outer') # To finish the concatened information

In [10]:
stations_data.head() # Read the head data

Unnamed: 0,_id,stations,activate,name,reservations_count,light,total_bases,free_bases,number,longitude,no_available,address,latitude,dock_bikes,id
0,2022-01-01T00:13:20.603583,"[{'activate': 1, 'name': 'Puerta del Sol A', '...",1,Puerta del Sol A,0,3,30,0,1a,-3.7018341,1,Puerta del Sol nº 1,40.4172137,0,1
1,2022-01-01T01:13:21.911079,"[{'activate': 1, 'name': 'Puerta del Sol A', '...",1,Puerta del Sol B,0,3,30,0,1b,-3.701602938060457,1,Puerta del Sol nº 1,40.41731271011562,0,2
2,2022-01-01T02:13:23.718951,"[{'activate': 1, 'name': 'Puerta del Sol A', '...",1,Miguel Moya,0,0,24,16,2,-3.7058415,0,Calle Miguel Moya nº 1,40.4205886,7,3
3,2022-01-01T03:13:23.902654,"[{'activate': 1, 'name': 'Puerta del Sol A', '...",1,Plaza Conde Suchil,0,1,18,1,3,-3.7069171,0,Plaza del Conde del Valle de Súchil nº 3,40.4302937,14,4
4,2022-01-01T04:13:26.826536,"[{'activate': 1, 'name': 'Puerta del Sol A', '...",1,Malasaña,0,1,24,2,4,-3.7025875,0,Calle Manuela Malasaña nº 5,40.4285524,17,5


***Modify the columns**

### COMENTAR

* _id:

In [11]:
stations_data['_id'] = pd.to_datetime(stations_data['_id'])
stations_data['date_station'] = stations_data['_id'].dt.date
stations_data['hour_station'] = stations_data['_id'].dt.time

In [12]:
stations_data['_id'] = stations_data['_id'].astype(str)
stations_data['_id'] = stations_data['_id'].str.slice(-6)

* hour_station

In [13]:
stations_data['hour_station'] = stations_data['hour_station'].astype(str)
stations_data['hour_station'] = stations_data['hour_station'].str.slice(stop=8)

* longitude & latitude

In [14]:
stations_data['longitude'] = stations_data['longitude'].str.slice(stop=8)
stations_data['latitude'] = stations_data['latitude'].str.slice(stop=8)

***Select the neccessary columns for lock and unlock stations***

In [15]:
data_stations_final_unlock = stations_data[['name', 'number','date_station', 'hour_station', 'longitude', 'latitude', 'light', 'total_bases', 'free_bases', 'no_available', 'dock_bikes']]
data_stations_final_unlock.rename(columns={'date_station': 'unlock_date', 'hour_station': 'unlock_hour', 'longitude': 'longitude_unlock', 'latitude': 'latitude_unlock', 'name':  'name_unlock', 'number': 'number_unlock','light' : 'light_unlock', 'total_bases' : 'total_bases_unlock', 'free_bases' : 'free_bases_unlock', 'no_available' : 'no_available_unlock', 'dock_bikes' : 'dock_bikes_unlock'}, inplace=True)

In [16]:
data_stations_final_lock = stations_data[['name', 'number','date_station', 'hour_station', 'longitude', 'latitude', 'light', 'total_bases', 'free_bases', 'no_available', 'dock_bikes']]
data_stations_final_lock.rename(columns={'date_station': 'lock_date', 'hour_station': 'lock_hour', 'longitude': 'longitude_lock', 'latitude': 'latitude_lock', 'name':  'name_lock', 'number': 'number_lock', 'light' : 'light_lock', 'total_bases' : 'total_bases_lock', 'free_bases' : 'free_bases_lock', 'no_available' : 'no_available_lock', 'dock_bikes' : 'dock_bikes_lock'}, inplace=True)

***Concat the datasets***

In [17]:
data_stations_final_unlock_lock = pd.concat([data_stations_final_unlock, data_stations_final_lock], axis=1)

***Save the dataset***

In [19]:
data_stations_final_unlock_lock.to_csv(r'C:\Users\jlizo\Desktop\BiciMad_4geeks_ML\data\processed\clean_data_stations.csv', index=False)

***Read and visualize the clenaed dataset***

In [20]:
data_stations_clean_inferences = pd.read_csv(r'C:\Users\jlizo\Desktop\BiciMad_4geeks_ML\data\processed\clean_data_stations.csv', sep=',')

In [21]:
data_stations_clean_inferences.head()

Unnamed: 0,name_unlock,number_unlock,unlock_date,unlock_hour,longitude_unlock,latitude_unlock,light_unlock,total_bases_unlock,free_bases_unlock,no_available_unlock,...,number_lock,lock_date,lock_hour,longitude_lock,latitude_lock,light_lock,total_bases_lock,free_bases_lock,no_available_lock,dock_bikes_lock
0,Puerta del Sol A,1a,2022-01-01,00:13:20,-3.70183,40.41721,3,30,0,1,...,1a,2022-01-01,00:13:20,-3.70183,40.41721,3,30,0,1,0
1,Puerta del Sol B,1b,2022-01-01,01:13:21,-3.7016,40.41731,3,30,0,1,...,1b,2022-01-01,01:13:21,-3.7016,40.41731,3,30,0,1,0
2,Miguel Moya,2,2022-01-01,02:13:23,-3.70584,40.42058,0,24,16,0,...,2,2022-01-01,02:13:23,-3.70584,40.42058,0,24,16,0,7
3,Plaza Conde Suchil,3,2022-01-01,03:13:23,-3.70691,40.43029,1,18,1,0,...,3,2022-01-01,03:13:23,-3.70691,40.43029,1,18,1,0,14
4,Malasaña,4,2022-01-01,04:13:26,-3.70258,40.42855,1,24,2,0,...,4,2022-01-01,04:13:26,-3.70258,40.42855,1,24,2,0,17


### Merge Data

# DESCARTAMOS OPCIONES

***option 1***

In [None]:
#trips_data_merge = pd.merge(trips_data, data_stations_final_unlock_lock, how='left', left_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock', 'lock_date', 'lock_hour', 'longitude_lock', 'latitude_lock'], right_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock', 'lock_date', 'lock_hour', 'longitude_lock', 'latitude_lock'])

***option R***

In [None]:
import pandas as pd

# Supongamos que 'trips_data' es tu DataFrame trips_data y 'data_stations_final_unlock_lock' es el otro DataFrame.

# Convertir la columna de fecha a tipo datetime
trips_data['unlock_date'] = pd.to_datetime(trips_data['unlock_date'])

# Realizar un merge en las columnas de coincidencia
merged_data = pd.merge(trips_data, data_stations_final_unlock_lock,
                       left_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'],
                       right_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'],
                       how='left', suffixes=('_trip', '_station'))

# Identificar las filas que no tuvieron coincidencias
no_match_mask = merged_data['name_unlock'].isnull()

# Concatenar las filas sin coincidencias al DataFrame resultante
result_data = pd.concat([merged_data[no_match_mask], merged_data[~no_match_mask].drop_duplicates()])

# Imprimir el DataFrame resultante
print(result_data.head())





Opcion J (Utiliza un bucle y agrega las filas al DataFrame uno por uno) 👉 Computacionalmente no parece asumible

In [None]:
# Imprimir las primeras filas del DataFrame resultante
print(trips_data_merge.head())


In [None]:
import pandas as pd

# Tu código para cargar los conjuntos de datos
# ...

# Realizar merge con tipo 'left'
trips_data_merge = pd.merge(trips_data, data_stations_final_unlock_lock, how='left', left_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'], right_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'])

# Imprimir las primeras filas del DataFrame resultante
print(trips_data_merge.head())

# Estadísticas y tipos de datos
print(trips_data_merge.describe())
print(trips_data_merge.dtypes)

# Contar valores nulos por columna
print(trips_data_merge.isnull().sum())


In [None]:
# Convertir la columna 'combined_datetime' de ambos DataFrames a datetime
trips_data['combined_datetime'] = pd.to_datetime(trips_data['combined_datetime'])
data_stations_final_unlock_lock['combined_datetime'] = pd.to_datetime(data_stations_final_unlock_lock['combined_datetime'])

# Realizar un merge utilizando 'outer'
trips_data_merge_outer = pd.merge(trips_data, data_stations_final_unlock_lock, on=['combined_datetime', 'longitude_unlock', 'latitude_unlock'], how='outer')

# Restablecer el índice del DataFrame resultante
trips_data_merge_outer.reset_index(drop=True, inplace=True)

# Imprimir el DataFrame resultante
print(trips_data_merge_outer.head())



In [None]:
trips_data_merge2 = pd.DataFrame(columns=trips_data.columns)

trips_data['unlock_hour'] = pd.to_datetime(trips_data['unlock_hour'])
data_stations_final_unlock_lock['unlock_hour'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_hour'])

for index, row in trips_data.iterrows():
    # Convertir la fecha de desbloqueo a datetime
    fecha_desbloqueo = pd.to_datetime(row['unlock_date'])
    
    # Filtrar las filas de 'data_stations_final_unlock_lock' que coinciden con la fecha y la hora de desbloqueo
    filtro = (data_stations_final_unlock_lock['unlock_date'] == fecha_desbloqueo) & \
             (data_stations_final_unlock_lock['unlock_hour'].dt.hour == row['unlock_hour'].dt.hour) & \
             (data_stations_final_unlock_lock['longitude_unlock'] == row['longitude_unlock']) & \
             (data_stations_final_unlock_lock['latitude_unlock'] == row['latitude_unlock'])
    
    # Agregar la fila de 'trips_data' al nuevo DataFrame
    trips_data_merge2 = trips_data_merge2.append(row)
    
    # Si existe una fila correspondiente en 'data_stations_final_unlock_lock', agregarla al nuevo DataFrame
    if not data_stations_final_unlock_lock[filtro].empty:
        trips_data_merge2 = trips_data_merge2.append(data_stations_final_unlock_lock[filtro].iloc[0])

# Imprimir el DataFrame resultante
trips_data_merge2.head()



In [None]:
import pandas as pd

def explore_dataframe_info(dataframe):
    # Ver las primeras filas
    print("Primeras filas del DataFrame:")
    print(dataframe.head())

    # Descripción estadística básica
    print("\nDescripción estadística:")
    print(dataframe.describe())

    # Tipos de datos
    print("\nTipos de datos de las columnas:")
    print(dataframe.dtypes)

    # Contar valores nulos
    print("\nCantidad de valores nulos por columna:")
    print(dataframe.isnull().sum())

    # Explorar valores únicos en cada columna
    print("\nValores únicos en cada columna:")
    for column in dataframe.columns:
        print(f"\nColumna: {column}")
        print(dataframe[column].unique())

# Ejemplo de uso con uno de tus DataFrames
explore_dataframe_info(trips_data)


In [None]:
def explore_dataframe_info(df):
    """
    Muestra información detallada sobre un DataFrame, incluyendo descripciones estadísticas,
    tipos de datos, y conteo de valores nulos por columna.

    Parameters:
    - df: DataFrame de pandas

    Returns:
    - None (imprime la información directamente)
    """

    # Primeras filas del DataFrame
    print(f"Primeras filas del DataFrame:\n{df.head()}")

    # Descripción estadística
    print("\nDescripción estadística:")
    print(df.describe())

    # Tipos de datos de las columnas
    print("\nTipos de datos de las columnas:")
    print(df.dtypes)

    # Cantidad de valores nulos por columna
    print("\nCantidad de valores nulos por columna:")
    print(df.isnull().sum())

# Aplicar la función al DataFrame data_stations_final_unlock_lock
explore_dataframe_info(data_stations_final_unlock_lock)


In [None]:
print(trips_data['unlock_date'].dtype)
print(trips_data['unlock_hour'].dtype)
print(data_stations_final_unlock_lock['unlock_date'].dtype)
print(data_stations_final_unlock_lock['unlock_hour'].dtype)


In [None]:
import pandas as pd

# Asegúrate de convertir las columnas de fecha y hora a tipos de datos apropiados
trips_data['unlock_date'] = pd.to_datetime(trips_data['unlock_date'])
data_stations_final_unlock_lock['unlock_date'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_date'])
trips_data['unlock_hour'] = pd.to_datetime(trips_data['unlock_hour']).dt.time
data_stations_final_unlock_lock['unlock_hour'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_hour']).dt.time

# Crea una nueva columna 'combined_datetime' en ambos DataFrames
trips_data['combined_datetime'] = pd.to_datetime(trips_data['unlock_date'].astype(str) + ' ' + trips_data['unlock_hour'].astype(str))
data_stations_final_unlock_lock['combined_datetime'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_date'].astype(str) + ' ' + data_stations_final_unlock_lock['unlock_hour'].astype(str))

# Realiza la fusión de los DataFrames utilizando 'merge'
trips_data_merge2 = pd.merge(trips_data, data_stations_final_unlock_lock,
                             left_on=['combined_datetime', 'longitude_unlock', 'latitude_unlock'],
                             right_on=['combined_datetime', 'longitude_unlock', 'latitude_unlock'],
                             how='left')

# Elimina columnas temporales
trips_data_merge2 = trips_data_merge2.drop(columns=['combined_datetime'])

# Elimina duplicados si es necesario
trips_data_merge2 = trips_data_merge2.drop_duplicates()

# Imprime el DataFrame resultante
print(trips_data_merge2.head())




In [None]:
# Convertir las columnas de fecha y hora a tipo datetime en ambos DataFrames
trips_data['unlock_hour'] = pd.to_datetime(trips_data['unlock_hour'])
data_stations_final_unlock_lock['unlock_hour'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_hour'])
trips_data['unlock_date'] = pd.to_datetime(trips_data['unlock_date'])

# Merge de los DataFrames utilizando 'merge'
trips_data_merge2 = pd.merge(trips_data, data_stations_final_unlock_lock,
                             left_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'],
                             right_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'],
                             how='left')

# Eliminar duplicados si es necesario
trips_data_merge2 = trips_data_merge2.drop_duplicates()

# Imprimir el DataFrame resultante
print(trips_data_merge2.head())


In [None]:
import pandas as pd

# Convertir las columnas de fecha y hora a tipo datetime
trips_data['unlock_hour'] = pd.to_datetime(trips_data['unlock_hour'])
data_stations_final_unlock_lock['unlock_hour'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_hour'])
trips_data['unlock_date'] = pd.to_datetime(trips_data['unlock_date'])

# Crear un DataFrame vacío sin especificar columnas
trips_data_merge2 = pd.DataFrame()

for index, row in trips_data.iterrows():
    # Convertir la fecha de desbloqueo a datetime
    fecha_desbloqueo = pd.to_datetime(row['unlock_date'])
    
    # Filtrar las filas de 'data_stations_final_unlock_lock' que coinciden con la fecha y la hora de desbloqueo
    filtro = (data_stations_final_unlock_lock['unlock_date'] == fecha_desbloqueo) & \
             (data_stations_final_unlock_lock['unlock_hour'].dt.hour == row['unlock_hour'].hour) & \
             (data_stations_final_unlock_lock['longitude_unlock'] == row['longitude_unlock']) & \
             (data_stations_final_unlock_lock['latitude_unlock'] == row['latitude_unlock'])
    
    # Agregar la fila de 'trips_data' al nuevo DataFrame
    trips_data_merge2 = trips_data_merge2.append(row)
    
    # Si existe una fila correspondiente en 'data_stations_final_unlock_lock', agregarla al nuevo DataFrame
    if not data_stations_final_unlock_lock[filtro].empty:
        trips_data_merge2 = trips_data_merge2.append(data_stations_final_unlock_lock[filtro].iloc[0])

# Eliminar duplicados si es necesario
trips_data_merge2 = trips_data_merge2.drop_duplicates()

# Imprimir el DataFrame resultante
print(trips_data_merge2.head())



In [None]:
trips_data['lock_hour'] = pd.to_datetime(trips_data['lock_hour'])
data_stations_final_unlock_lock['lock_hour'] = pd.to_datetime(data_stations_final_unlock_lock['lock_hour'])

for index, row in trips_data.iterrows():
    # Convertir la fecha de desbloqueo a datetime
    fecha_desbloqueo_l = pd.to_datetime(row['lock_date'])
    
    # Filtrar las filas de 'data_stations_final_unlock_lock' que coinciden con la fecha y la hora de desbloqueo
    filtro = (data_stations_final_unlock_lock['lock_date'] == fecha_desbloqueo_l) & \
             (data_stations_final_unlock_lock['lock_hour'].dt.hour == row['lock_hour'].dt.hour) & \
             (data_stations_final_unlock_lock['longitude_lock'] == row['longitude_lock']) & \
             (data_stations_final_unlock_lock['latitude_lock'] == row['latitude_lock'])
    
    # Agregar la fila de 'trips_data' al nuevo DataFrame
    trips_data_merge2 = trips_data_merge2.append(row)
    
    # Si existe una fila correspondiente en 'data_stations_final_unlock_lock', agregarla al nuevo DataFrame
    if not data_stations_final_unlock_lock[filtro].empty:
        trips_data_merge2 = trips_data_merge2.append(data_stations_final_unlock_lock[filtro].iloc[0])

# Imprimir el DataFrame resultante
trips_data_merge2.head()

***Save the clean data***

In [None]:
# trips_data_merge.to_csv('../data/processed/clean_data_trips_stations.csv', index=False)