# Clean Data

***Import libraries***

In [1]:
import pandas as pd
import datetime
import ast
import warnings
import numpy as np
warnings.filterwarnings("ignore")

### CSV FILE ###

***Read data file***

In [2]:
trips_data = pd.read_csv("../data/raw/csv/combined_csv.csv", sep=",", low_memory=False)

***Delete the null rows***

In [3]:
trips_data.dropna(how="all", inplace=True)

In [4]:
trips_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4144134 entries, 0 to 4144133
Data columns (total 19 columns):
 #   Column               Dtype  
---  ------               -----  
 0   fecha                object 
 1   idTrip               object 
 2   idBike               float64
 3   fleet                float64
 4   trip_minutes         float64
 5   geolocation_unlock   object 
 6   address_unlock       object 
 7   unlock_date          object 
 8   locktype             object 
 9   unlocktype           object 
 10  geolocation_lock     object 
 11  address_lock         object 
 12  lock_date            object 
 13  station_unlock       object 
 14  dock_unlock          float64
 15  unlock_station_name  object 
 16  station_lock         float64
 17  dock_lock            float64
 18  lock_station_name    object 
dtypes: float64(6), object(13)
memory usage: 600.7+ MB


***Modify column contents***

* Fecha

In [5]:
trips_data.rename(columns={'fecha': 'unlock_date1'}, inplace=True)

* idTrip

In [6]:
trips_data['idTrip'] = trips_data['idTrip'].fillna(trips_data['idTrip']).str.slice(stop=9)
trips_data.rename(columns={'idTrip': 'idDriver'}, inplace=True)

* IdBike

In [7]:
trips_data['idBike'] = trips_data['idBike'].astype(int)

* Fleet

In [8]:
trips_data['fleet'] = trips_data['fleet'].astype(int)

* Geolocation_unlock

In [9]:
trips_data['geolocation_unlock'] = trips_data['geolocation_unlock'].apply(ast.literal_eval)
trips_data['latitude_unlock'] = trips_data['geolocation_unlock'].apply(lambda x: x['coordinates'][1])
trips_data['longitude_unlock'] = trips_data['geolocation_unlock'].apply(lambda x: x['coordinates'][0])

In [10]:
trips_data['latitude_unlock'] = trips_data['latitude_unlock'].astype(str).str.slice(stop=8)
trips_data['longitude_unlock'] = trips_data['longitude_unlock'].astype(str).str.slice(stop=8)

In [11]:
trips_data.drop(['geolocation_unlock'], axis=1, inplace=True)

* Unlock_date

In [12]:
trips_data['unlock_date'] = trips_data['unlock_date'].str.split('T').str[-1]
trips_data.rename(columns={'unlock_date': 'unlock_hour'}, inplace=True)

In [13]:
trips_data.rename(columns={'unlock_date1': 'unlock_date'}, inplace=True)

* Geolocation_lock

In [14]:
trips_data['geolocation_lock'] = trips_data['geolocation_lock'].apply(ast.literal_eval)
trips_data['latitude_lock'] = trips_data['geolocation_lock'].apply(lambda x: x['coordinates'][1])
trips_data['longitude_lock'] = trips_data['geolocation_lock'].apply(lambda x: x['coordinates'][0])

In [15]:
trips_data['latitude_lock'] = trips_data['latitude_lock'].astype(str).str.slice(stop=8)
trips_data['longitude_lock'] = trips_data['longitude_lock'].astype(str).str.slice(stop=8)

In [16]:
trips_data.drop(['geolocation_lock'], axis=1, inplace=True)

* Lock_date

In [17]:
trips_data['lock_date'] = pd.to_datetime(trips_data['lock_date'])
trips_data['lock_date1'] = trips_data['lock_date'].dt.date
trips_data['lock_hour'] = trips_data['lock_date'].dt.time

In [18]:
trips_data.drop(['lock_date'], axis=1, inplace=True)

In [19]:
trips_data.rename(columns={'lock_date1': 'lock_date'}, inplace=True)

* Station_unlock

In [20]:
trips_data['station_unlock'] = pd.to_numeric(trips_data['station_unlock'], errors='coerce')
trips_data['station_unlock'] = trips_data['station_unlock'].fillna(0).astype(int)

* Dock_unlock

In [21]:
trips_data['dock_unlock'] = pd.to_numeric(trips_data['dock_unlock'], errors='coerce')
trips_data['dock_unlock'] = trips_data['dock_unlock'].fillna(0).astype(int)

* Station_lock

In [22]:
trips_data['station_lock'] = pd.to_numeric(trips_data['station_lock'], errors='coerce')
trips_data['station_lock'] = trips_data['station_lock'].fillna(0).astype(int)

* Dock_lock

In [23]:
trips_data['dock_lock'] = pd.to_numeric(trips_data['dock_lock'], errors='coerce')
trips_data['dock_lock'] = trips_data['dock_lock'].fillna(0).astype(int)

***Sort the columns***

In [24]:
trips_data.head().T

Unnamed: 0,0,1,2,3,4
unlock_date,2022-01-01,2022-01-01,2022-01-01,2022-01-01,2022-01-01
idDriver,489978239,06023769T,50856526F,79006741Q,76959007H
idBike,1718,7340,3861,7657,6653
fleet,1,1,1,1,1
trip_minutes,16.28,7.1,0.48,6.62,8.07
address_unlock,,,,,
unlock_hour,00:02:20,00:07:53,00:09:21,00:09:52,00:09:57
locktype,STATION,STATION,STATION,STATION,STATION
unlocktype,STATION,STATION,STATION,STATION,STATION
address_lock,,,,,


In [25]:
trips_data = trips_data.reindex(columns=['idDriver', 'idBike', 'trip_minutes', 'fleet', 'unlock_date', 'unlock_hour', 'latitude_unlock', 'longitude_unlock', 'address_unlock', 'unlocktype', 'station_unlock', 'dock_unlock', 'unlock_station_name','lock_date', 'lock_hour', 'latitude_lock', 'longitude_lock', 'address_lock', 'locktype', 'station_lock', 'dock_lock', 'lock_station_name'])

In [26]:
trips_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4144134 entries, 0 to 4144133
Data columns (total 22 columns):
 #   Column               Dtype  
---  ------               -----  
 0   idDriver             object 
 1   idBike               int32  
 2   trip_minutes         float64
 3   fleet                int32  
 4   unlock_date          object 
 5   unlock_hour          object 
 6   latitude_unlock      object 
 7   longitude_unlock     object 
 8   address_unlock       object 
 9   unlocktype           object 
 10  station_unlock       int32  
 11  dock_unlock          int32  
 12  unlock_station_name  object 
 13  lock_date            object 
 14  lock_hour            object 
 15  latitude_lock        object 
 16  longitude_lock       object 
 17  address_lock         object 
 18  locktype             object 
 19  station_lock         int32  
 20  dock_lock            int32  
 21  lock_station_name    object 
dtypes: float64(1), int32(6), object(15)
memory usage: 600.7+ MB


***Save the clean file***

In [27]:
trips_data.to_csv('../data/processed/clean_data_trips.csv', index=False)

### JSON FILE ###

***Read data file***

In [28]:
data_stations = pd.read_json('../data/raw/json/combined_json.json', lines=True)

***Concat the columns***

In [29]:
stations_df = pd.json_normalize(data_stations['stations'].explode())

In [30]:
replicas = 264
data_stations_rep = pd.concat([data_stations]*replicas)

In [31]:
data_stations_rep = data_stations_rep.reset_index(drop=True)

In [32]:
data_stations_final = pd.concat([data_stations_rep, stations_df], axis=1, join='outer')

***Modify the columns**

* _id

In [33]:
data_stations_final['_id'] = pd.to_datetime(data_stations_final['_id'])
data_stations_final['date_station'] = data_stations_final['_id'].dt.date
data_stations_final['hour_station'] = data_stations_final['_id'].dt.time

In [34]:
data_stations_final['_id'] = data_stations_final['_id'].astype(str)
data_stations_final['_id'] = data_stations_final['_id'].str.slice(-6)

* hour_station

In [35]:
data_stations_final['hour_station'] = data_stations_final['hour_station'].astype(str)
data_stations_final['hour_station'] = data_stations_final['hour_station'].str.slice(stop=8)

* longitude & latitude

In [36]:
data_stations_final['longitude'] = data_stations_final['longitude'].str.slice(stop=8)
data_stations_final['latitude'] = data_stations_final['latitude'].str.slice(stop=8)

***Select the neccessary columns for lock and unlock stations***

In [37]:
data_stations_final_unlock = data_stations_final[['name', 'number','date_station', 'hour_station', 'longitude', 'latitude', 'light', 'total_bases', 'free_bases', 'no_available', 'dock_bikes']]
data_stations_final_unlock.rename(columns={'date_station': 'unlock_date', 'hour_station': 'unlock_hour', 'longitude': 'longitude_unlock', 'latitude': 'latitude_unlock', 'name':  'name_unlock', 'number': 'number_unlock','light' : 'light_unlock', 'total_bases' : 'total_bases_unlock', 'free_bases' : 'free_bases_unlock', 'no_available' : 'no_available_unlock', 'dock_bikes' : 'dock_bikes_unlock'}, inplace=True)

In [38]:
data_stations_final_lock = data_stations_final[['name', 'number','date_station', 'hour_station', 'longitude', 'latitude', 'light', 'total_bases', 'free_bases', 'no_available', 'dock_bikes']]
data_stations_final_lock.rename(columns={'date_station': 'lock_date', 'hour_station': 'lock_hour', 'longitude': 'longitude_lock', 'latitude': 'latitude_lock', 'name':  'name_lock', 'number': 'number_lock', 'light' : 'light_lock', 'total_bases' : 'total_bases_lock', 'free_bases' : 'free_bases_lock', 'no_available' : 'no_available_lock', 'dock_bikes' : 'dock_bikes_lock'}, inplace=True)

***Concat the datasets***

In [39]:
data_stations_final_unlock_lock = pd.concat([data_stations_final_unlock, data_stations_final_lock], axis=1)

***Save the dataset***

In [40]:
data_stations_final_unlock_lock.to_csv('../data/processed/clean_data_stations.csv', index=False)

### Merge Data

***option 1***

In [None]:
#trips_data_merge = pd.merge(trips_data, data_stations_final_unlock_lock, how='left', left_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock', 'lock_date', 'lock_hour', 'longitude_lock', 'latitude_lock'], right_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock', 'lock_date', 'lock_hour', 'longitude_lock', 'latitude_lock'])

***option R***

In [63]:
import pandas as pd

# Supongamos que 'trips_data' es tu DataFrame trips_data y 'data_stations_final_unlock_lock' es el otro DataFrame.

# Convertir la columna de fecha a tipo datetime
trips_data['unlock_date'] = pd.to_datetime(trips_data['unlock_date'])

# Realizar un merge en las columnas de coincidencia
merged_data = pd.merge(trips_data, data_stations_final_unlock_lock,
                       left_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'],
                       right_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'],
                       how='left', suffixes=('_trip', '_station'))

# Identificar las filas que no tuvieron coincidencias
no_match_mask = merged_data['name_unlock'].isnull()

# Concatenar las filas sin coincidencias al DataFrame resultante
result_data = pd.concat([merged_data[no_match_mask], merged_data[~no_match_mask].drop_duplicates()])

# Imprimir el DataFrame resultante
print(result_data.head())





    idDriver  idBike  trip_minutes  fleet unlock_date unlock_hour  \
0  489978239    1718         16.28      1  2022-01-01    00:02:20   
1  06023769T    7340          7.10      1  2022-01-01    00:07:53   
2  50856526F    3861          0.48      1  2022-01-01    00:09:21   
3  79006741Q    7657          6.62      1  2022-01-01    00:09:52   
4  76959007H    6653          8.07      1  2022-01-01    00:09:57   

  latitude_unlock longitude_unlock address_unlock unlocktype  ...  \
0        40.43186         -3.67141            NaN    STATION  ...   
1        40.41668         -3.68941            NaN    STATION  ...   
2        40.40980         -3.68882            NaN    STATION  ...   
3          40.426         -3.66530            NaN    STATION  ...   
4        40.39622         -3.69830            NaN    STATION  ...   

   lock_hour_station  longitude_lock_station latitude_lock_station light_lock  \
0                NaN                     NaN                   NaN        NaN   
1       

Opcion J (Utiliza un bucle y agrega las filas al DataFrame uno por uno) 👉 Computacionalmente no parece asumible

In [59]:
# Imprimir las primeras filas del DataFrame resultante
print(trips_data_merge.head())


    idDriver  idBike  trip_minutes  fleet unlock_date unlock_hour  \
0  489978239  1718.0         16.28    1.0  2022-01-01    00:02:20   
1  06023769T  7340.0          7.10    1.0  2022-01-01    00:07:53   
2  50856526F  3861.0          0.48    1.0  2022-01-01    00:09:21   
3  79006741Q  7657.0          6.62    1.0  2022-01-01    00:09:52   
4  76959007H  6653.0          8.07    1.0  2022-01-01    00:09:57   

  latitude_unlock longitude_unlock address_unlock unlocktype  ...  \
0        40.43186         -3.67141            NaN    STATION  ...   
1        40.41668         -3.68941            NaN    STATION  ...   
2        40.40980         -3.68882            NaN    STATION  ...   
3          40.426         -3.66530            NaN    STATION  ...   
4        40.39622         -3.69830            NaN    STATION  ...   

   lock_hour_station  longitude_lock_station latitude_lock_station light_lock  \
0                NaN                     NaN                   NaN        NaN   
1       

In [57]:
import pandas as pd

# Tu código para cargar los conjuntos de datos
# ...

# Realizar merge con tipo 'left'
trips_data_merge = pd.merge(trips_data, data_stations_final_unlock_lock, how='left', left_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'], right_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'])

# Imprimir las primeras filas del DataFrame resultante
print(trips_data_merge.head())

# Estadísticas y tipos de datos
print(trips_data_merge.describe())
print(trips_data_merge.dtypes)

# Contar valores nulos por columna
print(trips_data_merge.isnull().sum())


    idDriver  idBike  trip_minutes  fleet unlock_date unlock_hour  \
0  489978239    1718         16.28      1  2022-01-01    00:02:20   
1  06023769T    7340          7.10      1  2022-01-01    00:07:53   
2  50856526F    3861          0.48      1  2022-01-01    00:09:21   
3  79006741Q    7657          6.62      1  2022-01-01    00:09:52   
4  76959007H    6653          8.07      1  2022-01-01    00:09:57   

  latitude_unlock longitude_unlock address_unlock unlocktype  ...  \
0        40.43186         -3.67141            NaN    STATION  ...   
1        40.41668         -3.68941            NaN    STATION  ...   
2        40.40980         -3.68882            NaN    STATION  ...   
3          40.426         -3.66530            NaN    STATION  ...   
4        40.39622         -3.69830            NaN    STATION  ...   

   lock_hour_y  longitude_lock_y latitude_lock_y light_lock total_bases_lock  \
0          NaN               NaN             NaN        NaN              NaN   
1         

In [56]:
# Convertir la columna 'combined_datetime' de ambos DataFrames a datetime
trips_data['combined_datetime'] = pd.to_datetime(trips_data['combined_datetime'])
data_stations_final_unlock_lock['combined_datetime'] = pd.to_datetime(data_stations_final_unlock_lock['combined_datetime'])

# Realizar un merge utilizando 'outer'
trips_data_merge_outer = pd.merge(trips_data, data_stations_final_unlock_lock, on=['combined_datetime', 'longitude_unlock', 'latitude_unlock'], how='outer')

# Restablecer el índice del DataFrame resultante
trips_data_merge_outer.reset_index(drop=True, inplace=True)

# Imprimir el DataFrame resultante
print(trips_data_merge_outer.head())



    idDriver  idBike  trip_minutes  fleet unlock_date_x unlock_hour_x  \
0  489978239  1718.0         16.28    1.0    2022-01-01      00:02:20   
1  06023769T  7340.0          7.10    1.0    2022-01-01      00:07:53   
2  50856526F  3861.0          0.48    1.0    2022-01-01      00:09:21   
3  79006741Q  7657.0          6.62    1.0    2022-01-01      00:09:52   
4  76959007H  6653.0          8.07    1.0    2022-01-01      00:09:57   

  latitude_unlock longitude_unlock address_unlock unlocktype  ...  \
0        40.43186         -3.67141            NaN    STATION  ...   
1        40.41668         -3.68941            NaN    STATION  ...   
2        40.40980         -3.68882            NaN    STATION  ...   
3          40.426         -3.66530            NaN    STATION  ...   
4        40.39622         -3.69830            NaN    STATION  ...   

   lock_date_y  lock_hour_y longitude_lock_y latitude_lock_y light_lock  \
0          NaN          NaN              NaN             NaN        NaN

In [41]:
trips_data_merge2 = pd.DataFrame(columns=trips_data.columns)

trips_data['unlock_hour'] = pd.to_datetime(trips_data['unlock_hour'])
data_stations_final_unlock_lock['unlock_hour'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_hour'])

for index, row in trips_data.iterrows():
    # Convertir la fecha de desbloqueo a datetime
    fecha_desbloqueo = pd.to_datetime(row['unlock_date'])
    
    # Filtrar las filas de 'data_stations_final_unlock_lock' que coinciden con la fecha y la hora de desbloqueo
    filtro = (data_stations_final_unlock_lock['unlock_date'] == fecha_desbloqueo) & \
             (data_stations_final_unlock_lock['unlock_hour'].dt.hour == row['unlock_hour'].dt.hour) & \
             (data_stations_final_unlock_lock['longitude_unlock'] == row['longitude_unlock']) & \
             (data_stations_final_unlock_lock['latitude_unlock'] == row['latitude_unlock'])
    
    # Agregar la fila de 'trips_data' al nuevo DataFrame
    trips_data_merge2 = trips_data_merge2.append(row)
    
    # Si existe una fila correspondiente en 'data_stations_final_unlock_lock', agregarla al nuevo DataFrame
    if not data_stations_final_unlock_lock[filtro].empty:
        trips_data_merge2 = trips_data_merge2.append(data_stations_final_unlock_lock[filtro].iloc[0])

# Imprimir el DataFrame resultante
trips_data_merge2.head()



AttributeError: 'Timestamp' object has no attribute 'dt'

In [53]:
import pandas as pd

def explore_dataframe_info(dataframe):
    # Ver las primeras filas
    print("Primeras filas del DataFrame:")
    print(dataframe.head())

    # Descripción estadística básica
    print("\nDescripción estadística:")
    print(dataframe.describe())

    # Tipos de datos
    print("\nTipos de datos de las columnas:")
    print(dataframe.dtypes)

    # Contar valores nulos
    print("\nCantidad de valores nulos por columna:")
    print(dataframe.isnull().sum())

    # Explorar valores únicos en cada columna
    print("\nValores únicos en cada columna:")
    for column in dataframe.columns:
        print(f"\nColumna: {column}")
        print(dataframe[column].unique())

# Ejemplo de uso con uno de tus DataFrames
explore_dataframe_info(trips_data)


Primeras filas del DataFrame:
    idDriver  idBike  trip_minutes  fleet unlock_date unlock_hour  \
0  489978239    1718         16.28      1  2022-01-01    00:02:20   
1  06023769T    7340          7.10      1  2022-01-01    00:07:53   
2  50856526F    3861          0.48      1  2022-01-01    00:09:21   
3  79006741Q    7657          6.62      1  2022-01-01    00:09:52   
4  76959007H    6653          8.07      1  2022-01-01    00:09:57   

  latitude_unlock longitude_unlock address_unlock unlocktype  ...  lock_hour  \
0        40.43186         -3.67141            NaN    STATION  ...   00:18:37   
1        40.41668         -3.68941            NaN    STATION  ...   00:14:59   
2        40.40980         -3.68882            NaN    STATION  ...   00:09:50   
3          40.426         -3.66530            NaN    STATION  ...   00:16:29   
4        40.39622         -3.69830            NaN    STATION  ...   00:18:01   

   latitude_lock longitude_lock address_lock locktype station_lock dock_lo

In [54]:
def explore_dataframe_info(df):
    """
    Muestra información detallada sobre un DataFrame, incluyendo descripciones estadísticas,
    tipos de datos, y conteo de valores nulos por columna.

    Parameters:
    - df: DataFrame de pandas

    Returns:
    - None (imprime la información directamente)
    """

    # Primeras filas del DataFrame
    print(f"Primeras filas del DataFrame:\n{df.head()}")

    # Descripción estadística
    print("\nDescripción estadística:")
    print(df.describe())

    # Tipos de datos de las columnas
    print("\nTipos de datos de las columnas:")
    print(df.dtypes)

    # Cantidad de valores nulos por columna
    print("\nCantidad de valores nulos por columna:")
    print(df.isnull().sum())

# Aplicar la función al DataFrame data_stations_final_unlock_lock
explore_dataframe_info(data_stations_final_unlock_lock)


Primeras filas del DataFrame:
          name_unlock number_unlock unlock_date unlock_hour longitude_unlock  \
0    Puerta del Sol A            1a  2022-01-01    00:13:20         -3.70183   
1    Puerta del Sol B            1b  2022-01-01    01:13:21         -3.70160   
2         Miguel Moya             2  2022-01-01    02:13:23         -3.70584   
3  Plaza Conde Suchil             3  2022-01-01    03:13:23         -3.70691   
4            Malasaña             4  2022-01-01    04:13:26         -3.70258   

  latitude_unlock  light_unlock  total_bases_unlock  free_bases_unlock  \
0        40.41721             3                  30                  0   
1        40.41731             3                  30                  0   
2        40.42058             0                  24                 16   
3        40.43029             1                  18                  1   
4        40.42855             1                  24                  2   

   no_available_unlock  ...  lock_hour longi

In [46]:
print(trips_data['unlock_date'].dtype)
print(trips_data['unlock_hour'].dtype)
print(data_stations_final_unlock_lock['unlock_date'].dtype)
print(data_stations_final_unlock_lock['unlock_hour'].dtype)


datetime64[ns]
datetime64[ns]
object
datetime64[ns]


In [51]:
import pandas as pd

# Asegúrate de convertir las columnas de fecha y hora a tipos de datos apropiados
trips_data['unlock_date'] = pd.to_datetime(trips_data['unlock_date'])
data_stations_final_unlock_lock['unlock_date'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_date'])
trips_data['unlock_hour'] = pd.to_datetime(trips_data['unlock_hour']).dt.time
data_stations_final_unlock_lock['unlock_hour'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_hour']).dt.time

# Crea una nueva columna 'combined_datetime' en ambos DataFrames
trips_data['combined_datetime'] = pd.to_datetime(trips_data['unlock_date'].astype(str) + ' ' + trips_data['unlock_hour'].astype(str))
data_stations_final_unlock_lock['combined_datetime'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_date'].astype(str) + ' ' + data_stations_final_unlock_lock['unlock_hour'].astype(str))

# Realiza la fusión de los DataFrames utilizando 'merge'
trips_data_merge2 = pd.merge(trips_data, data_stations_final_unlock_lock,
                             left_on=['combined_datetime', 'longitude_unlock', 'latitude_unlock'],
                             right_on=['combined_datetime', 'longitude_unlock', 'latitude_unlock'],
                             how='left')

# Elimina columnas temporales
trips_data_merge2 = trips_data_merge2.drop(columns=['combined_datetime'])

# Elimina duplicados si es necesario
trips_data_merge2 = trips_data_merge2.drop_duplicates()

# Imprime el DataFrame resultante
print(trips_data_merge2.head())




    idDriver  idBike  trip_minutes  fleet unlock_date_x unlock_hour_x  \
0  489978239    1718         16.28      1    2022-01-01      00:02:20   
1  06023769T    7340          7.10      1    2022-01-01      00:07:53   
2  50856526F    3861          0.48      1    2022-01-01      00:09:21   
3  79006741Q    7657          6.62      1    2022-01-01      00:09:52   
4  76959007H    6653          8.07      1    2022-01-01      00:09:57   

  latitude_unlock longitude_unlock address_unlock unlocktype  ...  \
0        40.43186         -3.67141            NaN    STATION  ...   
1        40.41668         -3.68941            NaN    STATION  ...   
2        40.40980         -3.68882            NaN    STATION  ...   
3          40.426         -3.66530            NaN    STATION  ...   
4        40.39622         -3.69830            NaN    STATION  ...   

   lock_date_y  lock_hour_y longitude_lock_y latitude_lock_y light_lock  \
0          NaN          NaN              NaN             NaN        NaN

In [45]:
# Convertir las columnas de fecha y hora a tipo datetime en ambos DataFrames
trips_data['unlock_hour'] = pd.to_datetime(trips_data['unlock_hour'])
data_stations_final_unlock_lock['unlock_hour'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_hour'])
trips_data['unlock_date'] = pd.to_datetime(trips_data['unlock_date'])

# Merge de los DataFrames utilizando 'merge'
trips_data_merge2 = pd.merge(trips_data, data_stations_final_unlock_lock,
                             left_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'],
                             right_on=['unlock_date', 'unlock_hour', 'longitude_unlock', 'latitude_unlock'],
                             how='left')

# Eliminar duplicados si es necesario
trips_data_merge2 = trips_data_merge2.drop_duplicates()

# Imprimir el DataFrame resultante
print(trips_data_merge2.head())


ValueError: You are trying to merge on datetime64[ns] and object columns. If you wish to proceed you should use pd.concat

In [43]:
import pandas as pd

# Convertir las columnas de fecha y hora a tipo datetime
trips_data['unlock_hour'] = pd.to_datetime(trips_data['unlock_hour'])
data_stations_final_unlock_lock['unlock_hour'] = pd.to_datetime(data_stations_final_unlock_lock['unlock_hour'])
trips_data['unlock_date'] = pd.to_datetime(trips_data['unlock_date'])

# Crear un DataFrame vacío sin especificar columnas
trips_data_merge2 = pd.DataFrame()

for index, row in trips_data.iterrows():
    # Convertir la fecha de desbloqueo a datetime
    fecha_desbloqueo = pd.to_datetime(row['unlock_date'])
    
    # Filtrar las filas de 'data_stations_final_unlock_lock' que coinciden con la fecha y la hora de desbloqueo
    filtro = (data_stations_final_unlock_lock['unlock_date'] == fecha_desbloqueo) & \
             (data_stations_final_unlock_lock['unlock_hour'].dt.hour == row['unlock_hour'].hour) & \
             (data_stations_final_unlock_lock['longitude_unlock'] == row['longitude_unlock']) & \
             (data_stations_final_unlock_lock['latitude_unlock'] == row['latitude_unlock'])
    
    # Agregar la fila de 'trips_data' al nuevo DataFrame
    trips_data_merge2 = trips_data_merge2.append(row)
    
    # Si existe una fila correspondiente en 'data_stations_final_unlock_lock', agregarla al nuevo DataFrame
    if not data_stations_final_unlock_lock[filtro].empty:
        trips_data_merge2 = trips_data_merge2.append(data_stations_final_unlock_lock[filtro].iloc[0])

# Eliminar duplicados si es necesario
trips_data_merge2 = trips_data_merge2.drop_duplicates()

# Imprimir el DataFrame resultante
print(trips_data_merge2.head())



KeyboardInterrupt: 

In [None]:
trips_data['lock_hour'] = pd.to_datetime(trips_data['lock_hour'])
data_stations_final_unlock_lock['lock_hour'] = pd.to_datetime(data_stations_final_unlock_lock['lock_hour'])

for index, row in trips_data.iterrows():
    # Convertir la fecha de desbloqueo a datetime
    fecha_desbloqueo_l = pd.to_datetime(row['lock_date'])
    
    # Filtrar las filas de 'data_stations_final_unlock_lock' que coinciden con la fecha y la hora de desbloqueo
    filtro = (data_stations_final_unlock_lock['lock_date'] == fecha_desbloqueo_l) & \
             (data_stations_final_unlock_lock['lock_hour'].dt.hour == row['lock_hour'].dt.hour) & \
             (data_stations_final_unlock_lock['longitude_lock'] == row['longitude_lock']) & \
             (data_stations_final_unlock_lock['latitude_lock'] == row['latitude_lock'])
    
    # Agregar la fila de 'trips_data' al nuevo DataFrame
    trips_data_merge2 = trips_data_merge2.append(row)
    
    # Si existe una fila correspondiente en 'data_stations_final_unlock_lock', agregarla al nuevo DataFrame
    if not data_stations_final_unlock_lock[filtro].empty:
        trips_data_merge2 = trips_data_merge2.append(data_stations_final_unlock_lock[filtro].iloc[0])

# Imprimir el DataFrame resultante
trips_data_merge2.head()

***Save the clean data***

In [None]:
# trips_data_merge.to_csv('../data/processed/clean_data_trips_stations.csv', index=False)