# Exploratory Data Analysis

### Import libraries

In [75]:
import pandas as pd
import datetime
import ast
import warnings
warnings.filterwarnings("ignore")

***Read data file***

In [36]:
# Load the combined raw trips export. low_memory=False makes pandas parse the
# file in a single pass, so column dtypes are inferred from the whole column
# rather than chunk by chunk.
trips_data = pd.read_csv(
    "../data/raw/csv/combined_csv.csv",
    sep=",",
    low_memory=False,
)

***Drop the rows in which every column is null***

In [37]:
# Remove only the rows where *all* columns are missing; rows with some
# missing values are kept.
trips_data.dropna(axis=0, how="all", inplace=True)

In [51]:
# Preview the first five rows of the loaded table.
trips_data.head(n=5)

Unnamed: 0,fecha,idTrip,idBike,fleet,trip_minutes,geolocation_unlock,address_unlock,unlock_date,locktype,unlocktype,geolocation_lock,address_lock,lock_date,station_unlock,dock_unlock,unlock_station_name,station_lock,dock_lock,lock_station_name
1,2022-01-01,489978239_1718_2022-01-01T00:18:37,1718.0,1.0,16.28,"{'type': 'Point', 'coordinates': [-3.6714166, ...",,2022-01-01T00:02:20,STATION,STATION,"{'type': 'Point', 'coordinates': [-3.688398, 4...",,2022-01-01T00:18:37,200.0,3.0,Avenida de los Toreros,64.0,4.0,Plaza de la Independencia
3,2022-01-01,06023769T_7340_2022-01-01T00:14:59,7340.0,1.0,7.1,"{'type': 'Point', 'coordinates': [-3.6894193, ...",,2022-01-01T00:07:53,STATION,STATION,"{'type': 'Point', 'coordinates': [-3.6993465, ...",,2022-01-01T00:14:59,69.0,5.0,Antonio Maura,169.0,17.0,Manuel Silvela
5,2022-01-01,50856526F_3861_2022-01-01T00:09:50,3861.0,1.0,0.48,"{'type': 'Point', 'coordinates': [-3.688822, 4...",,2022-01-01T00:09:21,STATION,STATION,"{'type': 'Point', 'coordinates': [-3.688822, 4...",,2022-01-01T00:09:50,73.0,21.0,Puerta del Ángel Caído,73.0,21.0,Puerta del Ángel Caído
7,2022-01-01,79006741Q_7657_2022-01-01T00:16:29,7657.0,1.0,6.62,"{'type': 'Point', 'coordinates': [-3.6653055, ...",,2022-01-01T00:09:52,STATION,STATION,"{'type': 'Point', 'coordinates': [-3.6657777, ...",,2022-01-01T00:16:29,192.0,22.0,Marqués de Zafra,190.0,17.0,Parque Roma
9,2022-01-01,76959007H_6653_2022-01-01T00:18:01,6653.0,1.0,8.07,"{'type': 'Point', 'coordinates': [-3.6983055, ...",,2022-01-01T00:09:57,STATION,STATION,"{'type': 'Point', 'coordinates': [-3.7025024, ...",,2022-01-01T00:18:01,183.0,3.0,Jaime el Conquistador,47.0,24.0,Jesús y María


***Modify the columns***

* Fecha

In [None]:
# Rename the date-only column 'fecha' to 'unlock_date1' in place; any cell that
# runs after this one has to use the new column name.
trips_data.rename({'fecha': 'unlock_date1'}, axis='columns', inplace=True)

* idTrip:

In [59]:
# Keep only the first 9 characters of idTrip, i.e. the segment before the first
# '_' in the sample ids (e.g. '489978239_1718_2022-01-01T00:18:37' -> '489978239').
# Missing idTrip values stay missing: the previous fillna(trips_data['idTrip'])
# filled the column with itself and therefore changed nothing.
# NOTE(review): this prefix looks like a personal identifier (the disabled draft
# further down names it 'DNI') — confirm, and avoid displaying it in outputs.
trips_data['idTrip'] = trips_data['idTrip'].str.slice(stop=9)

In [61]:
# After truncation the column holds the rider identifier rather than a trip id,
# so give it a matching name (in place).
trips_data.rename({'idTrip': 'idDriver'}, axis='columns', inplace=True)

* idBike

In [64]:
# idBike is loaded as float (e.g. 1718.0); store it as an integer column.
# astype(int) raises if the column contains missing values.
trips_data['idBike'] = trips_data.loc[:, 'idBike'].astype(int)

* Fleet

In [65]:
# fleet is loaded as float (1.0 / 2.0); store it as an integer column.
# astype(int) raises if the column contains missing values.
trips_data['fleet'] = trips_data.loc[:, 'fleet'].astype(int)

* Geolocation unlock

In [76]:
# geolocation_unlock is read from the CSV as the text of a Python dict, e.g.
# "{'type': 'Point', 'coordinates': [lon, lat]}". Parse it into a real dict.
# Only strings are parsed, so re-running this cell on already-parsed dicts is
# harmless (ast.literal_eval raises when handed a dict).
trips_data['geolocation_unlock'] = trips_data['geolocation_unlock'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# 'coordinates' is ordered [longitude, latitude].
trips_data['latitude_unlock'] = trips_data['geolocation_unlock'].apply(
    lambda point: point['coordinates'][1]
)
trips_data['longitude_unlock'] = trips_data['geolocation_unlock'].apply(
    lambda point: point['coordinates'][0]
)

In [21]:
# An earlier cell renames 'fecha' to 'unlock_date1'. Resolve the column name at
# run time so this cell works whether or not that rename has been executed
# (a hard-coded 'fecha' raises KeyError on a top-to-bottom run).
date_col = 'unlock_date1' if 'unlock_date1' in trips_data.columns else 'fecha'

# Convert the date column to datetime if it is not already.
trips_data[date_col] = pd.to_datetime(trips_data[date_col])

# Sort the DataFrame chronologically by that column.
trips_data = trips_data.sort_values(by=date_col)

# Show the first rows of the sorted DataFrame.
trips_data.head()

Unnamed: 0,fecha,idTrip,idBike,fleet,trip_minutes,geolocation_unlock,address_unlock,unlock_date,locktype,unlocktype,geolocation_lock,address_lock,lock_date,station_unlock,dock_unlock,unlock_station_name,station_lock,dock_lock,lock_station_name
4144135,2022-01-01,489978239_1718_2022-01-01T00:18:37,1718.0,1.0,16.28,"{'type': 'Point', 'coordinates': [-3.6714166, ...",,2022-01-01T00:02:20,STATION,STATION,"{'type': 'Point', 'coordinates': [-3.688398, 4...",,2022-01-01T00:18:37,200.0,3.0,Avenida de los Toreros,64.0,4.0,Plaza de la Independencia
4144137,2022-01-01,06023769T_7340_2022-01-01T00:14:59,7340.0,1.0,7.1,"{'type': 'Point', 'coordinates': [-3.6894193, ...",,2022-01-01T00:07:53,STATION,STATION,"{'type': 'Point', 'coordinates': [-3.6993465, ...",,2022-01-01T00:14:59,69.0,5.0,Antonio Maura,169.0,17.0,Manuel Silvela
4144139,2022-01-01,50856526F_3861_2022-01-01T00:09:50,3861.0,1.0,0.48,"{'type': 'Point', 'coordinates': [-3.688822, 4...",,2022-01-01T00:09:21,STATION,STATION,"{'type': 'Point', 'coordinates': [-3.688822, 4...",,2022-01-01T00:09:50,73.0,21.0,Puerta del Ángel Caído,73.0,21.0,Puerta del Ángel Caído
4144141,2022-01-01,79006741Q_7657_2022-01-01T00:16:29,7657.0,1.0,6.62,"{'type': 'Point', 'coordinates': [-3.6653055, ...",,2022-01-01T00:09:52,STATION,STATION,"{'type': 'Point', 'coordinates': [-3.6657777, ...",,2022-01-01T00:16:29,192.0,22.0,Marqués de Zafra,190.0,17.0,Parque Roma
4144143,2022-01-01,76959007H_6653_2022-01-01T00:18:01,6653.0,1.0,8.07,"{'type': 'Point', 'coordinates': [-3.6983055, ...",,2022-01-01T00:09:57,STATION,STATION,"{'type': 'Point', 'coordinates': [-3.7025024, ...",,2022-01-01T00:18:01,183.0,3.0,Jaime el Conquistador,47.0,24.0,Jesús y María


In [42]:
# Number of trips per fleet, most frequent first.
trips_data["fleet"].value_counts(sort=True, ascending=False)



1.0    4111082
2.0      33052
Name: fleet, dtype: int64

In [36]:
# Count the missing values in each column.
null_counts = trips_data.isna().sum(axis=0)
print(null_counts)


fecha                        0
idTrip                 2041145
idBike                       0
fleet                        0
trip_minutes                 0
geolocation_unlock           0
address_unlock          548474
unlock_date                  0
locktype                     0
unlocktype                   0
geolocation_lock             0
address_lock            547602
lock_date                    0
station_unlock           16792
dock_unlock              16792
unlock_station_name      16792
station_lock             19907
dock_lock                19907
lock_station_name        19907
dtype: int64


**TODO:** review the null values in `idTrip`.

Once the nulls in `idTrip` are fixed, split the column using the code below:

In [None]:
# Disabled draft: the code below is wrapped in a string literal, so it is not
# executed. It is meant to split idTrip into its components once the nulls in
# that column have been dealt with.
# NOTE(review): splitting a sample id such as '489978239_1718_2022-01-01T00:18:37'
# on '_' or '-' yields 5 pieces (the date itself contains '-'), but only 4 target
# columns are listed, so the assignment would fail as written. Earlier cells also
# truncate idTrip to 9 characters and rename it to 'idDriver'; this draft would
# have to run before them or read the raw column.
'''# Separar la columna "idTrip" en varias columnas
trips_data[['DNI', 'idBike2', 'Fecha2', 'Hora_Fin']] = trips_data['idTrip'].str.split('_|-', expand=True)

# Convertir la columna 'Fecha' a tipo de dato datetime
trips_data['Fecha2'] = pd.to_datetime(trips_data['Fecha2'])

# Mostrar el DataFrame con las nuevas columnas
print(trips_data)'''


### Clean final data

In [None]:
def limpieza_datos(trips_data, min_minutes=3, max_minutes=120):
    """Clean the imported trips table and return the cleaned result.

    The function:
      1. keeps only trips whose duration lies between ``min_minutes`` and
         ``max_minutes`` (both inclusive; rows with a missing duration are dropped);
      2. removes every "<digits> - " sequence (the station number prefix) from
         the unlock/lock station names;
      3. parses ``unlock_date`` with the format ``%Y-%m-%dT%H:%M:%S`` and drops
         the rows whose value does not match that format.

    Parameters
    ----------
    trips_data : pandas.DataFrame
        Must contain the columns 'trip_minutes', 'unlock_station_name',
        'lock_station_name' and 'unlock_date'.
    min_minutes : float, default 3
        Shortest trip duration (in minutes) to keep.
    max_minutes : float, default 120
        Longest trip duration (in minutes) to keep.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame; the input frame is not modified.
    """
    duration_ok = trips_data["trip_minutes"].between(min_minutes, max_minutes)
    # Work on an explicit copy so the column assignments below write to an
    # independent frame instead of a slice of the caller's data.
    cleaned = trips_data.loc[duration_ok].copy()

    # Raw string + regex=True: the pattern must be interpreted as a regular
    # expression (recent pandas versions treat str.replace patterns literally
    # by default).
    station_number_prefix = r"\d+\s*-\s*"
    for column in ("unlock_station_name", "lock_station_name"):
        cleaned[column] = cleaned[column].str.replace(
            station_number_prefix, "", regex=True
        )

    # Values that do not match the expected timestamp format become NaT and are
    # removed in the final dropna.
    cleaned["unlock_date"] = pd.to_datetime(
        cleaned["unlock_date"], format="%Y-%m-%dT%H:%M:%S", errors="coerce"
    )
    return cleaned.dropna(subset=["unlock_date"])