# Exploratory Data Analysis

***Import neccessary libraries***

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random

### EDA

***Read the files***

In [2]:
data_trips = pd.read_csv('../data/processed/clean_data_trips.csv', sep=',', low_memory=False)
data_stations = pd.read_csv('../data/processed/clean_data_stations.csv', sep=',', low_memory=False)

***Exploration and data cleaning***

In [4]:
data_trips.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4144134 entries, 0 to 4144133
Data columns (total 22 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   idDriver             2102989 non-null  object 
 1   idBike               4144134 non-null  int64  
 2   trip_minutes         4144134 non-null  float64
 3   fleet                4144134 non-null  int64  
 4   unlock_date          4144134 non-null  object 
 5   unlock_hour          4144134 non-null  object 
 6   latitude_unlock      4144134 non-null  float64
 7   longitude_unlock     4144134 non-null  float64
 8   address_unlock       3595660 non-null  object 
 9   unlocktype           4144134 non-null  object 
 10  station_unlock       4144134 non-null  int64  
 11  dock_unlock          4144134 non-null  int64  
 12  unlock_station_name  4127342 non-null  object 
 13  lock_date            4144134 non-null  object 
 14  lock_hour            4144134 non-null  object 
 15

-  We have 4144134 rows corresponding to an entire year of data collection with 32 columns of varied data
- There are variables with Null values ​​that indicate that some of the columns have not been identified, for example, idDriver
-  There are 21 numerical variables and 11 categorical variables


* Eliminate duplicates

In [5]:
data_trips = data_trips.drop_duplicates()
data_stations = data_stations.drop_duplicates()

* Eliminate irrelevant information

In [9]:
data_trips.drop(['address_unlock', 'unlocktype', 'address_lock', 'locktype'], axis=1, inplace=True)

In [6]:
data_trips = data_trips.dropna(subset=['idDriver'])

In [7]:
data_trips = data_trips[data_trips["trip_minutes"] <= 120]
data_trips = data_trips[data_trips["trip_minutes"] >= 3]

In [11]:
data_trips = data_trips.dropna(subset=['unlock_station_name'])
data_trips = data_trips.dropna(subset=['lock_station_name'])

In [None]:
# Convierte 'unlock_date' a datetime
data_trips['unlock_date'] = pd.to_datetime(data_trips['unlock_date'])

# Crea un nuevo DataFrame vacío
data_trips_red = pd.DataFrame()

while len(data_trips_red) < 60000:
    # Selecciona un mes aleatorio
    mes = pd.Timestamp(random.choice(data_trips['unlock_date'])).month
    
    # Filtra las filas de 'data_trips' que pertenecen al mes seleccionado
    df_mes = data_trips[data_trips['unlock_date'].dt.month == mes]
    
    # Calcula el porcentaje de datos que se deben tomar del mes
    porcentaje = min(1, 60000 / len(df_mes))
    
    # Toma una muestra aleatoria de 'df_mes'
    df_muestra = df_mes.sample(frac=porcentaje)
    
    # Agrega la muestra al nuevo DataFrame
    data_trips_red = pd.concat([data_trips_red, df_muestra], ignore_index=True)

# Ordena el nuevo DataFrame por 'unlock_date'
data_trips_red = data_trips_red.sort_values('unlock_date')



In [21]:
data_trips_red = df_reducido

In [22]:
data_trips_red.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 60000 entries, 8043 to 33698
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   idDriver             60000 non-null  object        
 1   idBike               60000 non-null  int64         
 2   trip_minutes         60000 non-null  float64       
 3   fleet                60000 non-null  int64         
 4   unlock_date          60000 non-null  datetime64[ns]
 5   unlock_hour          60000 non-null  object        
 6   latitude_unlock      60000 non-null  float64       
 7   longitude_unlock     60000 non-null  float64       
 8   station_unlock       60000 non-null  int64         
 9   dock_unlock          60000 non-null  int64         
 10  unlock_station_name  60000 non-null  object        
 11  lock_date            60000 non-null  object        
 12  lock_hour            60000 non-null  object        
 13  latitude_lock        60000 non-nu

*** ¡Hay una incongruencia en la relación de los dos datsets en los números de las estaciones!. El nombre coincide, pero los números hay varios que varian, puesto que en data_trips empiezan por 1=Puerta de Sol A y en data_stations con 1A directamente ***

***Analysis of univariate variables***

* Analysis on categorical variables

In [147]:
fig, axis = plt.subplots(1, 3, figsize = (10, 7))

sns.histplot(ax = axis[0], data = data_trips, x = "idDriver").set(ylabel = None)
sns.histplot(ax = axis[1], data = data_trips, x = "unlock_date").set(ylabel = None)
sns.histplot(ax = axis[2], data = data_trips, x = "unlock_hour").set(ylabel = None)

plt.tight_layout()
plt.show()

KeyboardInterrupt: 

Error in callback <function _draw_all_if_interactive at 0x0000025B00C60F40> (for post_execute), with arguments args (),kwargs {}:


KeyboardInterrupt: 

* Analysis on numeric variables

***Analysis of multivariate variables***

* Numerical-numerical analysis

* Categorical-categorical analysis

* Combinations of class with various predictors

* Correlation analysis

* Numerical-categorical analysis (complete)

***Feature engineering***

* Outlier analysis

* Missing value analysis

* Inference of new features

* Feature scaling

* Normalization / Mix-Max Scaling

***Feature selection***

***Save the interim data***

In [None]:
'''# Creamos una función para proceder a la limpieza y adecuación del conjunto de datos 
def limpieza_datos (trips_data):
  # Eliminamos las entradas de los viajes que han durado más de 2h
  trips_data = trips_data[trips_data["trip_minutes"] <= 120]
  # Eliminamos las entradas de los viajes que han durado menos de 3 minutos
  trips_data = trips_data[trips_data["trip_minutes"] >= 3]
  # Eliminamos los números de las estaciones
  trips_data['unlock_station_name'] = trips_data['unlock_station_name'].str.replace('\d+\s*\-\s*', '')
  trips_data['lock_station_name'] = trips_data['lock_station_name'].str.replace('\d+\s*\-\s*', '')
  # Eliminamos los registros en los que el formato de fecha se encuentre mal introducido
  trips_data['unlock_date'] = pd.to_datetime(trips_data['unlock_date'], format='%Y-%m-%dT%H:%M:%S', errors='coerce')
  trips_data = trips_data.dropna(subset=['unlock_date'])
  # Devolvemos la tabla de datos
  return trips_data
  '''