# Exploration Train

In [1]:
from pathlib import Path

import pandas as pd

# Load the raw training interactions from train.csv.
train_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/raw/train.csv'
train_data = pd.read_csv(train_path)

# Initial exploration.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line to
# the cell output.
train_data.info()  # Column dtypes and memory usage
print(train_data.head())  # First rows
print(train_data.isnull().sum())  # Null count per column


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46551445 entries, 0 to 46551444
Data columns (total 9 columns):
 #   Column           Dtype  
---  ------           -----  
 0   session_id       int64  
 1   date             object 
 2   timestamp_local  object 
 3   add_to_cart      int64  
 4   user_id          float64
 5   country          int64  
 6   partnumber       int64  
 7   device_type      int64  
 8   pagetype         float64
dtypes: float64(2), int64(5), object(2)
memory usage: 3.1+ GB
None
   session_id        date          timestamp_local  add_to_cart   user_id  \
0          64  2024-06-06  2024-06-06 16:43:17.389            0       NaN   
1         117  2024-06-08  2024-06-08 15:11:02.782            0       NaN   
2         117  2024-06-08  2024-06-08 15:11:44.797            0       NaN   
3         579  2024-06-05  2024-06-05 19:24:48.397            0       NaN   
4        1220  2024-06-04  2024-06-04 08:21:13.476            0  480729.0   

   country  partnumber  de


### **Análisis Inicial**
1. **Dimensiones:**
   - **Filas:** 46,551,445.
   - **Columnas:** 9.

2. **Columnas Clave:**
   - **`add_to_cart`:** Indicador binario (0/1) sobre si el producto fue añadido al carrito.
   - **`user_id`:** Tiene **39,694,715 valores nulos (85.3%)**, lo que corresponde a usuarios no logueados.
   - **`pagetype`:** Tiene **1,197 valores nulos**, lo que es insignificante comparado con el tamaño del dataset.
   - **`date` y `timestamp_local`:** Ambas están en formato `object`; deberían convertirse a `datetime`.
   - **Otras columnas (`session_id`, `country`, `partnumber`, `device_type`):** Completas, pero sus tipos pueden ser optimizados.

3. **Uso de Memoria:**
   - Actual: **3.1+ GB**, lo cual es elevado, pero puede reducirse significativamente con optimizaciones.


## 1. Manejo de valores nulos

In [2]:
# Replace missing user_id / pagetype values with the sentinel -1 and narrow
# both columns to compact integer dtypes (int32 and int16 respectively).
#
# A float -> integer astype may not raise when a value is out of range for the
# target dtype or has a fractional part; the stored result would simply be
# wrong. Each converted column is therefore compared against its filled source
# and the cell fails loudly instead of silently corrupting identifiers.
for column, target_dtype in (('user_id', 'int32'), ('pagetype', 'int16')):
    filled = train_data[column].fillna(-1)
    narrowed = filled.astype(target_dtype)
    if not narrowed.eq(filled).all():
        raise ValueError(
            f"Column '{column}' holds values that do not survive the cast to {target_dtype}"
        )
    train_data[column] = narrowed


## 2. Transformación de variables temporales

In [3]:
# Parse the two temporal columns (date and timestamp_local) into pandas
# datetimes, overwriting each column in place.
for temporal_column in ('date', 'timestamp_local'):
    train_data[temporal_column] = pd.to_datetime(train_data[temporal_column])


## 3. Optimización de tipos de datos

In [4]:
# Narrow the remaining integer columns to smaller dtypes to cut memory usage.
compact_dtypes = {
    'session_id': 'int32',
    'country': 'int16',
    'partnumber': 'int32',
    'device_type': 'int8',
    'add_to_cart': 'int8',
}
for column, target_dtype in compact_dtypes.items():
    narrowed = train_data[column].astype(target_dtype)
    # An integer -> smaller-integer astype can wrap out-of-range values around
    # without raising. Comparing the narrowed column with the original catches
    # any value that did not survive the cast before the column is overwritten.
    if not narrowed.eq(train_data[column]).all():
        raise ValueError(
            f"Column '{column}' holds values that do not fit in {target_dtype}"
        )
    train_data[column] = narrowed

# Confirm the changes. info() prints its own report and returns None, so it is
# not wrapped in print() (that added a stray "None" line to the output).
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46551445 entries, 0 to 46551444
Data columns (total 9 columns):
 #   Column           Dtype         
---  ------           -----         
 0   session_id       int32         
 1   date             datetime64[ns]
 2   timestamp_local  datetime64[ns]
 3   add_to_cart      int8          
 4   user_id          int32         
 5   country          int16         
 6   partnumber       int32         
 7   device_type      int8          
 8   pagetype         int16         
dtypes: datetime64[ns](2), int16(2), int32(3), int8(2)
memory usage: 1.5 GB
None


## 4. Guardado del dataset optimizado

In [5]:
# Persist the optimized frame twice: as CSV and as a pickle.
csv_path = Path('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.csv')
pickle_path = Path('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl')

# Create the output directory (and any missing parents) up front so the save
# does not fail on a fresh checkout where it has not been created yet.
for target in (csv_path, pickle_path):
    target.parent.mkdir(parents=True, exist_ok=True)

train_data.to_csv(csv_path, index=False)
train_data.to_pickle(pickle_path)
