# Exploration test


In [1]:
import pandas as pd

# Load the raw test split from disk.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook runs on other machines.
test_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/raw/test.csv'
test_data = pd.read_csv(test_path)

# Initial exploration.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
test_data.info()
print(test_data.head())  # first rows
print(test_data.isnull().sum())  # null count per column


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29275 entries, 0 to 29274
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   session_id       29275 non-null  int64  
 1   date             29275 non-null  object 
 2   timestamp_local  29275 non-null  object 
 3   user_id          5766 non-null   float64
 4   country          29275 non-null  int64  
 5   partnumber       29275 non-null  int64  
 6   device_type      29275 non-null  int64  
 7   pagetype         29275 non-null  float64
dtypes: float64(2), int64(4), object(2)
memory usage: 1.8+ MB
None
   session_id        date          timestamp_local  user_id  country  \
0         746  2024-06-15  2024-06-15 18:36:47.390      NaN       57   
1         746  2024-06-15  2024-06-15 18:37:04.052      NaN       57   
2         746  2024-06-15  2024-06-15 18:37:48.159      NaN       57   
3         746  2024-06-15  2024-06-15 18:38:19.899      NaN       57   
4    

### **Análisis Inicial**
1. **Dimensiones:**
   - Total de filas: 29,275.
   - Total de columnas: 8.

2. **Columnas:**
   - **`session_id`:** Identificador único de la sesión, no tiene valores nulos.
   - **`date` y `timestamp_local`:** Datos temporales sin valores nulos, aunque están en formato `object`.
   - **`user_id`:** Tiene **23,509 valores nulos (80.3%)**, lo que corresponde a usuarios no logueados según el contexto.
   - **`country`, `partnumber`, `device_type`, `pagetype`:** Completas (sin valores nulos). Las tres primeras son `int64`; `pagetype` está almacenada como `float64`.

3. **Notas clave:**
   - `user_id` necesita ser transformado para identificar claramente a usuarios no logueados.
   - `date` y `timestamp_local` deberían ser convertidas a un formato datetime para facilitar el análisis temporal.
   - Los tipos de datos de `session_id`, `country`, `partnumber`, `device_type` y `pagetype` pueden ser optimizados.


---


### **Plan de Transformación**

1. **Identificar usuarios no logueados:**
   - Reemplazar valores nulos en `user_id` con `-1` para indicar usuarios no logueados.

2. **Transformar variables temporales:**
   - Convertir `date` y `timestamp_local` al tipo `datetime`.

3. **Optimizar tipos de datos:**
   - Reducir `session_id`, `country`, `device_type` y `partnumber` a tipos más pequeños (`int32`, `int16` o `int8`).
   - Cambiar `pagetype` a `int16` si no requiere decimales.



1. Manejo de user_id:

In [2]:
# Rows without a user_id are marked with the sentinel -1; once no nulls remain
# the column is stored as a 32-bit integer.
test_data['user_id'] = (
    test_data['user_id']
    .fillna(value=-1)
    .astype(dtype='int32')
)


2. Transformación de variables temporales:

In [3]:
# Parse both temporal columns (read from the CSV as strings) into datetimes.
for temporal_column in ('date', 'timestamp_local'):
    test_data[temporal_column] = pd.to_datetime(test_data[temporal_column])


3. Optimización de tipos de datos:

In [4]:
# Target dtype for each column that is being downcast.
compact_dtypes = {
    'session_id': 'int32',
    'country': 'int16',
    'partnumber': 'int32',
    'device_type': 'int8',
    'pagetype': 'int16',
}
downcast_columns = test_data[list(compact_dtypes)].astype(compact_dtypes)

# A narrowing astype() can wrap out-of-range integers and drops the fractional
# part of floats without raising, so verify the converted values still equal
# the source values before overwriting anything.
lossy_columns = [
    name for name in compact_dtypes
    if not downcast_columns[name].eq(test_data[name]).all()
]
if lossy_columns:
    raise ValueError(f"Downcasting would alter values in: {lossy_columns}")

for name in compact_dtypes:
    test_data[name] = downcast_columns[name]

# Confirm the changes. DataFrame.info() prints its own report and returns None,
# so it is not wrapped in print() (that added a stray "None" to the output).
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29275 entries, 0 to 29274
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   session_id       29275 non-null  int32         
 1   date             29275 non-null  datetime64[ns]
 2   timestamp_local  29275 non-null  datetime64[ns]
 3   user_id          29275 non-null  int32         
 4   country          29275 non-null  int16         
 5   partnumber       29275 non-null  int32         
 6   device_type      29275 non-null  int8          
 7   pagetype         29275 non-null  int16         
dtypes: datetime64[ns](2), int16(2), int32(3), int8(1)
memory usage: 943.6 KB
None


4. Guardado del dataset optimizado

In [5]:
# Write the processed frame both as CSV (index omitted) and as a pickle file.
csv_output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.csv'
pickle_output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl'

test_data.to_csv(csv_output_path, index=False)
test_data.to_pickle(pickle_output_path)

---

In [5]:
import pandas as pd

# Reload the processed dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
processed_pickle_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl'
test_data = pd.read_pickle(processed_pickle_path)

# Sanity check that the load worked: print the first 20 rows.
first_rows = test_data.head(20)
print(first_rows)

    session_id       date         timestamp_local  user_id  country  \
0          746 2024-06-15 2024-06-15 18:36:47.390       -1       57   
1          746 2024-06-15 2024-06-15 18:37:04.052       -1       57   
2          746 2024-06-15 2024-06-15 18:37:48.159       -1       57   
3          746 2024-06-15 2024-06-15 18:38:19.899       -1       57   
4          746 2024-06-15 2024-06-15 18:38:46.492       -1       57   
5          746 2024-06-15 2024-06-15 18:38:56.264       -1       57   
6          746 2024-06-15 2024-06-15 18:39:15.605       -1       57   
7         1306 2024-06-15 2024-06-15 13:15:55.471       -1       34   
8         1364 2024-06-15 2024-06-15 00:02:16.930       -1       34   
9         1377 2024-06-15 2024-06-15 19:08:13.376   259465       25   
10        1377 2024-06-15 2024-06-15 19:08:17.288   259465       25   
11        1377 2024-06-15 2024-06-15 19:08:18.078   259465       25   
12        1377 2024-06-15 2024-06-15 19:09:46.989   259465       25   
13    