# 01 - Curación y Preparación de Datos
## Proyecto Final: Análisis de Datos
### Objetivo: Verificar y limpiar la estructura de los datos


In [11]:
# Dependencies for the data-curation notebook: pandas for the tabular work and
# os for building file paths.
# NOTE(review): `np` and `datetime` are not referenced by the cells visible
# here — confirm whether they are still needed before removing them.
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Confirmation message so the reader can see that the import cell ran.
print("Librerías importadas correctamente")

Librerías importadas correctamente


In [12]:
# Load the raw client and project tables from the shared data directory.
# `data_path` stays a module-level name because the export cell reuses it.
data_path = "../data/"
clients, projects = (
    pd.read_csv(os.path.join(data_path, file_name))
    for file_name in ("clients.csv", "projects.csv")
)

print("Datos cargados correctamente")

Datos cargados correctamente


In [13]:
# First look at the clients table: shape, per-column dtypes, then a preview.
# All text sections go through one print call, one argument per output line group.
print(
    "=== DATASET CLIENTS ===",
    f"Dimensiones: {clients.shape}",
    "\nTipos de datos:",
    clients.dtypes,
    "\nPrimeras filas:",
    sep="\n",
)
# Final top-level expression, so the notebook renders it as a rich table.
clients.head()

=== DATASET CLIENTS ===
Dimensiones: (100, 9)

Tipos de datos:
client_id                    object
industry                     object
size                         object
region                       object
support_contract              int64
tickets_opened_last_year      int64
avg_response_time_hours     float64
satisfaction_score            int64
renewed_contract              int64
dtype: object

Primeras filas:


Unnamed: 0,client_id,industry,size,region,support_contract,tickets_opened_last_year,avg_response_time_hours,satisfaction_score,renewed_contract
0,C1000,Finanzas,Pequeña,Oruro,0,21,33.5,5,0
1,C1001,Otros,Mediana,Santa Cruz,1,21,25.8,3,1
2,C1002,Educación,Pequeña,La Paz,1,23,24.6,2,1
3,C1003,Gobierno,Mediana,Oruro,1,16,17.0,1,1
4,C1004,Retail,Grande,Oruro,0,26,18.4,5,1


In [14]:
# First look at the projects table: shape, per-column dtypes, then a preview.
# All text sections go through one print call, one argument per output line group.
print(
    "\n=== DATASET PROJECTS ===",
    f"Dimensiones: {projects.shape}",
    "\nTipos de datos:",
    projects.dtypes,
    "\nPrimeras filas:",
    sep="\n",
)
# Final top-level expression, so the notebook renders it as a rich table.
projects.head()


=== DATASET PROJECTS ===
Dimensiones: (200, 10)

Tipos de datos:
project_id           object
client_id            object
start_date           object
planned_end_date     object
actual_end_date      object
budget_usd          float64
dev_team_size         int64
complexity           object
status               object
final_cost_usd      float64
dtype: object

Primeras filas:


Unnamed: 0,project_id,client_id,start_date,planned_end_date,actual_end_date,budget_usd,dev_team_size,complexity,status,final_cost_usd
0,P2000,C1050,2024-01-15,2024-04-03,2024-03-27,27811.31,4,Media,On-time,27496.8
1,P2001,C1057,2023-11-08,2024-04-16,2024-08-07,15455.14,5,Media,Delayed,17813.71
2,P2002,C1004,2024-10-01,2025-01-31,2025-01-25,26634.78,11,Baja,On-time,27121.7
3,P2003,C1004,2023-11-01,2023-12-24,2023-12-24,24774.24,10,Media,On-time,26293.34
4,P2004,C1093,2023-12-04,2024-03-02,2024-03-05,585.06,7,Baja,On-time,582.31


In [15]:
# Count missing values per column in both tables.
print("\n=== VALORES NULOS ===")
# Each label and its per-column counts are emitted together, newline-separated.
print("Clients:", clients.isnull().sum(), sep="\n")
print("\nProjects:", projects.isnull().sum(), sep="\n")


=== VALORES NULOS ===
Clients:
client_id                   0
industry                    0
size                        0
region                      0
support_contract            0
tickets_opened_last_year    0
avg_response_time_hours     0
satisfaction_score          0
renewed_contract            0
dtype: int64

Projects:
project_id          0
client_id           0
start_date          0
planned_end_date    0
actual_end_date     0
budget_usd          0
dev_team_size       0
complexity          0
status              0
final_cost_usd      0
dtype: int64


In [16]:
# Parse the three project date columns from text into datetimes.
date_cols = ['start_date', 'planned_end_date', 'actual_end_date']
# apply() hands each selected column to pd.to_datetime in turn.
projects[date_cols] = projects[date_cols].apply(pd.to_datetime)

# Show the resulting dtypes to confirm the conversion took effect.
print("Tipos después de conversión:")
print(projects[date_cols].dtypes)

Tipos después de conversión:
start_date          datetime64[ns]
planned_end_date    datetime64[ns]
actual_end_date     datetime64[ns]
dtype: object


In [17]:
# Check date coherence: derive the planned and actual durations, in whole days,
# measured from each project's start date.
projects['duration_planned'] = (projects['planned_end_date'] - projects['start_date']).dt.days
projects['duration_actual'] = (projects['actual_end_date'] - projects['start_date']).dt.days

# A negative actual duration means the project is recorded as ending before it started.
inconsistent_dates = projects[projects['duration_actual'] < 0]
print(f"Proyectos con fechas inconsistentes: {len(inconsistent_dates)}")

if not inconsistent_dates.empty:
    print("Proyectos inconsistentes:")
    # A bare expression nested inside an `if` is never rendered by the notebook
    # (only a cell's final top-level expression is), so print the rows explicitly.
    print(
        inconsistent_dates[
            ['project_id', 'start_date', 'actual_end_date', 'duration_actual']
        ].to_string(index=False)
    )

Proyectos con fechas inconsistentes: 0


In [18]:
# List the distinct values of each categorical column to spot stray labels.
print("=== VALORES ÚNICOS ===")
# (label, table, column) triples, printed in the order they are listed.
for label, frame, column in (
    ("Clients - Industry:", clients, 'industry'),
    ("Clients - Size:", clients, 'size'),
    ("Projects - Complexity:", projects, 'complexity'),
    ("Projects - Status:", projects, 'status'),
):
    print(label, frame[column].unique())

=== VALORES ÚNICOS ===
Clients - Industry: ['Finanzas' 'Otros' 'Educación' 'Gobierno' 'Retail' 'Salud']
Clients - Size: ['Pequeña' 'Mediana' 'Grande']
Projects - Complexity: ['Media' 'Baja' 'Alta']
Projects - Status: ['On-time' 'Delayed']


In [19]:
# Extra cleaning step: report and remove fully duplicated rows.
# The masks flag every repeat of an earlier, identical row (first occurrence kept).
client_dupes = clients.duplicated()
project_dupes = projects.duplicated()
print(f"Clientes duplicados: {client_dupes.sum()}")
print(f"Proyectos duplicados: {project_dupes.sum()}")

# Keep only the rows that were not flagged as duplicates.
clients = clients[~client_dupes]
projects = projects[~project_dupes]
print("Duplicados eliminados (si existían)")

Clientes duplicados: 0
Proyectos duplicados: 0
Duplicados eliminados (si existían)


In [20]:
# Persist the curated tables next to the raw files, without the index column.
clients_curated, projects_curated = clients.copy(), projects.copy()

for curated_frame, file_name in (
    (clients_curated, "clients_curated.csv"),
    (projects_curated, "projects_curated.csv"),
):
    curated_frame.to_csv(os.path.join(data_path, file_name), index=False)

# Final summary with the number of rows written for each table.
print("Curación completada y datasets guardados")
print(f"Clientes: {len(clients_curated)} registros")
print(f"Proyectos: {len(projects_curated)} registros")

Curación completada y datasets guardados
Clientes: 100 registros
Proyectos: 200 registros
