In [5]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from scipy import stats

# ===============================
# 1. Cargar datos
# ===============================
# Asegúrate de que findata.csv esté en la misma carpeta donde corres el script
df = pd.read_csv("findata.csv")

print("Shape original:", df.shape)

# ===============================
# 2. Limpieza
# ===============================

# 2.1 Eliminar duplicados
df = df.drop_duplicates()
print("Shape después de eliminar duplicados:", df.shape)

# 2.2 Detección de outliers numéricos con z-score
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

z_scores = np.abs(stats.zscore(df[numeric_cols], nan_policy='omit'))


z_df = pd.DataFrame(z_scores, columns=numeric_cols, index=df.index)


outlier_threshold = 3


df["tiene_outlier"] = (z_df > outlier_threshold).any(axis=1)



print("Outliers detectados:", df["tiene_outlier"].sum())

# ===============================
# 3. Tratamiento de valores faltantes + Codificación
# ===============================


numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

if "tiene_outlier" in numeric_cols:
    numeric_cols.remove("tiene_outlier")

categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()


numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])


categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Desconocido")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ],
    remainder="drop"  
)

# ===============================
# 4. Aplicar el preprocesamiento
# ===============================

cols_for_model = numeric_cols + categorical_cols
X = df[cols_for_model].copy()

X_processed = preprocessor.fit_transform(X)


ohe = preprocessor.named_transformers_["cat"]["onehot"]
ohe_feature_names = ohe.get_feature_names_out(categorical_cols)

processed_col_names = numeric_cols + list(ohe_feature_names)

X_processed_df = pd.DataFrame(X_processed, columns=processed_col_names)

print("Shape final preprocesado:", X_processed_df.shape)
X_processed_df


Shape original: (10000, 98)
Shape después de eliminar duplicados: (10000, 98)
Outliers detectados: 2727
Shape final preprocesado: (10000, 248)


Unnamed: 0,Female,RaceWhite,College4Degree,NumFinDepChildren,FourMoreFinDepChild,Income150K,IncomeApprox,Military,SpouseMilitary,KnowledgeableBin,...,QuestionM10_*False*,QuestionM10_Desconocido,QuestionM10_Don't Know,QuestionM10_True,QuestionM31_*At least 2 yrs and less than 5 yrs*,QuestionM31_At least 10 yrs,QuestionM31_At least 5 yrs and less than 10 yrs,QuestionM31_Desconocido,QuestionM31_Don't know,QuestionM31_Less than 2 yrs
0,1.0,0.0,0.0,2.0,0.0,0.0,125000.0,1.0,1.000000,1.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,42500.0,0.0,0.000000,1.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,0.0,87500.0,0.0,0.000000,1.000000,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,62500.0,0.0,0.000000,1.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,42500.0,0.0,0.000000,1.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.0,1.0,0.0,0.0,0.0,0.0,62500.0,0.0,1.000000,1.000000,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,1.0,0.0,0.0,0.0,0.0,20000.0,1.0,0.179227,0.904307,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,42500.0,0.0,0.179227,0.904307,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9998,1.0,0.0,0.0,0.0,0.0,0.0,7500.0,0.0,0.179227,0.904307,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
n_clusters = 4
random_state = 42
kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
cluster = kmeans.fit_predict(X_processed_df)
X_processed_df['cluster']  = cluster

In [19]:
X_processed_df.to_csv("clientes_segmentados.csv")