In [None]:
from pathlib import Path
import pandas as pd


src =  "Data/Dataset.csv"
dst =  "Data/Dataset_clean.csv"

df = pd.read_csv(src)
before = len(df)

# Columnas a mantener
cols = [
	"Vehicle Type",
	"Avg VTAT",
	"Avg CTAT",
	"Booking Value",
	"Ride Distance",
	"Driver Ratings",
	"Customer Rating",
	"Payment Method",
]

# Seleccionar solo las columnas solicitadas 
existing_cols = [c for c in cols if c in df.columns]
df_sub = df[existing_cols].copy()

# Eliminar filas con nulos en las columnas retenidas
df_clean = df_sub.dropna(how="any")
after = len(df_clean)

# Transformar la columna categórica 'Payment Method' a numérica 0..N-1
# Mapeo determinista: ordenar categorías y asignar 0..N-1
pm = df_clean['Payment Method'].astype(str)
unique = sorted(pm.unique())
mapping = {cat: i for i, cat in enumerate(unique)}
df_clean['Payment Method'] = pm.map(lambda x: mapping.get(x)).astype(int)

# Guardar mapping para referencia
map_path = 'payment_method_mapping.csv'
pd.Series(mapping).rename('code').to_csv(map_path)

	

df_clean.to_csv(dst, index=False)

print(f"Leído: {src}")
print(f"Filas antes: {before}")
print(f"Filas después (sin nulos): {after}")
print(f"Archivo limpio guardado en: {dst}")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Payment Method'] = pm.map(lambda x: mapping.get(x)).astype(int)


Leído: Data/Dataset.csv
Filas antes: 150000
Filas después (sin nulos): 93000
Archivo limpio guardado en: Data/Dataset_clean.csv


In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split


# === Análisis inicial del dataset ===
def analyze_dataset(df: pd.DataFrame) -> None:
    print('Dataset shape:', df.shape)
    print('Columns:', list(df.columns))
    print('Dtypes:\n', df.dtypes)
    print('Head:')
    print(df.head(5))


# === División en variables X / y ===
def split_features(df: pd.DataFrame):
    label_col = 'Vehicle Type'
    y = df[label_col].copy()
    X = df.drop(columns=[label_col]).select_dtypes(include=[np.number]).copy()
    return X, y


# === Configuraciones de KMeans ===
def build_kmeans_random_configs():
    return [
        {'algo': 'kmeans', 'name': f'kmeans_random_k{k}', 'n_clusters': k, 'init': 'random', 'n_init': 10, 'random_state': 42}
        for k in [2, 3, 4, 5]
    ]


def build_kmeanspp_configs():
    return [
        {'algo': 'kmeans++', 'name': f'kmeanspp_k{k}', 'n_clusters': k, 'init': 'k-means++', 'n_init': 10, 'random_state': 42}
        for k in [2, 3, 4, 5]
    ]


# === Configuraciones de MeanShift ===
def build_meanshift_configs(X_train: np.ndarray):
    sample_size = min(5000, X_train.shape[0])
    quantiles = [0.1, 0.2, 0.3]
    configs = [
        {'algo': 'meanshift', 'name': f'ms_bw_q{q}', 'bandwidth': float(estimate_bandwidth(X_train, quantile=q, n_samples=sample_size))}
        for q in quantiles
    ]
    bws = [c['bandwidth'] for c in configs]
    med = float(np.median(bws))
    configs.append({'algo': 'meanshift', 'name': 'ms_bw_med_scaled', 'bandwidth': med * 1.5})
    return configs[:4]

# === Evaluación de configuración (versión corregida) ===
def eval_config(X_train: np.ndarray, cfg: dict):
    if cfg['algo'] in ('kmeans', 'kmeans++'):
        model = KMeans(
            n_clusters=cfg['n_clusters'],
            init=cfg['init'],
            n_init=10,
            max_iter=300,
            tol=1e-4,
            random_state=42
        )
    else:
        model = MeanShift(bandwidth=cfg['bandwidth'], bin_seeding=True, n_jobs=-1)

    labels = model.fit_predict(X_train)
    n_clusters = len(np.unique(labels))

    # Caso 1 cluster
    if n_clusters < 2:
        score = -1
    else:
        score = silhouette_score(X_train, labels)

    return score, cfg | {'model': model}, labels


# === Mapeo de etiquetas dominantes ===
def dominant_label_map(train_labels: np.ndarray, y_train: pd.Series):
    mapping = {}
    y_vals = y_train.reset_index(drop=True)
    for c in np.unique(train_labels):
        vals = y_vals[train_labels == c]
        if len(vals) > 0:
            mapping[int(c)] = vals.mode(dropna=True).iloc[0]
    return mapping


# === Ejecución ===
path = "Data/Dataset_clean.csv"

df = pd.read_csv(path)
print('== Análisis del dataset ==')
analyze_dataset(df)

X_df, y = split_features(df)
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_df.values)
X_test = scaler.transform(X_test_df.values)

# Configuraciones (4 por algoritmo)
km_random_cfgs = build_kmeans_random_configs()
kmpp_cfgs = build_kmeanspp_configs()
ms_cfgs = build_meanshift_configs(X_train)
all_cfgs = km_random_cfgs + kmpp_cfgs + ms_cfgs

# Evaluación de configuraciones
evaluated = [eval_config(X_train, cfg) for cfg in all_cfgs]
evaluated.sort(key=lambda t: t[0], reverse=True)

print('\n== Resultados (Silhouette en train) ==')
for score, cfg, labels in evaluated:
    if cfg['algo'] in ('kmeans', 'kmeans++'):
        print(f"{cfg['name']}: {cfg['algo']} k={cfg['n_clusters']}, silhouette={score:.6f}, clusters={len(np.unique(labels))}")
    else:
        print(f"{cfg['name']}: meanshift bw={cfg['bandwidth']:.3f}, silhouette={score:.6f}, clusters={len(np.unique(labels))}")

# Top 3
top3 = evaluated[:3]
print('\n== Top 3 seleccionadas ==')
for i, (score, cfg, labels) in enumerate(top3, 1):
    extra = f"k={cfg['n_clusters']}" if cfg['algo'] in ('kmeans', 'kmeans++') else f"bw={cfg['bandwidth']:.3f}"
    print(f"{i}. {cfg['name']} ({cfg['algo']}, {extra}) -> silhouette={score:.6f}, clusters={len(np.unique(labels))}")

# === Evaluación en test ===
os.makedirs("results", exist_ok=True)
summary_path = "results/cluster_eval_summary.txt"

with open(summary_path, 'w') as f:
    f.write('Resultados de las 12 configuraciones (ordenadas por silhouette en train)\n')
    for score, cfg, labels in evaluated:
        extra = f"k={cfg['n_clusters']}" if cfg['algo'] in ('kmeans', 'kmeans++') else f"bw={cfg['bandwidth']:.3f}"
        f.write(f"{cfg['name']}: {cfg['algo']} {extra}, silhouette={score:.6f}, clusters={len(np.unique(labels))}\n")

    f.write('\nTop 3 globales:\n')
    for i, (score, cfg, labels) in enumerate(top3, 1):
        extra = f"k={cfg['n_clusters']}" if cfg['algo'] in ('kmeans', 'kmeans++') else f"bw={cfg['bandwidth']:.3f}"
        f.write(f"{i}. {cfg['name']} ({cfg['algo']}, {extra}) -> silhouette={score:.6f}, clusters={len(np.unique(labels))}\n")

# === Comparación de etiquetas en test ===
for score, cfg, train_labels in top3:
    model = cfg['model']
    dom_map = dominant_label_map(train_labels, y_train)
    test_clusters = model.predict(X_test)

    rows = []
    matches = 0
    total = len(X_test_df)

    for idx, c in zip(X_test_df.index, test_clusters):
        dominant = dom_map.get(int(c))
        real_y = y_test.loc[idx]
        match = (real_y == dominant)
        if match:
            matches += 1
        rows.append({
            'index': idx,
            'pred_cluster': int(c),
            'dominant_train_label_for_cluster': dominant,
            'real_Y': real_y,
            'match': match,
        })

    acc = matches / total
    comp_df = pd.DataFrame(rows).set_index('index')
    comp_path = f"results/{cfg['name']}_test_comparison.csv"
    comp_df.to_csv(comp_path)

    with open(summary_path, 'a') as f:
        f.write(f"\n=== {cfg['name']} ({cfg['algo']}) ===\n")
        extra = f"k={cfg['n_clusters']}" if cfg['algo'] in ('kmeans', 'kmeans++') else f"bandwidth={cfg['bandwidth']:.3f}"
        f.write(f"{extra}\nSilhouette train={score:.6f}\n")
        f.write(f"Dominant-label match accuracy on test={acc:.4f}\n")
        f.write(f"Detalle: {os.path.basename(comp_path)}\n")

print(f"\nResumen guardado en: {summary_path}")
print("Archivos CSV de comparación guardados en: results/")


== Análisis del dataset ==
Dataset shape: (93000, 8)
Columns: ['Vehicle Type', 'Avg VTAT', 'Avg CTAT', 'Booking Value', 'Ride Distance', 'Driver Ratings', 'Customer Rating', 'Payment Method']
Dtypes:
 Vehicle Type        object
Avg VTAT           float64
Avg CTAT           float64
Booking Value      float64
Ride Distance      float64
Driver Ratings     float64
Customer Rating    float64
Payment Method       int64
dtype: object
Head:
    Vehicle Type  Avg VTAT  Avg CTAT  Booking Value  Ride Distance  \
0           Auto      13.4      25.8          627.0          13.58   
1  Premier Sedan      13.1      28.5          416.0          34.02   
2           Bike       5.3      19.6          737.0          48.21   
3           Auto       5.1      18.1          316.0           4.85   
4        Go Mini       7.1      20.4          640.0          41.24   

   Driver Ratings  Customer Rating  Payment Method  
0             4.9              4.9               2  
1             4.6              5.0  

In [6]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split


# === Carga de datos ===
def load_data(path, max_rows=None):
    df = pd.read_csv(path)
    if max_rows:
        df = df.head(max_rows)
    return df


# === Preprocesamiento ===
def preprocess(df):
    df = df.dropna()

    # Etiqueta: valor alto de reserva (mayor que la mediana)
    df['Booking Value'] = pd.to_numeric(df['Booking Value'], errors='coerce')
    median_val = df['Booking Value'].median()
    y = (df['Booking Value'] > median_val).astype(int)

    num_cols = [
        'Avg VTAT', 'Avg CTAT', 'Booking Value',
        'Ride Distance', 'Driver Ratings',
        'Customer Rating', 'Payment Method'
    ]
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors='coerce')

    X_num = df[num_cols].fillna(0)

    # Codificación de la variable categórica
    X_cat = pd.get_dummies(df['Vehicle Type'].fillna('missing'), drop_first=True)

    X_final = pd.concat([X_num, X_cat], axis=1)
    return X_final.values, y.values


# === Creación del modelo ===
def make_model(config):
    loss = 'log_loss' if config['technique'] == 'logistic' else 'hinge'
    return SGDClassifier(
        loss=loss,
        learning_rate='constant',
        eta0=config['eta0'],
        alpha=config['alpha'],
        max_iter=1,
        tol=None,
        random_state=config['random_state'],
        warm_start=True
    )


# === Evaluación ===
def evaluate_model(model, scaler, X_test, y_test):
    Xs = scaler.transform(X_test)
    preds = model.predict(Xs)
    return {
        'accuracy': accuracy_score(y_test, preds),
        'precision': precision_score(y_test, preds, zero_division=0),
        'recall': recall_score(y_test, preds, zero_division=0),
        'f1': f1_score(y_test, preds, zero_division=0)
    }


# === Entrenamiento con eliminación progresiva ===
def run_training(configs, X_train, y_train, X_val, y_val):
    active = {c['id']: {'config': c, 'history': []} for c in configs}
    scaler = StandardScaler().fit(X_train)
    eval_interval = 5
    max_epochs = max(c['max_epochs'] for c in configs)

    models = {c['id']: make_model(c) for c in configs}

    # Inicializar warm start
    for cid, model in models.items():
        model.partial_fit(scaler.transform(X_train[:2]), y_train[:2], classes=np.array([0, 1]))

    for epoch in range(1, max_epochs + 1):
        for cid, entry in active.items():
            cfg = entry['config']
            model = models[cid]
            n = X_train.shape[0]
            idx = np.random.permutation(n)
            for start in range(0, n, cfg['batch_size']):
                b = idx[start:start + cfg['batch_size']]
                model.partial_fit(scaler.transform(X_train[b]), y_train[b])

        if epoch % eval_interval == 0:
            scores = []
            for cid in list(active.keys()):
                model = models[cid]
                acc = accuracy_score(y_train, model.predict(scaler.transform(X_train)))
                active[cid]['history'].append((epoch, acc))
                scores.append((cid, acc))

            # Eliminar la peor configuración si hay más de 2
            if len(active) > 2:
                worst = min(scores, key=lambda x: x[1])[0]
                del active[worst]
                print(f"Epoch {epoch}: Eliminada la peor config {worst}")

    results = {}
    for cid, entry in active.items():
        model = models[cid]
        results[cid] = {
            'config': entry['config'],
            'train_history': entry['history'],
            'test_metrics': evaluate_model(model, scaler, X_val, y_val)
        }

    return results


# === Ejecución ===
cfg_path = "configs.json"
data_path = "Data/Dataset_clean.csv"

# Cargar configuración y datos
with open(cfg_path, "r") as f:
    configs = json.load(f)['configs']

df = load_data(data_path)
X, y = preprocess(df)

# División de datos 80/20
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

# Entrenamiento
results = run_training(configs, X_train, y_train, X_test, y_test)

# Resultados finales
print('\n=== Resultados finales ===')
for cid, res in results.items():
    print(f"\nConfig {cid}:")
    print(json.dumps(res['test_metrics'], indent=2))


Epoch 5: Eliminada la peor config log_2
Epoch 10: Eliminada la peor config svm_1
Epoch 15: Eliminada la peor config svm_2
Epoch 20: Eliminada la peor config svm_3

=== Resultados finales ===

Config log_1:
{
  "accuracy": 0.998978494623656,
  "precision": 0.9988152934841141,
  "recall": 0.9991381167851756,
  "f1": 0.998976679054236
}

Config log_3:
{
  "accuracy": 0.9986021505376345,
  "precision": 0.9988143996550981,
  "recall": 0.9983839689722043,
  "f1": 0.9985991379310345
}
