In [None]:
import pandas as pd

# Load the dataset from 'lyft.csv' (path is relative to the working directory).
df = pd.read_csv('lyft.csv')

In [None]:
# Preview the first rows, transposed so that every column is shown as a row.
df.head().transpose()

In [None]:
# Print the dtype and non-null count of every column.
df.info()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Missing values per column, largest count first.
df.isnull().sum().sort_values(ascending=False)

In [None]:
import missingno as msn
import matplotlib.pyplot as plt

# Draw missingno's nullity matrix for the whole frame and render it.
msn.matrix(df)
plt.show()

In [None]:
# Categorical columns whose most frequent values we want to inspect.
cat_cols = [
    'source',
    'destination',
    'cab_type',
    'name',
    'short_summary',
    'weekday',
]

# For each column print a header followed by its 10 most common values.
for col in cat_cols:
    print(f"\n--- {col} ---")
    frequencies = df[col].value_counts()
    print(frequencies.iloc[:10])

In [None]:
# Numeric columns to summarise; 'uvIndex' and 'visibility.1' are left out
# (both are also dropped in the preprocessing section further down).
numCols = df.select_dtypes(include="number").columns.tolist()
for excluded in ('uvIndex', 'visibility.1'):
    numCols.remove(excluded)

# describe() statistics (one row per column) extended with the median,
# the inter-quartile range, skewness and kurtosis.
numeric = df[numCols]
matriz = numeric.describe().T.assign(
    median=numeric.median(),
    IQR=lambda stats: stats['75%'] - stats['25%'],
    skew=numeric.skew(),
    kurtosis=numeric.kurtosis(),
)
matriz

In [None]:
numCols = df.select_dtypes(include='number').columns.tolist()

# Count, per numeric column, the values lying more than 1.5 * IQR
# below the first quartile or above the third quartile.
for col in numCols:
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    # Comparisons with NaN are False, so missing values are never counted.
    is_outlier = (df[col] < lower) | (df[col] > upper)
    print(f"num de outliers de {col}: {is_outlier.sum()}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

numCols = df.select_dtypes(include="number").columns.tolist()

# One horizontal boxplot per numeric column, each on its own figure.
for col in numCols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Boxplot de {col}")
    ax.set_xlabel(col)
    plt.show()

In [None]:
# NOTE: disabled cell, kept for reference only. When uncommented it draws a
# histogram with a KDE overlay for every numeric column of df.
# import seaborn as sns
# import matplotlib.pyplot as plt

# numCols = df.select_dtypes(include="number").columns.tolist()

# for col in numCols:
#     plt.figure(figsize=(6,3))
#     sns.histplot(df[col].dropna(), kde=True,bins=30, color="red")
#     plt.title(f"Histograma de {col}")
#     plt.xlabel(col)
#     plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of the surge multiplier with the mean, the mean ± 1 std band
# and both quartiles marked as dashed vertical lines.
col = 'surge_multiplier'
serie = df[col]
mean, std = serie.mean(), serie.std()
Q1, Q3 = serie.quantile(0.25), serie.quantile(0.75)

sns.histplot(serie.dropna(), kde=True, bins=30, color="red")
plt.title(f"Asimetria de {col}")
plt.xlabel(col)

# (position, colour, legend text) of every reference line, in drawing order.
reference_lines = [
    (mean, 'blue', f"Media: {mean:.2f}"),
    (mean - std, 'green', f"Media - std: {mean - std:.2f}"),
    (mean + std, 'green', f"Media + std: {mean + std:.2f}"),
    (Q1, 'orange', f"Q1: {Q1:.2f}"),
    (Q3, 'orange', f"Q3: {Q3:.2f}"),
]
for position, colour, legend_text in reference_lines:
    plt.axvline(position, color=colour, linestyle='--', label=legend_text)

plt.legend()
plt.show()

LIMPIEZA Y PREPROCESAMIENTO

In [None]:
# Report how many fully duplicated rows exist before removing them.
# (A bare expression in the middle of a cell is never displayed, so the
# count has to be printed explicitly.)
n_duplicates = df.duplicated().sum()
print(f"Filas duplicadas: {n_duplicates}")

# ignore_index=True renumbers the remaining rows 0..n-1. Without it the index
# keeps gaps where rows were removed, and a later column-wise pd.concat with a
# freshly built (0..n-1 indexed) frame would align on labels and misplace rows.
df = df.drop_duplicates(ignore_index=True)

In [None]:
# Columns removed from the frame before encoding and modelling.
colDrop = [
    'id',
    'visibility.1',
    'uvIndex',
    'windBearing',
    'moonPhase',
    'cab_type',
]
df = df.drop(colDrop, axis=1)

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Encode the weekday as an ordinal number following the listed order
# (Mon -> 0, ..., Sun -> 6).
ordEnc = OrdinalEncoder(categories=[['Mon','Tue','Wed','Thu','Fri','Sat','Sun']])
df['weekday_encoded'] = ordEnc.fit_transform(df[['weekday']])

# One-hot encode the nominal columns; drop='first' omits the first level of
# each column from the output.
nominalCols = ['source', 'destination', 'name']
ohe = OneHotEncoder(sparse_output=False, drop='first')
oheCols = ohe.fit_transform(df[nominalCols])

# index=df.index is essential: the encoder returns a bare array, and a frame
# built from it would otherwise get a fresh 0..n-1 index. pd.concat(axis=1)
# aligns on index labels, so if df's index has gaps (e.g. after
# drop_duplicates) the rows would be mismatched and NaN-padded rows appear.
ohe_df = pd.DataFrame(
    oheCols,
    columns=ohe.get_feature_names_out(nominalCols),
    index=df.index,
)

# Replace the raw categorical columns with their encoded counterparts.
df = pd.concat([df.drop(columns=['weekday'] + nominalCols), ohe_df], axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric features that are standardised with StandardScaler.
numCols = [
    'hour', 'day', 'month', 'distance',
    'surge_multiplier', 'temperature', 'apparentTemperature', 'precipIntensity',
    'precipProbability', 'humidity', 'windSpeed', 'visibility',
    'temperatureHigh', 'temperatureLow', 'dewPoint', 'cloudCover',
]

# NOTE(review): the scaler is fitted on the complete frame, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
scaler = StandardScaler().fit(df[numCols])
df[numCols] = scaler.transform(df[numCols])


In [None]:
from sklearn.decomposition import PCA

# Project the numeric features in numCols onto 10 principal components.
pca = PCA(n_components=10, random_state=42)
X_pca = pca.fit_transform(df[numCols])

# Cumulative share of variance explained as components are added; used to
# judge how many components are worth keeping.
explainer_var = pca.explained_variance_ratio_
plt.plot(range(1,len(explainer_var)+1), explainer_var.cumsum(), marker="o")
plt.title("Varianza explicada acumulada (PCA)")
plt.xlabel("Número de componentes")
plt.ylabel("Varianza acumulada")  # label previously misspelled as "Varainza"
plt.grid(True)
plt.show()

In [None]:
# Wrap the PCA output in a frame whose columns are named PC1, PC2, ...
n_components = X_pca.shape[1]
pca_cols = ["PC" + str(number) for number in range(1, n_components + 1)]
df_pca = pd.DataFrame(data=X_pca, columns=pca_cols)


In [None]:
# Modelling matrix: PCA components + one-hot columns + ordinal weekday.
# Every piece gets a fresh 0..n-1 index first so that the column-wise
# concatenation pairs rows by position.
parts = (df_pca, ohe_df, df[['weekday_encoded']])
df_model = pd.concat(
    [part.reset_index(drop=True) for part in parts],
    axis=1,
)

In [None]:
# Regression target.
y = df['price']

# Feature matrix. Use a separate frame instead of aliasing df_model: df_model
# is modified in place later on (a 'cluster' column is added), and with a plain
# alias those labels would silently become model inputs whenever this cell is
# re-run. errors='ignore' keeps the cell valid before that column exists.
X = df_model.drop(columns=['cluster'], errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split

# 80 % of the rows for training, the rest for testing; fixed seed for a
# reproducible shuffle.
split = train_test_split(X, y, train_size=0.8, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree regressor with default hyper-parameters (fixed seed)
# and predict on the held-out rows. fit() returns the estimator itself.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Test-set error metrics for the current predictions in y_pred.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

for label, value in (("MSE", mse), ("MAE", mae), ("R2", r2)):
    print(f"{label}: {value:.2f}")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Importance of each input column as reported by the fitted decision tree.
importances = dt_model.feature_importances_

# Pair every importance with its column name and sort from most to least
# important.
feature_importance_df = (
    pd.DataFrame({'feature': X_train.columns, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart; the y axis is inverted so the top-ranked feature is
# drawn at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    feature_importance_df['feature'],
    feature_importance_df['importance'],
    color='skyblue',
)
ax.invert_yaxis()
ax.set_xlabel("Importancia")
ax.set_title("Importancia de las características en Decision Tree")
plt.show()


In [None]:
# Clustering input: an independent frame without any previously stored
# 'cluster' label. A plain alias of df_model would gain the 'cluster' column as
# soon as the labels are written back, so re-running the clustering cells would
# cluster on their own earlier output.
X_cluster = df_model.drop(columns=['cluster'], errors='ignore')

In [None]:
from sklearn.cluster import KMeans

# Elbow rule: inertia of K-Means fitted with 1 to 10 clusters.
K = range(1, 11)
inertia = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_cluster).inertia_
    for k in K
]

plt.plot(K, inertia, marker='o')
plt.title('Regla del codo')
plt.xlabel('Número de clusters')
plt.ylabel('Inercia')
plt.grid(True)
plt.show()

In [None]:
# Final K-Means with 3 clusters; labels_ holds the cluster assigned to each
# row the model was fitted on.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X_cluster)
clusters = kmeans.labels_

# Store the labels next to the features in df_model.
df_model['cluster'] = clusters


In [None]:
# Mean of every column of df_model within each cluster.
df_model.groupby('cluster').mean()


In [None]:
# -----------------------------
# 1️⃣ Clustering: K-Means + Regla del codo
# -----------------------------
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Cluster on every column of df_model except a previously stored 'cluster'
# label (errors='ignore' covers the case where it does not exist yet).
X_cluster = df_model.drop(columns=['cluster'], errors='ignore')

# Elbow rule: record the inertia for k = 1..10.
K = range(1, 11)
inertia = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_cluster)
    inertia.append(kmeans.inertia_)

fig, ax = plt.subplots()
ax.plot(K, inertia, marker='o')
ax.set_xlabel('Número de clusters')
ax.set_ylabel('Inercia')
ax.set_title('Regla del codo')
ax.grid(True)
plt.show()

# Fit K-Means with the chosen number of clusters (k=3 here) and read the label
# assigned to each row.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X_cluster)
clusters = kmeans.labels_

# Write the labels back into df_model.
df_model['cluster'] = clusters

# Per-cluster mean of every column.
print(df_model.groupby('cluster').mean())

# -----------------------------
# 2️⃣ MLPRegressor para predecir price
# -----------------------------
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Target and features; the cluster label is not used as a model input.
y = df['price']
X = df_model.drop(columns=['cluster'], errors='ignore')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

# Multi-layer perceptron with two hidden layers (100 and 50 units);
# fit() returns the estimator itself.
mlp = MLPRegressor(
    hidden_layer_sizes=(100,50),
    max_iter=500,
    random_state=42,
).fit(X_train, y_train)

# Predictions on the held-out rows.
y_pred = mlp.predict(X_test)

# Test-set metrics, printed one per line.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}", f"MAE: {mae:.2f}", f"R2: {r2:.2f}", sep="\n")
