In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Notebook-wide plotting defaults: 'husl' colour palette for seaborn and a
# larger default figure size for matplotlib.
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Work on a copy so the exploration below operates on its own frame.
# NOTE(review): `toyota_cut` is not defined in any visible cell of this notebook —
# confirm it is loaded before this cell, otherwise a fresh-kernel run fails here.
df = toyota_cut.copy()

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# info() writes its report directly to stdout, so it is not wrapped in print().
df.info()
print("\nBasic Statistics:")
# Last expression of the cell: the notebook renders the summary table.
df.describe()

## 1. Histograms for all variables

In [None]:
# One histogram (with KDE overlay) per numeric column, laid out on a 3-wide grid.
numeric_cols = df.select_dtypes(include=[np.number]).columns
n_cols = 3
n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows))
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    sns.histplot(data=df, x=col, ax=ax, kde=True)
    ax.set_title(f'Distribution of {col}')
    ax.tick_params(axis='x', rotation=45)

# Grid cells beyond the last column stay empty, so hide them
for ax in axes[len(numeric_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 2. Boxplots for all variables

In [None]:
def box_plots(data_df, n_cols=3):
    """Draw one boxplot per numeric column of ``data_df`` on a grid of subplots.

    The column list and the grid size are derived from ``data_df`` itself, so
    the function does not depend on notebook-level variables.

    Args:
        data_df: DataFrame whose numeric columns are plotted.
        n_cols: Number of subplot columns in the grid (default 3).

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    numeric_cols = data_df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        # A grid with zero rows cannot be created, so stop early.
        print("No numeric columns to plot.")
        return

    n_rows = (len(numeric_cols) + n_cols - 1) // n_cols
    # squeeze=False always returns a 2-D array of axes, so flatten() is safe
    # for any grid size, including 1x1.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, numeric_cols):
        sns.boxplot(data=data_df, y=col, ax=ax)
        ax.set_title(f'Boxplot of {col}')
        ax.tick_params(axis='x', rotation=45)

    # Hide empty subplots if any
    for ax in axes[len(numeric_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
    
# First pass: boxplots for the unfiltered copy of the data.
box_plots(df)

## 3. Correlation Matrix

In [None]:
# Pairwise Pearson correlations between the numeric columns
correlation_matrix = df.loc[:, numeric_cols].corr(method='pearson')

# Boolean mask over the upper triangle (diagonal included): the matrix is
# symmetric, so only the lower triangle is drawn.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .8},
)

plt.figure(figsize=(20, 16))
sns.heatmap(correlation_matrix, mask=mask, **heatmap_options)
plt.title('Pearson Correlation Matrix')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## 4. Scatterplots between variables

In [None]:
def scatter_plots(df, threshold=0.5):
    """Scatter-plot every pair of numeric columns of ``df`` that is strongly correlated.

    Correlations are computed from ``df`` itself, so the pairs selected and the
    values shown in the titles always describe the frame that was passed in.

    Args:
        df: DataFrame to analyse.
        threshold: A pair is plotted when its absolute Pearson correlation is
            strictly greater than this value (default 0.5).

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    corr_matrix = df.select_dtypes(include=[np.number]).corr(method='pearson')
    names = corr_matrix.columns

    # Walk the upper triangle only, so each unordered pair is visited once.
    high_corr_pairs = []
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            corr = corr_matrix.iloc[i, j]
            # A NaN correlation (e.g. a constant column) compares False and is skipped.
            if abs(corr) > threshold:
                high_corr_pairs.append((names[i], names[j], corr))

    if not high_corr_pairs:
        # A grid with zero rows cannot be created, so stop early.
        print(f"No column pairs with |correlation| > {threshold}.")
        return

    # Sort pairs by absolute correlation value
    high_corr_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)

    n_cols = 2
    n_rows = (len(high_corr_pairs) + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D array of axes, so flatten() is safe.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, (col1, col2, corr) in zip(axes, high_corr_pairs):
        sns.scatterplot(data=df, x=col1, y=col2, ax=ax)
        ax.set_title(f'{col1} vs {col2}\nCorrelation: {corr:.2f}')
        ax.tick_params(axis='x', rotation=45)

    # Hide empty subplots if any
    for ax in axes[len(high_corr_pairs):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
    
# First pass: scatterplots for the unfiltered copy of the data.
scatter_plots(df)

## 5. Summary Statistics for Outliers

In [None]:
def iqr_outlier_summary(frame, columns, whisker=1.5):
    """Summarise outliers per column using the IQR rule.

    A value is an outlier when it lies below ``Q1 - whisker * IQR`` or above
    ``Q3 + whisker * IQR``.

    Args:
        frame: DataFrame holding the data.
        columns: Iterable of column names to analyse.
        whisker: Multiplier applied to the IQR (default 1.5).

    Returns:
        DataFrame indexed by column name with the columns 'Outliers Count',
        'Outliers Percentage', 'Lower Bound' and 'Upper Bound'.
    """
    total_rows = len(frame)
    labels = []
    rows = []
    for col in columns:
        series = frame[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - whisker * iqr
        upper_bound = q3 + whisker * iqr
        outlier_count = int(((series < lower_bound) | (series > upper_bound)).sum())
        labels.append(col)
        rows.append({
            'Outliers Count': outlier_count,
            # Guard against an empty frame instead of dividing by zero.
            'Outliers Percentage': (outlier_count / total_rows) * 100 if total_rows else 0.0,
            'Lower Bound': lower_bound,
            'Upper Bound': upper_bound,
        })
    # Build the frame once from the collected records; counts stay integers.
    return pd.DataFrame(
        rows,
        index=labels,
        columns=['Outliers Count', 'Outliers Percentage', 'Lower Bound', 'Upper Bound'],
    )


# Calculate and display outlier statistics
outlier_stats = iqr_outlier_summary(df, numeric_cols)

outlier_stats.sort_values('Outliers Count', ascending=False)

## 6. Variables vs. Price

In [None]:
# Scatter every numeric variable against the price.
target_col = 'Price'
# Plotting the price against itself only yields a diagonal line, so leave it out.
price_predictors = [col for col in numeric_cols if col != target_col]

grid_cols = 3
# At least one row, so the grid can still be created without predictors.
grid_rows = max(1, (len(price_predictors) + grid_cols - 1) // grid_cols)
# squeeze=False always returns a 2-D array of axes, so flatten() is safe.
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(15, 5*grid_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, price_predictors):
    sns.scatterplot(data=df, x=col, y=target_col, ax=ax)
    ax.set_title(f'{col} vs Price')

# Hide empty subplots if any
for ax in axes[len(price_predictors):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 7. Asimetrías y curtosis

Primero eliminamos los outliers.

In [None]:
toyota_filtered = toyota_cut.copy()

# Clean based on SCATTER PLOTS
# Keep: rows where m_mpv_verso is 0 and Price, Weight and KM lie strictly
# inside the chosen bounds.
keep_mask = (
    (toyota_filtered['m_mpv_verso'] == 0)
    & (toyota_filtered['Price'] > 0) & (toyota_filtered['Price'] < 30000)
    & (toyota_filtered['Weight'] > 0) & (toyota_filtered['Weight'] < 1400)
    & (toyota_filtered['KM'] > 0) & (toyota_filtered['KM'] < 225000)
)

# Drop: points that fall in the regions below.
drop_mask = (
    ((toyota_filtered['Weight'] > 1200) & (toyota_filtered['Price'] < 10000))
    | (
        (toyota_filtered['Age_08_04'] > 40)
        & ((toyota_filtered['Price'] < 5500) | (toyota_filtered['Price'] > 17000))
    )
)
# Two earlier rules ("m_life_months < 260 and Price > 30000",
# "Weight > 1400 and Price < 15000") are omitted: the keep filter already
# limits Price to < 30000 and Weight to < 1400, so they could never match.

# Boolean masks select rows by position, so duplicated index labels cannot
# cause extra rows to be removed.
toyota_filtered = toyota_filtered[keep_mask & ~drop_mask]

# No box-plot based rules are applied; re-plot the filtered data to inspect it.
print(toyota_filtered.shape)
scatter_plots(toyota_filtered)
box_plots(toyota_filtered)

In [None]:
# Normalisation: min-max scale the numeric (and boolean) columns.
scaler = MinMaxScaler()
toyota_normalized = toyota_filtered.copy()
# Restrict the scaler to columns it can handle; any text column is left
# untouched instead of making fit_transform fail.
scalable_cols = toyota_normalized.select_dtypes(include=[np.number, 'bool']).columns
toyota_normalized[scalable_cols] = scaler.fit_transform(toyota_normalized[scalable_cols])

A partir de los resultados anteriores observamos asimetría (skew) en las distribuciones, por lo que aplicamos los siguientes tratamientos:

- *Outliers*: quitamos los outliers del dataset.
- *Normalizado*: escalamos las variables con la función Min-Max (rango de 0 a 1).
- *Transformación*: aplicamos logaritmo, log(x + 1), seguido de raíz cuadrada a columnas no binarias (`KM` y `Weight`).


In [None]:
# Columns carried into the transformation step and the before/after histogram comparison.
skewed_cols = ["Price", "KM", "HP", "Weight", "Mfg_Year"]

In [None]:
# Transformado
def show_skew_info(df, col):
    """Print the skewness and kurtosis of column ``col`` of ``df``.

    Args:
        df: DataFrame containing the column.
        col: Name of the column to describe.

    Returns:
        None. Three lines are written to stdout: the column name, then its
        skewness and kurtosis, each indented with a tab.
    """
    series = df[col]
    report = (
        f"Col: {col}",
        f"\tSkewness: {series.skew()}",
        f"\tKurtosis: {series.kurtosis()}",
    )
    for line in report:
        print(line)
    
toyota_transformed = toyota_normalized[skewed_cols].copy()

# Apply log(1 + x) followed by a square root to each listed column, printing
# skewness/kurtosis before and after so the effect is visible.
# np.log1p is the numerically accurate form of np.log(x + 1) for small x.
for col in ("KM", "Weight"):
    show_skew_info(toyota_transformed, col)
    toyota_transformed[col] = np.sqrt(np.log1p(toyota_transformed[col]))
    show_skew_info(toyota_transformed, col)

toyota_transformed.describe().T

In [None]:
def print_histograms_comparison_v2(original_df, toyota_normalized, transformed_df):
    """Plot, for each numeric column, its histogram at three stages side by side.

    Each row of the figure is one column; the three panels show the original,
    the scaled and the transformed frame, in that order.

    Args:
        original_df: Frame with the original values.
        toyota_normalized: Frame with the scaled values; its numeric columns
            determine which columns are plotted.
        transformed_df: Frame with the transformed values.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    columns_to_plot = toyota_normalized.select_dtypes(include=[np.number]).columns
    # (frame, title prefix, colour) for the three panels of every row
    panels = (
        (original_df, 'Original', 'skyblue'),
        (toyota_normalized, 'Escalado', 'salmon'),
        (transformed_df, 'Transformado', 'seagreen'),
    )
    row_count = len(columns_to_plot)

    # squeeze=False keeps the axes grid two-dimensional even with a single row
    fig, axes = plt.subplots(row_count, len(panels), figsize=(18, 4*row_count), squeeze=False)

    for row_axes, col in zip(axes, columns_to_plot):
        for ax, (frame, label, color) in zip(row_axes, panels):
            sns.histplot(data=frame, x=col, ax=ax, kde=True, color=color)
            ax.set_title(f'{label}: {col}')
            ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

print_histograms_comparison_v2(toyota_cut[skewed_cols], toyota_normalized[skewed_cols], toyota_transformed[skewed_cols])

Validamos el modelo con las transformaciones realizadas.

In [None]:
from sklearn.model_selection import KFold
import statsmodels.api as sm
from toyota.utils import load_dataset, get_metrics, LinearRegDiagnostic

toyota_final = toyota_transformed.copy()
# Drop identifier/text columns if they are present.
cols_to_drop = ["Id", "Model", "Fuel_Type"]
toyota_final = toyota_final.drop(columns=[col for col in cols_to_drop if col in toyota_final.columns])

split_params = {
    "n_splits": 5,
    "random_state": 42,
    "shuffle": True,
}
kf = KFold(**split_params)

# Fit an OLS model on each training fold and score it on the matching test fold.
models = []
metrics_all = []
for train_index, test_index in kf.split(toyota_final):
    train_fold = toyota_final.iloc[train_index]
    test_fold = toyota_final.iloc[test_index]

    # has_constant="add" forces the intercept column in both designs; the
    # default ("skip") omits it when a fold happens to contain a constant
    # column, which would leave train and test with different columns.
    X_train = sm.add_constant(train_fold.drop(columns=["Price"]), has_constant="add")
    y_train = train_fold["Price"]
    model = sm.OLS(y_train, X_train).fit()
    models.append({"model": model})

    X_test = sm.add_constant(test_fold.drop(columns=["Price"]), has_constant="add")
    y_test = test_fold["Price"]
    y_pred = model.predict(X_test)
    metrics = get_metrics(y_test, y_pred)

    diagnostic_plotter = LinearRegDiagnostic(model)
    diagnostic_plotter()
    metrics_all.append(metrics)

# Average every metric across the folds.
metrics_means = {key: np.mean([fold_metrics[key] for fold_metrics in metrics_all]) for key in metrics_all[0]}
print(metrics_means)