In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Notebook-wide plotting defaults: seaborn's 'husl' colour palette and a
# (12, 8) default figure size for every matplotlib figure created afterwards.
sns.set_palette(palette='husl')
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Take a copy of `toyota_clean` to use as the working frame for the cells below.
# NOTE(review): `toyota_clean` is not defined in the visible cells - presumably it
# comes from an earlier cell; confirm it exists after Restart Kernel -> Run All.
df = toyota_clean.copy()

# Display basic information about the dataset: its shape, the df.info() report,
# and the describe() summary.
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
df.info()
print("\nBasic Statistics:")
# Bare last expression: the notebook renders the returned summary table.
df.describe()

## 1. Histograms for all variables

In [None]:
# Histogram with a KDE overlay for every numeric column, arranged on a grid
# that is three panels wide.
numeric_cols = df.select_dtypes(include=[np.number]).columns
n_cols = 3
n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division

fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 4 * n_rows))
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    sns.histplot(data=df, x=col, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.tick_params(axis='x', rotation=45)

# Panels left over after the last column stay invisible.
for unused_ax in axes[len(numeric_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 2. Boxplots for all variables

In [None]:
# Create boxplots for all numeric variables.
#
# The grid size is computed in this cell instead of being read from the shared
# `n_rows` / `n_cols` names: other cells in this notebook reassign those names,
# so re-running this cell after them would build a grid with the wrong number
# of axes (and can fail with an IndexError on axes[idx]).
box_n_cols = 3
# Ceiling division; at least one row so plt.subplots() always gets a valid grid.
box_n_rows = max(1, (len(numeric_cols) + box_n_cols - 1) // box_n_cols)

# squeeze=False always returns a 2-D array of axes, so flatten() is safe.
fig, axes = plt.subplots(box_n_rows, box_n_cols,
                         figsize=(15, 4 * box_n_rows), squeeze=False)
axes = axes.flatten()

for idx, col in enumerate(numeric_cols):
    sns.boxplot(data=df, y=col, ax=axes[idx])
    axes[idx].set_title(f'Boxplot of {col}')
    axes[idx].tick_params(axis='x', rotation=45)

# Hide empty subplots if any
for idx in range(len(numeric_cols), len(axes)):
    axes[idx].set_visible(False)

plt.tight_layout()
plt.show()

## 3. Correlation Matrix

In [None]:
# Pairwise Pearson correlations between the numeric columns.
correlation_matrix = df[numeric_cols].corr(method='pearson')

# Boolean mask covering the upper triangle (diagonal included), so the heatmap
# only draws the lower triangle and each pair appears once.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

# Draw the annotated heatmap on a large figure so the cell labels stay readable.
plt.figure(figsize=(20, 16))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Pearson Correlation Matrix')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

- `dsl`  es diesel, ya está en una columna
- `m_sport` está poco relacionada con el precio: su t-value es muy bajo y su p-value es prácticamente igual a 1. Además, hay pocos registros de tipo sport.
- `m_16v` tiene un t-value bajo (0.6) y un p-value de 50%.
- `Mfg_month` y `Mfg_year` se unifican en una sola columna que muestra la edad en meses (esta nueva columna es exactamente igual que `Age_08_04`, por lo que no nos sirve).
- `m_hatch_b` y `m_liftb` están muy relacionadas con `Doors`, por lo que las sacamos.
- Ninguna de las variables de fuel type aporta a Precio, y todas presentan colinealidad con la mayoría de las variables.

## 4. Scatterplots between variables

In [None]:
# Scatterplots for every pair of numeric variables whose absolute Pearson
# correlation exceeds CORR_THRESHOLD, strongest pairs first.

# Minimum |correlation| for a pair to get its own scatterplot.
CORR_THRESHOLD = 0.5

# Collect each distinct pair once (i < j) from the correlation matrix.
high_corr_pairs = []
for i in range(len(numeric_cols)):
    for j in range(i + 1, len(numeric_cols)):
        corr = correlation_matrix.iloc[i, j]
        # A NaN correlation compares False here and is therefore skipped.
        if abs(corr) > CORR_THRESHOLD:
            high_corr_pairs.append((numeric_cols[i], numeric_cols[j], corr))

# Sort pairs by absolute correlation value
high_corr_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)

# Grid layout: two panels per row.
n_pairs = len(high_corr_pairs)
n_cols = 2
n_rows = (n_pairs + n_cols - 1) // n_cols

if n_pairs == 0:
    # plt.subplots() rejects a grid with zero rows, so report instead of plotting.
    print(f'No variable pairs with |correlation| > {CORR_THRESHOLD}; nothing to plot.')
else:
    # squeeze=False always returns a 2-D array of axes, so flatten() is safe.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for idx, (col1, col2, corr) in enumerate(high_corr_pairs):
        sns.scatterplot(data=df, x=col1, y=col2, ax=axes[idx])
        axes[idx].set_title(f'{col1} vs {col2}\nCorrelation: {corr:.2f}')
        axes[idx].tick_params(axis='x', rotation=45)

    # Hide empty subplots if any
    for idx in range(len(high_corr_pairs), len(axes)):
        axes[idx].set_visible(False)

    plt.tight_layout()
    plt.show()

## 5. Summary Statistics for Outliers

In [None]:
# Per-column outlier summary using the 1.5 * IQR rule: a value is an outlier
# when it lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
#
# One record is collected per column and the DataFrame is built once at the
# end, instead of growing it cell by cell with .loc (slow, and it turns the
# counts into floats).
outlier_columns = ['Outliers Count', 'Outliers Percentage', 'Lower Bound', 'Upper Bound']
outlier_records = []
total_rows = len(df)

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # NaN values compare False on both sides, so they are never counted.
    outliers = df.loc[(df[col] < lower_bound) | (df[col] > upper_bound), col]

    outlier_records.append({
        'Outliers Count': len(outliers),
        # The percentage is relative to all rows of df; undefined for an empty frame.
        'Outliers Percentage': (len(outliers) / total_rows) * 100 if total_rows else np.nan,
        'Lower Bound': lower_bound,
        'Upper Bound': upper_bound,
    })

# Passing explicit columns keeps the frame well-formed even with no numeric columns.
outlier_stats = pd.DataFrame(outlier_records, index=list(numeric_cols), columns=outlier_columns)

# Bare last expression: display the summary with the most outlier-heavy columns first.
outlier_stats.sort_values('Outliers Count', ascending=False)

## 6. Variables vs. Price

In [None]:
# Create scatterplots for each numeric variable vs. price.
# The target column itself is skipped: plotting Price against Price only
# produces a diagonal line and wastes a panel.
target_col = 'Price'
price_predictors = [col for col in numeric_cols if col != target_col]

n_cols = 3
# Ceiling division; at least one row so plt.subplots() always gets a valid grid.
n_rows = max(1, (len(price_predictors) + n_cols - 1) // n_cols)
# squeeze=False always returns a 2-D array of axes, so flatten() is safe.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, col in enumerate(price_predictors):
    sns.scatterplot(data=df, x=col, y=target_col, ax=axes[idx])
    axes[idx].set_title(f'{col} vs Price')

# Hide empty subplots if any
for idx in range(len(price_predictors), len(axes)):
    axes[idx].set_visible(False)

plt.tight_layout()