In [None]:
!pip install shap

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import shap
from matplotlib.ticker import FuncFormatter
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.inspection import PartialDependenceDisplay
from PIL import Image
import io
from sklearn.inspection import permutation_importance
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Global plot styling.
# The figure size is passed through `rc`: a dict given as the first positional
# argument is taken as the plotting *context*, in which case seaborn does not
# apply the named context or its font scaling, so `font_scale` had no effect.
sns.set(
    context='notebook',
    style='ticks',
    color_codes=True,
    font_scale=0.8,
    rc={"figure.figsize": (8, 6)}
)
%config InlineBackend.figure_formats = set(('retina', 'svg'))

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the adverts CSV from the mounted Drive into the working DataFrame `cars`.
cars = pd.read_csv('/content/drive/MyDrive/adverts.csv')

# Data/Domain Understanding and Exploration

In [None]:
# Dimensions of the table as (rows, columns).
cars.shape

In [None]:
# Per-column dtype and non-null count summary.
cars.info()

In [None]:
# Column labels.
cars.columns

In [None]:
# Data type of each column.
cars.dtypes

In [None]:
# First five rows.
cars.head()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
cars.duplicated().sum()

In [None]:
# Summary statistics of the numeric columns.
cars.describe()

In [None]:
# Missing entries per column: first as counts, then as a percentage of all rows.
missing_values = cars.isnull().sum()
for report in (missing_values, (missing_values / len(cars)) * 100):
    print(report)

In [None]:
# Number of distinct (non-null) values in every column.
for col, n_unique in cars.nunique().items():
    print(f"{col}: {n_unique} unique values")

In [None]:
# Mileage bands. With right=False each band is half-open [lower, upper),
# so 0-99 is 'New', 100-9,999 is 'Very Low', and so on.
bins = [0, 100, 10000, 50000, 100000, 150000, 200000, 300000, np.inf]
mileage_labels = [
    'New', 'Very Low', 'Low', 'Medium',
    'High', 'Very High', 'Extremely High', 'Ultra High',
]

cars['mileage_bins'] = pd.cut(
    cars['mileage'],
    bins=bins,
    labels=mileage_labels,
    right=False,
)

print(cars.loc[:, ['mileage', 'mileage_bins']].head())

In [None]:
# Price bands, half-open [lower, upper) because right=False.
price_bins = [0, 5000, 20000, 60000, 100000, 200000, np.inf]
price_labels = [
    'Very Low', 'Low', 'Medium',
    'High', 'Luxury', 'Ultra Luxury',
]

cars['price_bins'] = pd.cut(
    cars['price'],
    bins=price_bins,
    labels=price_labels,
    right=False,
)

print(cars.loc[:, ['price', 'price_bins']].head())

In [None]:
# Side-by-side bar charts of advert counts per mileage band and per price band.
fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

# (column, bar colour, panel title, x-axis label) for each panel, left to right.
panel_specs = [
    ('mileage_bins', 'green', 'Distribution of Vehicles by Mileage Bins', 'Mileage Bins'),
    ('price_bins', 'orange', 'Distribution of Vehicles by Price Bins', 'Price Bins'),
]

for panel, (column_name, bar_colour, panel_title, x_label) in zip(axes, panel_specs):
    cars[column_name].value_counts().sort_index().plot(
        kind='bar',
        color=bar_colour,
        edgecolor='black',
        ax=panel
    )
    panel.set_title(panel_title, fontsize=16, fontweight='bold')
    panel.set_xlabel(x_label, fontsize=12, fontweight='bold')
    panel.tick_params(axis='x', rotation=45)
    panel.grid(axis='y', linestyle='--', alpha=0.7)

# The y-axis is shared, so only the left panel is given a label.
axes[0].set_ylabel('Count', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Heatmap of advert counts for every (mileage band, price band) combination.
plt.figure(figsize=(10, 8))
sns.heatmap(
    pd.crosstab(cars['mileage_bins'], cars['price_bins']),
    annot=True, fmt='d', cmap='plasma'  # colour palette set to 'plasma'
)
plt.title('Relationship Between Mileage Bins and Price Bins', fontsize=16, fontweight='bold')
plt.xlabel('Price Bins', fontsize=12, fontweight='bold')
plt.ylabel('Mileage Bins', fontsize=12, fontweight='bold')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Analysis for 'year_of_registration'
year_column = cars['year_of_registration']

year_of_registration_stats = year_column.describe()
unique_years = year_column.nunique()
year_of_registration_freq = year_column.value_counts().head(10)
missing_year_of_registration = year_column.isnull().sum()

# Each report is a heading line followed by the value on the next line.
print("Basic Statistics:", year_of_registration_stats, sep="\n")
print("\nUnique Values Count:", unique_years, sep="\n")
print("\nTop 10 Frequent Years:", year_of_registration_freq, sep="\n")
print("\nMissing Values:", missing_year_of_registration, sep="\n")

In [None]:
# Histogram of registration years from 2000 onwards, with mean and median marked.
year_data = cars.loc[cars['year_of_registration'] >= 2000, 'year_of_registration']

year_mean, year_median = year_data.mean(), year_data.median()

fig, ax = plt.subplots(figsize=(14, 8))
ax.hist(year_data, bins=range(2000, 2021, 2), edgecolor='black', alpha=0.7, color='skyblue', rwidth=0.85)

# Vertical reference lines; the same values are also written next to the lines.
ax.axvline(year_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {year_mean:.1f}')
ax.axvline(year_median, color='green', linestyle='--', linewidth=2, label=f'Median: {year_median:.1f}')

ax.text(year_mean, ax.get_ylim()[1] * 0.9, f'Mean: {int(year_mean)}', color='red', fontsize=12, weight='bold')
ax.text(year_median, ax.get_ylim()[1] * 0.8, f'Median: {int(year_median)}', color='green', fontsize=12, weight='bold')

plt.xticks(range(2000, 2021, 5), fontsize=12, rotation=45)
ax.set_xlabel('Year of Registration', fontsize=15, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=15, fontweight='bold')

ax.set_title('Distribution of Year of Registration (2000 and onwards)', fontsize=18, fontweight='bold', color='darkblue')
ax.grid(axis='y', alpha=0.5, linestyle='--')

ax.legend(fontsize=12, loc='upper left')

plt.show()

In [None]:
# Analysis for 'mileage'
mileage_column = cars['mileage']

mileage_stats = mileage_column.describe()
unique_mileages = mileage_column.nunique()
mileage_freq = mileage_column.value_counts().head(10)
missing_mileage = mileage_column.isnull().sum()

total_cars = len(cars)

# Shares are taken over all rows, so rows with missing mileage count in the
# denominator but fall into none of the three bands.
below_10k = int((mileage_column < 10000).sum())
percentage_below_10k = (below_10k / total_cars) * 100

between_10k_100k = int(mileage_column.between(10000, 100000).sum())
percentage_between_10k_100k = (between_10k_100k / total_cars) * 100

above_100k = int((mileage_column > 100000).sum())
percentage_above_100k = (above_100k / total_cars) * 100

print("Basic Statistics:", mileage_stats, sep="\n")
print("\nMissing Values:", missing_mileage, sep="\n")
print("\nCars with mileage < 10,000:", f"{percentage_below_10k:.2f}%", sep="\n")
print("\nCars with mileage between 10,000 and 100,000:", f"{percentage_between_10k_100k:.2f}%", sep="\n")
print("\nCars with mileage > 100,000:", f"{percentage_above_100k:.2f}%", sep="\n")

In [None]:
# Histogram of the natural log of mileage (strictly positive values only),
# with the x-axis relabelled in miles.
mileage_data = pd.to_numeric(cars['mileage'], errors='coerce')
mileage_data = mileage_data.loc[mileage_data > 0]

log_mileage_data = np.log(mileage_data)

fig, ax = plt.subplots(figsize=(14, 8))
ax.hist(log_mileage_data.dropna(), bins=30, edgecolor='black', alpha=0.7, color='skyblue')

mean_log = log_mileage_data.mean()
median_log = log_mileage_data.median()
ax.axvline(mean_log, color='red', linestyle='--', linewidth=2, label=f'Mean (Log): {mean_log:.2f}')
ax.axvline(median_log, color='green', linestyle='--', linewidth=2, label=f'Median (Log): {median_log:.2f}')

# Ticks are placed at log(miles) but labelled with the mileage itself.
tick_miles = [1, 100, 1000, 10000, 30000, 50000, 100000, 200000, 300000, 500000, 1000000]
log_ticks = [np.log(x) for x in tick_miles]
tick_labels = [str(x) for x in tick_miles]
plt.xticks(log_ticks, tick_labels, fontsize=12, rotation=90)
ax.set_xlabel('Mileage (Miles)', fontsize=15, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=15, fontweight='bold')

ax.set_title('Histogram of Mileage (Log Scale)', fontsize=18, fontweight='bold', color='darkblue')
ax.grid(axis='y', alpha=0.5, linestyle='--')

# exp() of the log-mean / log-median converts the annotations back to miles.
ax.text(mean_log, ax.get_ylim()[1] * 0.9, f'Mean: ~{int(np.exp(mean_log))} miles', color='red', fontsize=12, weight='bold')
ax.text(median_log, ax.get_ylim()[1] * 0.8, f'Median: ~{int(np.exp(median_log))} miles', color='green', fontsize=12, weight='bold')

ax.legend(fontsize=12, loc='upper left')

plt.show()

In [None]:
# Histogram of sqrt(mileage); ticks and reference lines are positioned on the
# square-root axis but labelled with the untransformed mileage.
sqrt_mileage = np.sqrt(cars['mileage'].dropna())

mean_mileage, median_mileage = cars['mileage'].mean(), cars['mileage'].median()
mean_sqrt, median_sqrt = np.sqrt(mean_mileage), np.sqrt(median_mileage)

adjusted_rounded_ticks = [0, 1000, 10000, 30000, 50000, 100000, 200000, 300000, 500000, 1000000]
sqrt_adjusted_ticks = np.sqrt(adjusted_rounded_ticks)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(sqrt_mileage, bins=30, edgecolor='black', alpha=0.7, color='orange', label='Mileage Distribution')

ax.axvline(mean_sqrt, color='red', linestyle='--', linewidth=1.5, label=f'Mean ({int(mean_mileage):,})')
ax.axvline(median_sqrt, color='blue', linestyle='--', linewidth=1.5, label=f'Median ({int(median_mileage):,})')

plt.xticks(sqrt_adjusted_ticks, labels=[format(int(val), ',') for val in adjusted_rounded_ticks], rotation=90)

ax.set_title('Distribution of Mileage with Mean and Median', fontsize=14, fontweight='bold')
ax.set_xlabel('Mileage (Rounded Values)', fontsize=12, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Horizontal box plot of `mileage_data` (the positive-mileage series built in
# the log-histogram cell), with the x-axis ticks relabelled as mileage ranges.
box_style = dict(
    boxprops=dict(facecolor="lightblue", edgecolor="black", linewidth=1.5),
    medianprops=dict(color="red", linewidth=2),
    whiskerprops=dict(color="black", linewidth=1.5),
    capprops=dict(color="black", linewidth=1.5),
    flierprops=dict(marker="o", color="orange", markersize=5),
)

fig, ax = plt.subplots(figsize=(16, 8))
ax.boxplot(mileage_data.dropna(), vert=False, patch_artist=True, **box_style)

# Each tick sits at the midpoint of the range named by its label.
correct_labels = ['0-50k', '50k-100k', '100k-150k', '150k-200k', '200k+']
correct_ticks = [25000, 75000, 125000, 175000, 225000]

ax.set_title('Box Plot of Mileage Categorized by Ranges', fontsize=18, fontweight='bold', color='darkblue')
ax.set_xlabel('Mileage Ranges (Miles)', fontsize=15, fontweight='bold')

plt.xticks(correct_ticks, correct_labels, fontsize=12, rotation=45)
ax.grid(axis='x', linestyle='--', alpha=0.5)

plt.show()

In [None]:
# Analysis for 'price'
price_column = cars['price']

price_stats = price_column.describe()
unique_prices = price_column.nunique()
price_freq = price_column.value_counts().head(10)
missing_price = price_column.isnull().sum()

# Each report is a heading line followed by the value on the next line.
print("Basic Statistics:", price_stats, sep="\n")
print("\nUnique Values Count:", unique_prices, sep="\n")
print("\nTop 10 Frequent Prices:", price_freq, sep="\n")
print("\nMissing Values:", missing_price, sep="\n")

In [None]:
# Mean and median advert price; drawn as reference lines on the price histogram below.
mean_price = cars['price'].mean()
median_price = cars['price'].median()

def format_func(value, tick_number):
    """Format an axis tick value as an integer with thousands separators.

    `value` is truncated towards zero by int(); `tick_number` is accepted to
    match the two-argument callback signature of FuncFormatter and is unused.
    """
    whole = int(value)
    return format(whole, ',')

# Price histogram with a logarithmic count axis and mean/median reference lines.
fig, ax = plt.subplots(figsize=(10, 6))

ax.hist(cars['price'].dropna(), bins=100, edgecolor='black', alpha=0.7, label='Histogram')

ax.axvline(mean_price, color='red', linestyle='--', label=f'Mean: {int(mean_price):,}')
ax.axvline(median_price, color='green', linestyle='--', label=f'Median: {int(median_price):,}')

ax.set_title('Histogram of Price (Log Scale, 0 to 10,000,000)', fontsize=14, color='blue')
ax.set_xlabel('Price', fontsize=12)
ax.set_ylabel('Frequency (Log Scale)', fontsize=12)

ax.xaxis.set_major_formatter(FuncFormatter(format_func))

# The y formatter is installed after switching to the log scale, as in the
# original statement order, so that it is the one left in effect.
ax.set_yscale('log')
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f'{int(x):,}'))

ax.legend()

ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Analysis for 'vehicle_condition'
condition_column = cars['vehicle_condition']

unique_conditions = condition_column.nunique()
condition_freq = condition_column.value_counts()
missing_vehicle_condition = condition_column.isnull().sum()

print("Unique Values Count:", unique_conditions, sep="\n")
print("\nFrequency Distribution:", condition_freq, sep="\n")
print("\nMissing Values:", missing_vehicle_condition, sep="\n")

In [None]:
# Pie chart of the percentage share of each vehicle condition.
vehicle_condition_counts = cars['vehicle_condition'].value_counts(normalize=True) * 100

fig, ax = plt.subplots(figsize=(8, 6))
vehicle_condition_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    colors=['skyblue', 'orange'],
    ax=ax,
)
ax.set_title("Vehicle Condition Distribution")
ax.set_ylabel("")  # hide the series name pandas would otherwise print beside the pie
plt.show()

In [None]:
# Bar chart of advert counts per vehicle condition, annotated with the counts.
vehicle_condition_counts = cars['vehicle_condition'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(vehicle_condition_counts.index, vehicle_condition_counts.values, color=['skyblue', 'orange'])
ax.set_title("Vehicle Condition Distribution", fontsize=14)
ax.set_xlabel("Vehicle Condition", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
plt.xticks(rotation=0, fontsize=10)
plt.yticks(fontsize=10)

# Each label is anchored 5000 counts below the top of its bar.
for bar in bars:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, bar_height - 5000,
            str(int(bar_height)), ha='center', va='bottom', fontsize=10, color='black')

plt.show()

In [None]:
# Analysis for 'body_type'
body_type_column = cars['body_type']

unique_body_types = body_type_column.nunique()
body_type_freq = body_type_column.value_counts()
missing_body_type = body_type_column.isnull().sum()

print("Unique Values Count:", unique_body_types, sep="\n")
print("\nFrequency Distribution:", body_type_freq, sep="\n")
print("\nMissing Values:", missing_body_type, sep="\n")

In [None]:
# Bar chart of body types; dropna=False keeps missing values as their own bar.
body_type_data = cars['body_type']
body_type_counts = body_type_data.value_counts(dropna=False)

fig, ax = plt.subplots(figsize=(12, 6))
body_type_counts.plot(kind='bar', color='skyblue', edgecolor='black', alpha=0.8, ax=ax)
ax.set_title('Distribution of Body Types (Including Missing Values)', fontsize=18, fontweight='bold', color='darkblue')
ax.set_xlabel('Body Type', fontsize=15, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=15, fontweight='bold')
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.5)

# Write each count 1000 units above its bar.
for index, value in enumerate(body_type_counts):
    ax.text(index, value + 1000, str(value), ha='center', fontsize=10)
plt.show()

In [None]:
# Analysis for 'fuel_type'
fuel_column = cars['fuel_type']

unique_fuel_types = fuel_column.nunique()
fuel_type_freq = fuel_column.value_counts()
fuel_type_percentage = (fuel_column.value_counts(normalize=True) * 100).round(2)
missing_fuel_type = fuel_column.isnull().sum()

print("Unique Values Count:", unique_fuel_types, sep="\n")

print("\nFrequency Distribution (Count and Percentage):")
for fuel, count in fuel_type_freq.items():
    share = fuel_type_percentage[fuel]
    print(f"{fuel}: {count} ({share}%)")

print("\nMissing Values:", missing_fuel_type, sep="\n")

In [None]:
# Bar chart of fuel types; dropna=False keeps missing values as their own bar.
fuel_type_data = cars['fuel_type']
fuel_type_counts = fuel_type_data.value_counts(dropna=False)

fig, ax = plt.subplots(figsize=(12, 6))
fuel_type_counts.plot(kind='bar', color='green', edgecolor='black', alpha=0.8, ax=ax)
ax.set_title('Distribution of Fuel Types (Including Missing Values)', fontsize=18, fontweight='bold', color='darkblue')
ax.set_xlabel('Fuel Type', fontsize=15, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=15, fontweight='bold')
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.5)

# Write each count 1000 units above its bar.
for index, value in enumerate(fuel_type_counts):
    ax.text(index, value + 1000, str(value), ha='center', fontsize=10)

plt.show()

In [None]:
# Analysis for 'crossover_car_and_van'
crossover_column = cars['crossover_car_and_van']

unique_crossover_values = crossover_column.nunique()
crossover_freq = crossover_column.value_counts()
missing_crossover = crossover_column.isnull().sum()

print("Unique Values Count:", unique_crossover_values, sep="\n")
print("\nFrequency Distribution:", crossover_freq, sep="\n")
print("\nMissing Values:", missing_crossover, sep="\n")

In [None]:
# Pie chart of the crossover flag.
# Slice labels and the explode offsets are derived from the counts themselves:
# value_counts() orders by frequency, so hard-coding ['False', 'True'] would
# mislabel the slices whenever True is the more frequent value.
crossover_counts = cars['crossover_car_and_van'].value_counts()

plt.figure(figsize=(8, 8))

crossover_counts.plot(
    kind='pie', autopct='%1.1f%%', colors=['skyblue', 'orange'],
    labels=[str(flag) for flag in crossover_counts.index],
    startangle=90,
    # Pull every slice except the largest slightly out of the pie.
    explode=[0 if position == 0 else 0.1 for position in range(len(crossover_counts))]
)

plt.title('Distribution of Crossover Car and Van', fontsize=14, color='blue')
plt.axis('equal')
plt.show()

In [None]:
# Bar chart of the crossover flag, annotated with the counts.
# Tick labels come from the value_counts() index so they always match the bar
# order (most frequent first) instead of assuming False precedes True.
crossover_counts = cars['crossover_car_and_van'].value_counts()

plt.figure(figsize=(8, 5))

crossover_counts.plot(
    kind='bar', color=['skyblue', 'orange'], edgecolor='black'
)

plt.title('Distribution of Crossover Car and Van', fontsize=14, color='blue')
plt.xlabel('Crossover Car and Van', fontsize=12)
plt.ylabel('Frequency', fontsize=12)

# Write each count 5000 units above its bar.
for index, value in enumerate(crossover_counts):
    plt.text(index, value + 5000, f'{value:,}', ha='center', fontsize=10)

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(
    list(range(len(crossover_counts))),
    [str(flag) for flag in crossover_counts.index],
    rotation=0
)
plt.show()

In [None]:
# Analysis for 'standard_make'
make_column = cars['standard_make']

unique_makes = make_column.nunique()
make_freq = make_column.value_counts().head(10)
missing_standard_make = make_column.isnull().sum()

print("Unique Values Count:", unique_makes, sep="\n")
print("\nTop 10 Frequent Makes:", make_freq, sep="\n")
print("\nMissing Values:", missing_standard_make, sep="\n")

In [None]:
# Bar chart of the ten most frequent makes, annotated with the counts.
top_make_counts = cars['standard_make'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
top_make_counts.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)

ax.set_title('Top 10 Most Frequent standard_Makes', fontsize=14, color='blue')
ax.set_xlabel('Car Make', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

# Write each count 1000 units above its bar.
for index, value in enumerate(top_make_counts):
    ax.text(index, value + 1000, f'{value:,}', ha='center', fontsize=10)

ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Analysis for 'standard_model'
model_column = cars['standard_model']

unique_models = model_column.nunique()
model_freq = model_column.value_counts().head(10)
missing_standard_model = model_column.isnull().sum()

print("Unique Values Count:", unique_models, sep="\n")
print("\nTop 10 Frequent Models:", model_freq, sep="\n")
print("\nMissing Values:", missing_standard_model, sep="\n")

In [None]:
# Bar chart of the twenty most frequent models, annotated with the counts.
top_model_counts = cars['standard_model'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(14, 7))
top_model_counts.plot(kind='bar', color='lightcoral', edgecolor='black', ax=ax)

ax.set_title('Top 20 Most Frequent Car Models', fontsize=14, color='blue')
ax.set_xlabel('Car Model', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

# Write each count 300 units above its bar.
for index, value in enumerate(top_model_counts):
    ax.text(index, value + 300, f'{value:,}', ha='center', fontsize=9)

ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Analysis for 'standard_colour'
colour_column = cars['standard_colour']

unique_colours = colour_column.nunique()
colour_freq = colour_column.value_counts().head(10)
missing_standard_colour = colour_column.isnull().sum()

print("Unique Values Count:", unique_colours, sep="\n")
print("\nTop 10 Frequent Colours:", colour_freq, sep="\n")
print("\nMissing Values:", missing_standard_colour, sep="\n")

In [None]:
# Bar chart of every standard colour, annotated with the counts.
colour_counts = cars['standard_colour'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
colour_counts.plot(kind='bar', color='mediumpurple', edgecolor='black', ax=ax)

ax.set_title('Distribution of Standard Colours', fontsize=14, color='blue')
ax.set_xlabel('Colour', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

# Write each count 1000 units above its bar.
for index, value in enumerate(colour_counts):
    ax.text(index, value + 1000, f'{value:,}', ha='center', fontsize=10)

ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Pairwise correlation between mileage, registration year and price,
# printed as a table and drawn as an annotated heatmap.
correlations = cars.loc[:, ['mileage', 'year_of_registration', 'price']].corr()
print(correlations)

heatmap_ax = sns.heatmap(correlations, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Correlation of each numeric feature with price, printed and shown as bars.
numeric_columns = ['mileage', 'year_of_registration']

numeric_correlations = cars[numeric_columns].corrwith(cars['price'])
print(numeric_correlations)

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(numeric_correlations.index, numeric_correlations.values, color=['skyblue', 'orange'])
ax.set_title("Correlation of 'mileage' and 'year_of_registration' with 'price'", fontsize=14, fontweight='bold')
ax.set_xlabel("Features", fontsize=12)
ax.set_ylabel("Correlation with Price", fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA F-statistic of price across the categories of every
# categorical column.
# NOTE(review): `price_bins` is derived from price itself, so its F-statistic
# is not an independent signal.
categorical_columns = cars.select_dtypes(include=['object', 'bool', 'category']).columns

anova_scores = {}
for col in categorical_columns:
    if cars[col].nunique() > 1:
        # groupby() leaves out rows whose category is missing, and
        # observed=True skips unused categorical levels. The earlier
        # `cars[col] == category` filter produced an empty group for NaN,
        # which made f_oneway return NaN for every column with missing values.
        groups = [
            prices.dropna()
            for _, prices in cars.groupby(col, observed=True)['price']
        ]
        groups = [prices for prices in groups if len(prices) > 0]
        if len(groups) < 2:
            # ANOVA needs at least two non-empty groups.
            continue
        f_stat, p_value = f_oneway(*groups)
        anova_scores[col] = f_stat

anova_scores = pd.Series(anova_scores).sort_values(ascending=False)

print("ANOVA F-Statistics for Categorical Features with Price:")
print(anova_scores)

plt.figure(figsize=(12, 6))
plt.bar(anova_scores.index, anova_scores.values, color='green')
plt.title("ANOVA F-Statistics for Categorical Features with Price", fontsize=16, fontweight='bold')
plt.xlabel("Features", fontsize=12)
plt.ylabel("F-Statistic", fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Price box plots for the ten most frequent models, restricted to a price
# window and split by whether the registration year is after 2010.
price_min, price_max = 0, 50000

top_10_models = cars['standard_model'].value_counts().head(10).index

in_top_models = cars['standard_model'].isin(top_10_models)
in_price_window = cars['price'].between(price_min, price_max)
filtered_data_limited = cars[in_top_models & in_price_window]

custom_palette = {True: "#1f77b4", False: "#ff7f0e"}

boxplot_style = dict(
    showmeans=False,
    meanline=False,
    flierprops={'marker': 'o', 'markersize': 5, 'color': 'black'},
    whiskerprops={'linewidth': 1},
    boxprops={'linewidth': 1},
)

# Rows with a missing registration year compare as False and therefore fall
# into the "not after 2010" group.
sns.boxplot(
    x=filtered_data_limited['standard_model'],
    y=filtered_data_limited['price'],
    hue=filtered_data_limited['year_of_registration'] > 2010,
    palette=custom_palette,
    **boxplot_style
)

plt.title(f"Price Distribution by 10 Most Frequent Models (Price Range: {price_min:,} - {price_max:,})")
plt.xlabel("Standard Model")
plt.ylabel("Price")
plt.legend(title="Year > 2010")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Mean price per category for every object/category column.
categorical_columns = cars.select_dtypes(include=['object', 'category']).columns

# Built in a comprehension so no loop temporary leaks into the notebook
# namespace; the earlier loop variable `mean_price` replaced the scalar mean
# price computed for the price histogram with a per-category Series.
category_price_relationships = {
    column: cars.groupby(column)['price'].mean()
    for column in categorical_columns
}

for column, values in category_price_relationships.items():
    print(f"Column: {column}")
    print(values)
    print("\n")

In [None]:
# One bar chart of average price per category for each qualitative column.
qualitative_cols = ['vehicle_condition', 'body_type', 'fuel_type', 'crossover_car_and_van',
                    'standard_make', 'standard_model', 'standard_colour']

for col in qualitative_cols:
    mean_prices = cars.groupby(col)['price'].mean().sort_values(ascending=False)
    title_suffix = ""

    # Make and model are limited to the 20 highest averages.
    if col in ('standard_make', 'standard_model'):
        mean_prices = mean_prices.head(20)
        title_suffix = " (Top 20)"

    plt.figure(figsize=(12, 6))
    sns.barplot(x=mean_prices.index, y=mean_prices.values, palette='viridis')
    plt.title(f"Average Price by {col}{title_suffix}", fontsize=16, fontweight='bold')
    plt.xlabel(col.capitalize(), fontsize=12)
    plt.ylabel("Average Price", fontsize=12)
    plt.xticks(rotation=90, fontsize=10)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# Data Preprocessing

In [None]:
# One row per column: missing count, cardinality, dtype, and (for numeric
# columns only) mean / median / min / max; other columns get NaN there.
numeric_part = cars.select_dtypes(include='number')

summary_table = (
    pd.DataFrame({
        'Missing Values': cars.isnull().sum(),
        'Unique Values': cars.nunique(),
        'Data Type': cars.dtypes,
        'Mean': numeric_part.mean(),
        'Median': numeric_part.median(),
        'Min': numeric_part.min(),
        'Max': numeric_part.max()
    })
    .reset_index()
    .rename(columns={'index': 'Column'})
)

# Widen pandas' display limits so the whole table prints without truncation.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 1000),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

print(summary_table)

In [None]:
# Count and percentage of rows with no registration year.
missing_count = cars['year_of_registration'].isnull().sum()
total_count = len(cars)
missing_percentage = (missing_count / total_count) * 100
print(f"Missing values of year_of_registration: {missing_count} ({missing_percentage:.2f}%)")

In [None]:
# Earliest and latest registration year present.
print(cars['year_of_registration'].min(), cars['year_of_registration'].max())

In [None]:
# Rows whose registration year is earlier than 1900.
outliers_before_1900 = cars[cars['year_of_registration'] < 1900]

print(f"Number of records with 'year_of_registration' before 1900: {len(outliers_before_1900)}")
outliers_before_1900.head()

In [None]:
# Mileage range among adverts whose condition is 'NEW'.
new_vehicles_mileage = cars[cars['vehicle_condition'] == 'NEW']['mileage']

print(new_vehicles_mileage.min())
print(new_vehicles_mileage.max())

In [None]:
# Latest registration year present.
print(cars['year_of_registration'].max())

In [None]:
# Rows that have a reg_code but no registration year.
missing_year_with_reg_code = cars[cars['reg_code'].notna() & cars['year_of_registration'].isna()]

missing_year_with_reg_code.head()

In [None]:
# Number of rows missing both the registration year and the reg_code.
missing_both = cars[cars['year_of_registration'].isna() & cars['reg_code'].isna()]

missing_both_count = missing_both.shape[0]
missing_both_count

In [None]:
# Rows with mileage between 0 and 100 (inclusive) and no registration year.
vehicles_with_low_mileage_and_no_year = cars[
    (cars['mileage'] >= 0) &
    (cars['mileage'] <= 100) &
    (cars['year_of_registration'].isnull())
]

print(f"Number of vehicles: {vehicles_with_low_mileage_and_no_year.shape[0]}")

In [None]:
# filling missing values of year_of_registration for New Cars
import datetime

current_year = datetime.datetime.now().year

# Fill value: the year after the latest one in the data, capped at the
# current calendar year.
max_year = cars['year_of_registration'].max()
if max_year + 1 <= current_year:
    fill_year = max_year + 1
else:
    fill_year = current_year

# Only rows whose year is actually missing are filled. Without the isna()
# condition every NEW low-mileage row was overwritten, and re-running the cell
# pushed those rows one year further each time because max_year had grown.
new_car_without_year = (
    (cars['vehicle_condition'] == 'NEW')
    & cars['mileage'].between(0, 100)
    & cars['year_of_registration'].isna()
)
cars.loc[new_car_without_year, 'year_of_registration'] = fill_year
cars['year_of_registration'].isnull().sum()

In [None]:
# Distinct registration years now present among adverts whose condition is 'NEW'.
new_cars_years = cars.loc[cars['vehicle_condition'] == 'NEW', 'year_of_registration'].unique()

print(new_cars_years)

In [None]:
# filling missing values of year_of_registration after 2001

# Decoded years later than this are discarded.
MAX_DECODED_YEAR = 2021

# True only for reg codes made of exactly two digits; missing codes compare as False.
valid_reg_code = cars['reg_code'].str.isdigit().eq(True) & cars['reg_code'].str.len().eq(2)

# Numeric value of the valid codes, NaN everywhere else (no temporary column needed).
reg_code_numeric = pd.to_numeric(cars['reg_code'].where(valid_reg_code), errors='coerce')

# Codes 1-50 decode to 2000 + code; codes 51-99 decode to 2000 + code - 50.
decoded_year = 2000 + reg_code_numeric.where(reg_code_numeric <= 50, reg_code_numeric - 50)
decodable = reg_code_numeric.between(1, 99) & (decoded_year <= MAX_DECODED_YEAR)

# Only missing years are filled. The previous version assigned the decoded
# year to every row with a decodable code and so replaced years that were
# already recorded.
# NOTE(review): implausible recorded years (e.g. before 1900) are therefore no
# longer corrected as a side effect here; handle them in an explicit step.
fill_rows = cars['year_of_registration'].isna() & decodable
cars.loc[fill_rows, 'year_of_registration'] = decoded_year[fill_rows]

cars['year_of_registration'].isnull().sum()

In [None]:
# fill missing values of year_of_registration between 1963 and 2001

year_from_reg_code = {
    1963: 'A', 1964: 'B', 1965: 'C', 1966: 'D', 1967: ['E', 'F'], 1968: 'F', 1969: 'G', 1970: 'H',
    1971: 'J', 1972: 'K', 1973: 'L', 1974: 'M', 1975: 'N', 1976: 'P', 1977: 'R', 1978: 'S',
    1979: 'T', 1980: 'V', 1981: 'W', 1982: 'X', 1983: 'A', 1984: 'B', 1985: 'C', 1986: 'D',
    1987: 'E', 1988: 'F', 1989: 'G', 1990: 'H', 1991: 'J', 1992: 'K', 1993: 'L', 1994: 'M',
    1995: 'N', 1996: 'P', 1997: 'R', 1998: 'S', 1999: ['T', 'V'], 2000: ['W', 'X'], 2001: 'Y'
}

# Inverse lookup letter -> year. Letters appear under several years; entries
# are written in ascending year order, so each letter ends up mapped to the
# latest year it is listed under.
reg_code_to_year = {}
for year, codes in year_from_reg_code.items():
    for code in (codes if isinstance(codes, list) else [codes]):
        reg_code_to_year[code] = year

# Dedicated seeded generator so the letter chosen for two-letter years is the
# same on every run.
REG_CODE_SEED = 42
reg_code_rng = random.Random(REG_CODE_SEED)


def _reg_code_for_year(year_value):
    """Return the reg-code letter listed for a year, or None if the year is not in the table.

    When a year lists two letters, one is drawn from the seeded generator.
    """
    options = year_from_reg_code.get(int(year_value), None)
    if isinstance(options, list):
        return reg_code_rng.choice(options)
    return options


# Vectorised replacement for the former row-by-row iterrows()/.at loop.
has_reg_code = cars['reg_code'].notna()
year_unknown = cars['year_of_registration'].isna() | (cars['year_of_registration'] == 0)

# 1) Year missing (or 0) and a reg code present: look the year up from the
#    code. Codes absent from the table yield NaN.
fill_year_rows = year_unknown & has_reg_code
cars.loc[fill_year_rows, 'year_of_registration'] = (
    cars.loc[fill_year_rows, 'reg_code'].map(reg_code_to_year)
)

# 2) Reg code missing and a year present: derive the code from the year.
#    Years absent from the table leave the code missing. These rows are
#    disjoint from step 1 because they have no reg code.
fill_code_rows = ~has_reg_code & cars['year_of_registration'].notna()
cars.loc[fill_code_rows, 'reg_code'] = (
    cars.loc[fill_code_rows, 'year_of_registration'].map(_reg_code_for_year)
)

cars['year_of_registration'].isnull().sum()

In [None]:
# Count and percentage of rows with no mileage.
missing_count = cars['mileage'].isnull().sum()
total_count = len(cars)
missing_percentage = (missing_count / total_count) * 100
print(f"Missing values: {missing_count} ({missing_percentage:.2f}%)")

In [None]:
# Descriptive statistics for mileage and an outlier count using the
# 1.5 * IQR fences around the quartiles.
mileage_stats = cars['mileage'].describe()

print("Descriptive Statistics for Mileage:", mileage_stats, sep="\n")

Q1, Q3 = mileage_stats['25%'], mileage_stats['75%']
IQR = Q3 - Q1
print(f"\nInterquartile Range (IQR): {IQR}")

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows with missing mileage satisfy neither comparison and are not counted.
below_fence = cars['mileage'] < lower_bound
above_fence = cars['mileage'] > upper_bound
outliers = cars[below_fence | above_fence]
print(f"\nNumber of Outliers: {len(outliers)}")

In [None]:
# Correlation of mileage with registration year and with price.
correlation_with_year = cars['mileage'].corr(cars['year_of_registration'])
correlation_with_price = cars['mileage'].corr(cars['price'])

print(f"Correlation between mileage and year_of_registration: {correlation_with_year}")
print(f"Correlation between mileage and price: {correlation_with_price}")

In [None]:
# Heuristic score per object/bool column: the sum over categories of the
# squared gap between the category's mean mileage and the overall mean.
qualitative_columns = cars.select_dtypes(include=['object', 'bool']).columns

overall_mean = cars['mileage'].mean()

relationship_strength_mileage = {}
for column in qualitative_columns:
    if cars[column].nunique() <= 1:
        continue  # a single category carries no contrast
    category_means = cars.groupby(column)['mileage'].mean()
    relationship_strength_mileage[column] = ((category_means - overall_mean) ** 2).sum()

relationship_strength_mileage_sorted = sorted(
    relationship_strength_mileage.items(),
    key=lambda item: item[1],
    reverse=True,
)

for column, strength in relationship_strength_mileage_sorted:
    print(f"Column: {column}, Relationship Strength: {strength}")

In [None]:
# Median mileage for every (make, model, registration year) combination,
# as a dict keyed by that tuple.
grouped_mileage_make_model_year = (
    cars.groupby(['standard_make', 'standard_model', 'year_of_registration'])['mileage']
    .median()
    .to_dict()
)


def estimate_mileage_make_model_year(row):
    """Return the row's mileage, or the median of its make/model/year group when it is missing.

    Returns None when the mileage is missing and the group has no entry.
    """
    if pd.isnull(row['mileage']):
        return grouped_mileage_make_model_year.get(
            (row['standard_make'], row['standard_model'], row['year_of_registration']), None
        )
    return row['mileage']


# Only rows with missing mileage need the lookup; applying the function to
# every row of the frame returned the existing value unchanged for all others.
mileage_is_missing = cars['mileage'].isnull()
if mileage_is_missing.any():
    cars.loc[mileage_is_missing, 'mileage'] = (
        cars.loc[mileage_is_missing]
        .apply(estimate_mileage_make_model_year, axis=1)
        .astype(float)  # None (no estimate) becomes NaN so the column stays numeric
    )

remaining_missing_mileage = cars['mileage'].isnull().sum()
print(f"Remaining missing mileage values after considering make, model, and year: {remaining_missing_mileage}")

In [None]:
# Median mileage for every (make, model) combination, as a dict keyed by that tuple.
grouped_mileage_make_model = (
    cars.groupby(['standard_make', 'standard_model'])['mileage']
    .median()
    .to_dict()
)


def estimate_mileage_make_model(row):
    """Return the row's mileage, or the median of its make/model group when it is missing.

    Returns None when the mileage is missing and the group has no entry.
    """
    if pd.isnull(row['mileage']):
        return grouped_mileage_make_model.get(
            (row['standard_make'], row['standard_model']), None
        )
    return row['mileage']


# Only rows with missing mileage need the lookup; applying the function to
# every row of the frame returned the existing value unchanged for all others.
mileage_is_missing = cars['mileage'].isnull()
if mileage_is_missing.any():
    cars.loc[mileage_is_missing, 'mileage'] = (
        cars.loc[mileage_is_missing]
        .apply(estimate_mileage_make_model, axis=1)
        .astype(float)  # None (no estimate) becomes NaN so the column stays numeric
    )

remaining_missing_mileage = cars['mileage'].isnull().sum()
print(f"Remaining missing mileage values after considering make and model: {remaining_missing_mileage}")

In [None]:
# Count and percentage of rows with no body_type.
print(cars['body_type'].isnull().sum())
null_percentage = (cars['body_type'].isnull().sum() / len(cars)) * 100
print(f"Percentage of missing body_type: {null_percentage:.2f}%")

In [None]:
# Frequency of each body_type (missing values excluded).
print(cars['body_type'].value_counts())

In [None]:
# Heuristic score per numeric column: the sum over body types of the squared
# gap between the body type's mean and the column's overall mean.
numerical_columns = cars.select_dtypes(include=['float64', 'int64']).columns

rows_by_body_type = cars.groupby('body_type')

relationship_strength_numeric = {}
for column in numerical_columns:
    if not cars[column].notnull().any():
        continue  # column has no values at all
    category_means = rows_by_body_type[column].mean()
    overall_mean = cars[column].mean()
    relationship_strength_numeric[column] = ((category_means - overall_mean) ** 2).sum()

relationship_strength_numeric_sorted = sorted(
    relationship_strength_numeric.items(),
    key=lambda item: item[1],
    reverse=True,
)

for column, strength in relationship_strength_numeric_sorted:
    print(f"Column: {column}, Relationship Strength: {strength}")

In [None]:
# Heuristic score per object/bool column: the number of its categories whose
# most frequent body_type differs from the overall most frequent body_type.
qualitative_columns = cars.select_dtypes(include=['object', 'bool']).columns

overall_mode = cars['body_type'].mode()[0]

relationship_strength_body_type = {}
for column in qualitative_columns:
    if cars[column].nunique() <= 1:
        continue  # a single category carries no contrast
    # First mode of each group, or None when the group has no body_type at all.
    category_means = cars.groupby(column)['body_type'].apply(
        lambda x: next(iter(x.mode()), None)
    )
    relationship_strength_body_type[column] = ((category_means != overall_mode).sum())

relationship_strength_body_type_sorted = sorted(
    relationship_strength_body_type.items(),
    key=lambda item: item[1],
    reverse=True,
)

for column, strength in relationship_strength_body_type_sorted:
    print(f"Column: {column}, Relationship Strength: {strength}")

In [None]:
# Most frequent body_type for every combination of make, model, price band,
# mileage band and crossover flag.
# observed=True keeps only combinations that occur in the data; two of the
# keys are categorical, and without it the result is expanded to unobserved
# combinations, none of which a row lookup can ever hit.
grouped_by_features = cars.groupby(
    ['standard_make', 'standard_model', 'price_bins', 'mileage_bins', 'crossover_car_and_van'],
    observed=True
)['body_type'].agg(lambda x: next(iter(x.mode()), None))  # first mode, or None if the group has no body_type


def estimate_body_type_from_grouped_features(row):
    """Return the row's body_type, or the most frequent one in its feature group when missing.

    Returns None when the body_type is missing and the group has no entry.
    """
    if pd.isnull(row['body_type']):
        return grouped_by_features.get(
            (row['standard_make'], row['standard_model'], row['price_bins'], row['mileage_bins'], row['crossover_car_and_van']),
            None
        )
    return row['body_type']


# Only rows with a missing body_type need the lookup; applying the function
# to every row returned the existing value unchanged for all others.
body_type_is_missing = cars['body_type'].isnull()
if body_type_is_missing.any():
    cars.loc[body_type_is_missing, 'body_type'] = cars.loc[body_type_is_missing].apply(
        estimate_body_type_from_grouped_features, axis=1
    )

remaining_missing = cars['body_type'].isnull().sum()
print(f"Remaining missing body_type values: {remaining_missing}")

In [None]:
# Imputation tier: fill missing body_type with the most common body_type among
# listings sharing model, price bin and mileage bin.
body_type_group_keys = ['standard_model', 'price_bins', 'mileage_bins']

# Lookup table: key combination -> modal body_type (None when the group has no known value).
grouped_by_features = cars.groupby(body_type_group_keys)['body_type'].agg(
    lambda labels: next(iter(labels.mode()), None)
)

def estimate_body_type_from_grouped_features(row):
    """Return the row's body_type, or its group's modal body_type when missing.

    Yields None when the row's key combination is not in the lookup table.
    """
    if not pd.isnull(row['body_type']):
        return row['body_type']
    lookup_key = tuple(row[key] for key in body_type_group_keys)
    return grouped_by_features.get(lookup_key)

cars['body_type'] = cars.apply(estimate_body_type_from_grouped_features, axis=1)

remaining_missing = cars['body_type'].isnull().sum()
print(f"Remaining missing body_type values: {remaining_missing}")

In [None]:
# Imputation tier: fill missing body_type with the most common body_type among
# listings sharing make, price bin and mileage bin.
body_type_group_keys = ['standard_make', 'price_bins', 'mileage_bins']

# Lookup table: key combination -> modal body_type (None when the group has no known value).
grouped_by_features = cars.groupby(body_type_group_keys)['body_type'].agg(
    lambda labels: next(iter(labels.mode()), None)
)

def estimate_body_type_from_grouped_features(row):
    """Return the row's body_type, or its group's modal body_type when missing.

    Yields None when the row's key combination is not in the lookup table.
    """
    if not pd.isnull(row['body_type']):
        return row['body_type']
    lookup_key = tuple(row[key] for key in body_type_group_keys)
    return grouped_by_features.get(lookup_key)

cars['body_type'] = cars.apply(estimate_body_type_from_grouped_features, axis=1)

remaining_missing = cars['body_type'].isnull().sum()
print(f"Remaining missing body_type values: {remaining_missing}")

In [None]:
# Imputation tier: fall back to the most common body_type of the same model.
grouped_body_type_model = cars.groupby(['standard_model'])['body_type'].agg(
    lambda labels: next(iter(labels.mode()), None)
)

def estimate_body_type_model(row):
    """Return the row's body_type, or the modal body_type of its model when missing.

    Yields None when the model is not in the lookup table.
    """
    current = row['body_type']
    if not pd.isnull(current):
        return current
    return grouped_body_type_model.get(row['standard_model'])

cars['body_type'] = cars.apply(estimate_body_type_model, axis=1)

remaining_missing_body_type = cars['body_type'].isnull().sum()
print(f"Remaining missing body_type values after using model: {remaining_missing_body_type}")

In [None]:
# Imputation tier: fall back to the most common body_type of the same make.
grouped_body_type_make = cars.groupby(['standard_make'])['body_type'].agg(
    lambda labels: next(iter(labels.mode()), None)
)

def estimate_body_type_make(row):
    """Return the row's body_type, or the modal body_type of its make when missing.

    Yields None when the make is not in the lookup table.
    """
    current = row['body_type']
    if not pd.isnull(current):
        return current
    return grouped_body_type_make.get(row['standard_make'])

cars['body_type'] = cars.apply(estimate_body_type_make, axis=1)

remaining_missing_body_type = cars['body_type'].isnull().sum()
print(f"Remaining missing body_type values after using make: {remaining_missing_body_type}")

In [None]:
# Inspect the rows that still lack a body_type and save them for offline review.
still_missing_mask = cars['body_type'].isnull()
missing_body_type_records = cars.loc[still_missing_mask]

print(missing_body_type_records)

missing_body_type_records.to_csv('missing_body_type_records.csv', index=False)

In [None]:
# Imputation tier: fill missing body_type with the most common body_type among
# listings sharing mileage bin, registration year and price bin.
body_type_group_keys = ['mileage_bins', 'year_of_registration', 'price_bins']

# Lookup table: key combination -> modal body_type (None when the group has no known value).
grouped_by_features = cars.groupby(body_type_group_keys)['body_type'].agg(
    lambda labels: next(iter(labels.mode()), None)
)

def estimate_body_type_from_grouped_features(row):
    """Return the row's body_type, or its group's modal body_type when missing.

    Yields None when the row's key combination is not in the lookup table.
    """
    if not pd.isnull(row['body_type']):
        return row['body_type']
    lookup_key = tuple(row[key] for key in body_type_group_keys)
    return grouped_by_features.get(lookup_key)

cars['body_type'] = cars.apply(estimate_body_type_from_grouped_features, axis=1)

remaining_missing = cars['body_type'].isnull().sum()
print(f"Remaining missing body_type values: {remaining_missing}")

In [None]:
# Imputation tier: fill missing body_type with the most common body_type among
# listings sharing mileage bin and price bin.
body_type_group_keys = ['mileage_bins', 'price_bins']

# Lookup table: key combination -> modal body_type (None when the group has no known value).
grouped_by_features = cars.groupby(body_type_group_keys)['body_type'].agg(
    lambda labels: next(iter(labels.mode()), None)
)

def estimate_body_type_from_grouped_features(row):
    """Return the row's body_type, or its group's modal body_type when missing.

    Yields None when the row's key combination is not in the lookup table.
    """
    if not pd.isnull(row['body_type']):
        return row['body_type']
    lookup_key = tuple(row[key] for key in body_type_group_keys)
    return grouped_by_features.get(lookup_key)

cars['body_type'] = cars.apply(estimate_body_type_from_grouped_features, axis=1)

remaining_missing = cars['body_type'].isnull().sum()
print(f"Remaining missing body_type values: {remaining_missing}")

In [None]:
# Range of the mileage column before outlier handling.
mileage_column = cars['mileage']
min_mileage, max_mileage = mileage_column.min(), mileage_column.max()

print(f"Minimum Mileage: {min_mileage}")
print(f"Maximum Mileage: {max_mileage}")

In [None]:
# Tukey fences on mileage: anything beyond 1.5 * IQR from the quartiles is an outlier.
mileage_column = cars['mileage']
Q1 = mileage_column.quantile(0.25)
Q3 = mileage_column.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

is_mileage_outlier = mileage_column.lt(lower_bound) | mileage_column.gt(upper_bound)
outliers = cars[is_mileage_outlier]

outliers_count = outliers.shape[0]
print(f"Number of outliers: {outliers_count}")
print(f"Lower bound: {lower_bound}")
print(f"Upper bound: {upper_bound}")

print(outliers[['mileage']].head())

In [None]:
# Listings above the upper fence computed in the IQR cell (relies on `upper_bound`).
high_outliers = cars.loc[cars['mileage'].gt(upper_bound)]

print(f"Number of high outliers: {high_outliers.shape[0]}")

preview_columns = ['mileage', 'price', 'year_of_registration', 'vehicle_condition']
print(high_outliers[preview_columns].head())

In [None]:
# Histogram of mileage with upper-percentile markers to judge where the tail starts.
percentiles = cars['mileage'].dropna().quantile([0.75, 0.9, 0.95, 0.99])

def format_ticks(value, _):
    """Render an axis tick as an integer with thousands separators."""
    return format(int(value), ',')

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(cars['mileage'].dropna(), bins=50, alpha=0.7, label='Mileage Distribution', color='orange')

percentile_markers = [
    (0.75, 'green', '75th Percentile'),
    (0.9, 'orange', '90th Percentile'),
    (0.95, 'red', '95th Percentile'),
    (0.99, 'purple', '99th Percentile'),
]
for level, line_colour, line_label in percentile_markers:
    ax.axvline(percentiles[level], color=line_colour, linestyle='--', label=line_label)

ax.set_title('Mileage Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Mileage', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.legend()

ax.xaxis.set_major_formatter(FuncFormatter(format_ticks))

plt.show()

In [None]:
# Distinct body_type labels currently in the frame (unique() keeps NaN if any remain).
body_type_column = cars['body_type']
unique_body_types = body_type_column.unique()

print("Unique values in 'body_type':")
print(unique_body_types)

In [None]:
# Body types grouped into the coarse vehicle categories used for per-category outlier thresholds.
sports_types = ['Convertible', 'Coupe']
passenger_types = ['SUV', 'Saloon', 'Hatchback', 'Limousine', 'Estate', 'MPV']
commercial_types = ['Pickup', 'Combi Van', 'Panel Van', 'Chassis Cab', 'Car Derived Van']
special_types = ['Minibus', 'Window Van', 'Camper']

# Checked in this order; the first list containing the body type wins.
_CATEGORY_MEMBERS = (
    ('Sports', sports_types),
    ('Passenger', passenger_types),
    ('Commercial', commercial_types),
    ('Special', special_types),
)

def categorize_body_type(body_type):
    """Map a body_type label to its vehicle category.

    Returns 'Sports', 'Passenger', 'Commercial' or 'Special', and 'Unknown'
    for any value not listed above (including missing values).
    """
    for category, members in _CATEGORY_MEMBERS:
        if body_type in members:
            return category
    return 'Unknown'

# Derive the coarse category for every listing and show how the rows split across categories.
cars['vehicle_category'] = cars['body_type'].map(categorize_body_type)

category_counts = cars['vehicle_category'].value_counts()
print(category_counts)

In [None]:
# Show the current column index of the frame.
cars.columns

In [None]:
def calculate_iqr(data):
    """Return the upper Tukey fence of `data`: Q3 + 1.5 * (Q3 - Q1).

    Despite the name, the fence is returned, not the interquartile range itself.
    """
    lower_quartile, upper_quartile = data.quantile(0.25), data.quantile(0.75)
    spread = upper_quartile - lower_quartile
    return upper_quartile + 1.5 * spread

# Candidate mileage caps per vehicle category: the upper IQR fence and the
# 95th / 99th percentiles, gathered into one comparison table.
mileage_by_category = {
    category: cars.loc[cars['vehicle_category'] == category, 'mileage']
    for category in ('Passenger', 'Commercial', 'Special', 'Sports')
}

passenger_upper = calculate_iqr(mileage_by_category['Passenger'])
commercial_upper = calculate_iqr(mileage_by_category['Commercial'])
special_upper = calculate_iqr(mileage_by_category['Special'])
sports_upper = calculate_iqr(mileage_by_category['Sports'])

passenger_percentiles = mileage_by_category['Passenger'].quantile([0.95, 0.99])
commercial_percentiles = mileage_by_category['Commercial'].quantile([0.95, 0.99])
special_percentiles = mileage_by_category['Special'].quantile([0.95, 0.99])
sports_percentiles = mileage_by_category['Sports'].quantile([0.95, 0.99])

thresholds = {
    category: {
        'IQR_Upper': iqr_upper,
        '95th Percentile': category_percentiles[0.95],
        '99th Percentile': category_percentiles[0.99],
    }
    for category, iqr_upper, category_percentiles in (
        ('Passenger', passenger_upper, passenger_percentiles),
        ('Commercial', commercial_upper, commercial_percentiles),
        ('Special', special_upper, special_percentiles),
        ('Sports', sports_upper, sports_percentiles),
    )
}

# One row per category, one column per candidate threshold.
thresholds_df = pd.DataFrame(thresholds).T

thresholds_df

In [None]:
# Mileage histogram for the Sports category with its 95th / 99th percentile markers.
Sports_mileage = cars.loc[cars['vehicle_category'] == 'Sports', 'mileage']

def format_ticks(value, _):
    """Render an axis tick as an integer with thousands separators."""
    return format(int(value), ',')

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(Sports_mileage.dropna(), bins=50, alpha=0.7, color='green', label='Sports')
ax.set_title('Mileage Distribution for Sports Vehicles')
ax.set_xlabel('Mileage')
ax.set_ylabel('Frequency')

for level, line_colour, line_label in ((0.95, 'red', '95th Percentile'), (0.99, 'green', '99th Percentile')):
    ax.axvline(Sports_mileage.quantile(level), color=line_colour, linestyle='--', label=line_label)

ax.xaxis.set_major_formatter(FuncFormatter(format_ticks))

ax.legend()

plt.show()

In [None]:
# Mileage histogram for the Passenger category with its 95th / 99th percentile markers.
passenger_mileage = cars.loc[cars['vehicle_category'] == 'Passenger', 'mileage']

def format_ticks(value, _):
    """Render an axis tick as an integer with thousands separators."""
    return format(int(value), ',')

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(passenger_mileage.dropna(), bins=50, alpha=0.7, color='blue', label='Passenger')
ax.set_title('Mileage Distribution for Passenger Vehicles')
ax.set_xlabel('Mileage')
ax.set_ylabel('Frequency')

for level, line_colour, line_label in ((0.95, 'red', '95th Percentile'), (0.99, 'green', '99th Percentile')):
    ax.axvline(passenger_mileage.quantile(level), color=line_colour, linestyle='--', label=line_label)

ax.xaxis.set_major_formatter(FuncFormatter(format_ticks))

ax.legend()

plt.show()

In [None]:
# Mileage histogram for the Commercial category with its 95th / 99th percentile markers.
commercial_mileage = cars.loc[cars['vehicle_category'] == 'Commercial', 'mileage']

def format_ticks(value, _):
    """Render an axis tick as an integer with thousands separators."""
    return format(int(value), ',')

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(commercial_mileage.dropna(), bins=50, alpha=0.7, color='orange', label='Commercial')
ax.set_title('Mileage Distribution for Commercial Vehicles')
ax.set_xlabel('Mileage')
ax.set_ylabel('Frequency')

for level, line_colour, line_label in ((0.95, 'red', '95th Percentile'), (0.99, 'green', '99th Percentile')):
    ax.axvline(commercial_mileage.quantile(level), color=line_colour, linestyle='--', label=line_label)

ax.xaxis.set_major_formatter(FuncFormatter(format_ticks))

ax.legend()

plt.show()

In [None]:
# Mileage histogram for the Special category with its 95th / 99th percentile markers.
# Unlike the other category plots, the x axis keeps matplotlib's default tick format.
special_mileage = cars.loc[cars['vehicle_category'] == 'Special', 'mileage']

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(special_mileage.dropna(), bins=50, alpha=0.7, color='purple', label='Special')
ax.set_title('Mileage Distribution for Special Vehicles')
ax.set_xlabel('Mileage')
ax.set_ylabel('Frequency')

for level, line_colour, line_label in ((0.95, 'red', '95th Percentile'), (0.99, 'green', '99th Percentile')):
    ax.axvline(special_mileage.quantile(level), color=line_colour, linestyle='--', label=line_label)

ax.legend()

plt.show()

In [None]:
import matplotlib.ticker as mticker

# Box plot of mileage per vehicle category, before any capping is applied.
palette = sns.color_palette("Set2")

fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(x='vehicle_category', y='mileage', data=cars, palette=palette, linewidth=2.5, ax=ax)

# Thousands separators on the mileage axis.
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda tick, _: f'{int(tick):,}'))

ax.set_title('Mileage Distribution by Vehicle Category (Before Outlier Handling)', fontsize=16, fontweight='bold', color='darkblue')
ax.set_ylabel('Mileage', fontsize=14, fontweight='bold')
ax.set_xlabel('Vehicle Category', fontsize=14, fontweight='bold')

plt.xticks(rotation=45, fontsize=12)

ax.grid(axis='y', linestyle='--', alpha=0.7)

# NOTE(review): ax.artists may be empty on recent matplotlib/seaborn, where the
# boxes are stored as patches, in which case this loop does nothing. Confirm.
for artist in ax.artists:
    artist.set_alpha(0.7)

fig.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.2)

plt.show()

In [None]:
# Cap extreme mileage per vehicle category at 120% of that category's 99th percentile.
base_upper_bounds = cars.groupby('vehicle_category')['mileage'].quantile(0.99)

# 20% head-room above the 99th percentile so only extreme values get capped.
flexible_upper_bounds = base_upper_bounds * 1.2

def replace_outliers_with_flexibility(row):
    """Return the row's mileage, capped at its category's flexible upper bound.

    Rows whose category has no bound, or whose mileage is missing, are returned unchanged.
    """
    upper_bound = flexible_upper_bounds.get(row['vehicle_category'], None)
    # Test for presence explicitly: a plain truthiness check would treat a
    # legitimate bound of 0.0 as "no bound" and leave those rows uncapped.
    if upper_bound is not None and row['mileage'] > upper_bound:
        return upper_bound
    return row['mileage']

cars['mileage'] = cars.apply(replace_outliers_with_flexibility, axis=1)

print(cars['mileage'].describe())

In [None]:
import matplotlib.ticker as mticker

# Box plot of mileage per vehicle category, after the per-category capping.
palette = sns.color_palette("Set2")

fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(x='vehicle_category', y='mileage', data=cars, palette=palette, linewidth=2.5, ax=ax)

# Thousands separators on the mileage axis.
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda tick, _: f'{int(tick):,}'))

ax.set_title('Mileage Distribution by Vehicle Category (After Outlier Handling)', fontsize=16, fontweight='bold', color='darkblue')
ax.set_ylabel('Mileage', fontsize=14, fontweight='bold')
ax.set_xlabel('Vehicle Category', fontsize=14, fontweight='bold')
plt.xticks(rotation=45, fontsize=12)

ax.grid(axis='y', linestyle='--', alpha=0.7)

# NOTE(review): ax.artists may be empty on recent matplotlib/seaborn, where the
# boxes are stored as patches, in which case this loop does nothing. Confirm.
for artist in ax.artists:
    artist.set_alpha(0.7)

fig.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.2)

plt.show()

In [None]:
# Range of the mileage column after the capping step.
mileage_column = cars['mileage']
min_mileage, max_mileage = mileage_column.min(), mileage_column.max()

print(f"Minimum Mileage: {min_mileage}")
print(f"Maximum Mileage: {max_mileage}")

In [None]:
# Range of registration years present in the data.
year_column = cars['year_of_registration']
min_year, max_year = year_column.min(), year_column.max()

print(f"Minimum year: {min_year}")
print(f"Maximum year: {max_year}")

In [None]:
# How many listings have no registration year yet.
missing_year_count = cars['year_of_registration'].isna().sum()

print(f"Number of missing values in 'year_of_registration': {missing_year_count}")

In [None]:
# Correlation (Series.corr default) of registration year with price and with mileage.
year_column = cars['year_of_registration']
correlation_price = year_column.corr(cars['price'])
correlation_mileage = year_column.corr(cars['mileage'])

print(f"Correlation between year_of_registration and price: {correlation_price}")
print(f"Correlation between year_of_registration and mileage: {correlation_mileage}")

In [None]:
# Rank categorical columns by how far the mean registration year of their levels
# spreads around the overall mean year (sum of squared deviations, so columns
# with many levels tend to score higher).
qualitative_columns = cars.select_dtypes(include=['object', 'bool']).columns

relationship_strength = {}
for column in qualitative_columns:
    if cars[column].nunique() <= 1:
        continue  # a constant column cannot explain any variation
    category_means = cars.groupby(column)['year_of_registration'].mean()
    overall_mean = cars['year_of_registration'].mean()
    squared_deviations = (category_means - overall_mean) ** 2
    relationship_strength[column] = squared_deviations.sum()

relationship_strength_sorted = sorted(
    relationship_strength.items(),
    key=lambda item: item[1],
    reverse=True,
)

for column, strength in relationship_strength_sorted:
    print(f"Column: {column}, Relationship Strength: {strength}")

In [None]:
# Imputation tier: fill missing registration year with the rounded median year of
# listings sharing model, make, mileage bin and vehicle condition.
year_group_keys = ['standard_model', 'standard_make', 'mileage_bins', 'vehicle_condition']

has_year = cars['year_of_registration'].notnull()
grouped_years_model_make_mileage_condition = (
    cars[has_year]
    .groupby(year_group_keys)['year_of_registration']
    .median()
    .round()
)

def estimate_year_model_make_mileage_condition(row):
    """Return the row's year, or its group's rounded median year when missing.

    Yields None when the row's key combination is not in the lookup table.
    """
    if not pd.isnull(row['year_of_registration']):
        return row['year_of_registration']
    lookup_key = tuple(row[key] for key in year_group_keys)
    return grouped_years_model_make_mileage_condition.get(lookup_key)

cars['year_of_registration'] = cars.apply(estimate_year_model_make_mileage_condition, axis=1)

remaining_missing_count = cars['year_of_registration'].isnull().sum()
print(f"Number of vehicles still missing year: {remaining_missing_count}")

In [None]:
# Imputation tier: fill missing registration year with the rounded median year of
# listings sharing model, mileage bin and vehicle condition.
year_group_keys = ['standard_model', 'mileage_bins', 'vehicle_condition']

has_year = cars['year_of_registration'].notnull()
grouped_years_model_mileage_condition = (
    cars[has_year]
    .groupby(year_group_keys)['year_of_registration']
    .median()
    .round()
)

def estimate_year_model_mileage_condition(row):
    """Return the row's year, or its group's rounded median year when missing.

    Yields None when the row's key combination is not in the lookup table.
    """
    if not pd.isnull(row['year_of_registration']):
        return row['year_of_registration']
    lookup_key = tuple(row[key] for key in year_group_keys)
    return grouped_years_model_mileage_condition.get(lookup_key)

cars['year_of_registration'] = cars.apply(estimate_year_model_mileage_condition, axis=1)

remaining_missing_count = cars['year_of_registration'].isnull().sum()
print(f"Number of vehicles still missing year: {remaining_missing_count}")

In [None]:
# Imputation tier: fill missing registration year with the rounded median year of
# listings sharing make, mileage bin and vehicle condition.
year_group_keys = ['standard_make', 'mileage_bins', 'vehicle_condition']

has_year = cars['year_of_registration'].notnull()
grouped_years_make_mileage_condition = (
    cars[has_year]
    .groupby(year_group_keys)['year_of_registration']
    .median()
    .round()
)

def estimate_year_make_mileage_condition(row):
    """Return the row's year, or its group's rounded median year when missing.

    Yields None when the row's key combination is not in the lookup table.
    """
    if not pd.isnull(row['year_of_registration']):
        return row['year_of_registration']
    lookup_key = tuple(row[key] for key in year_group_keys)
    return grouped_years_make_mileage_condition.get(lookup_key)

cars['year_of_registration'] = cars.apply(estimate_year_make_mileage_condition, axis=1)

remaining_missing_count = cars['year_of_registration'].isnull().sum()
print(f"Number of vehicles still missing year: {remaining_missing_count}")

In [None]:
# Imputation tier: fill missing registration year with the rounded median year of
# listings sharing model and make.
year_group_keys = ['standard_model', 'standard_make']

has_year = cars['year_of_registration'].notnull()
grouped_years_simple = (
    cars[has_year]
    .groupby(year_group_keys)['year_of_registration']
    .median()
    .round()
)

def estimate_year_simple(row):
    """Return the row's year, or the rounded median year of its model and make when missing.

    Yields None when the (model, make) pair is not in the lookup table.
    """
    if not pd.isnull(row['year_of_registration']):
        return row['year_of_registration']
    lookup_key = tuple(row[key] for key in year_group_keys)
    return grouped_years_simple.get(lookup_key)

cars['year_of_registration'] = cars.apply(estimate_year_simple, axis=1)

remaining_missing_count = cars['year_of_registration'].isnull().sum()
print(f"Number of vehicles still missing year: {remaining_missing_count}")

In [None]:
# Compare the price distribution of pre-1960 registrations with the full dataset.
registered_before_1960 = cars['year_of_registration'] < 1960
cars_before_1960 = cars.loc[registered_before_1960]

before_1960_price_summary = cars_before_1960['price'].describe()
full_price_summary = cars['price'].describe()

print("Summary of prices for cars before 1960:")
print(before_1960_price_summary)
print("\nSummary of prices for the full dataset:")
print(full_price_summary)

In [None]:
# Number of listings registered before 1960.
registered_before_1960 = cars['year_of_registration'] < 1960
records_before_1960 = len(cars[registered_before_1960])
print(f"Number of records before 1960: {records_before_1960}")

In [None]:
# Tukey fences on price: anything beyond 1.5 * IQR from the quartiles is an outlier.
price_column = cars['price']
Q1 = price_column.quantile(0.25)
Q3 = price_column.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

is_price_outlier = price_column.lt(lower_bound) | price_column.gt(upper_bound)
outliers = cars[is_price_outlier]

outliers_count = len(outliers)

print(f"Total Outliers in Price: {outliers_count}")

In [None]:
# Bar chart of the ten makes contributing the most price outliers
# (relies on `outliers` from the price IQR cell).
outliers_by_make = outliers['standard_make'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))
outliers_by_make.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Top 10 Brands with Most Price Outliers')
ax.set_xlabel('Brand (Standard Make)')
ax.set_ylabel('Count of Outliers')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Recompute the price fences and count price outliers per model.
price_column = cars['price']
Q1 = price_column.quantile(0.25)
Q3 = price_column.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

is_price_outlier = price_column.lt(lower_bound) | price_column.gt(upper_bound)
outliers = cars[is_price_outlier]

outliers_count_by_model = outliers['standard_model'].value_counts()

# Two-column table: model name and its number of outlying listings.
outliers_count_df = outliers_count_by_model.reset_index()
outliers_count_df.columns = ['standard_model', 'outlier_count']

outliers_count_df_sorted = outliers_count_df.sort_values(by='outlier_count', ascending=False)

print("Outlier Counts by Model:")
print(outliers_count_df_sorted)

In [None]:
# Same computation as the previous cell, printed under a price-specific heading.
price_column = cars['price']
Q1 = price_column.quantile(0.25)
Q3 = price_column.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

is_price_outlier = price_column.lt(lower_bound) | price_column.gt(upper_bound)
outliers = cars[is_price_outlier]

outliers_count_by_model = outliers['standard_model'].value_counts()

# Two-column table: model name and its number of outlying listings.
outliers_count_df = outliers_count_by_model.reset_index()
outliers_count_df.columns = ['standard_model', 'outlier_count']

outliers_count_df_sorted = outliers_count_df.sort_values(by='outlier_count', ascending=False)

print("Price Outlier Counts by Model:")
print(outliers_count_df_sorted)

In [None]:
# 99th percentile of price; the next cell uses this as the cut-off for the luxury flag.
luxury_threshold = cars['price'].quantile(0.99)
print(luxury_threshold)

In [None]:
# Flag listings priced above the 99th percentile as luxury (1) versus not (0).
# NOTE(review): is_luxury is derived from price, and the notebook later models
# price_log with the one-hot encoded is_luxury columns still among the features,
# so this flag leaks the target. Confirm whether it should stay a feature.
luxury_threshold = cars['price'].quantile(0.99)
print(luxury_threshold)

cars['is_luxury'] = (cars['price'] > luxury_threshold).astype('int64')

luxury_count = cars['is_luxury'].sum()
print(luxury_count)

In [None]:
# Report missing fuel_type entries: absolute count, then share of all rows.
missing_fuel_type_count = cars['fuel_type'].isnull().sum()
print(missing_fuel_type_count)
null_percentage = (missing_fuel_type_count / len(cars)) * 100
print(f"Percentage of missing fuel_type: {null_percentage:.2f}%")

In [None]:
# Frequency of each fuel_type label (value_counts leaves out missing values by default).
print(cars['fuel_type'].value_counts())

In [None]:
# Rank numeric columns by how far their per-fuel_type means spread around the
# column's overall mean (sum of squared deviations; not scale-normalised).
numerical_columns = cars.select_dtypes(include=['float64', 'int64']).columns

relationship_strength_numeric_fuel = {}
for column in numerical_columns:
    values = cars[column]
    if values.notnull().sum() <= 0:
        continue  # an all-missing column has no means to compare
    deviations = cars.groupby('fuel_type')[column].mean() - values.mean()
    relationship_strength_numeric_fuel[column] = (deviations ** 2).sum()

relationship_strength_numeric_fuel_sorted = sorted(
    relationship_strength_numeric_fuel.items(),
    key=lambda item: item[1],
    reverse=True,
)
print("Relationship strength with numerical columns:")
for column, strength in relationship_strength_numeric_fuel_sorted:
    print(f"{column}: {strength}")

In [None]:
# Rank categorical columns by how many of their levels have a most-common
# fuel_type that differs from the dataset-wide most-common fuel_type.
qualitative_columns = cars.select_dtypes(include=['object', 'bool']).columns

relationship_strength_fuel_qualitative = {}
for column in qualitative_columns:
    if cars[column].nunique() <= 1:
        continue  # a constant column cannot separate fuel types
    category_modes = cars.groupby(column)['fuel_type'].apply(
        lambda labels: next(iter(labels.mode()), None)
    )
    overall_mode = cars['fuel_type'].mode()[0]
    differing_levels = category_modes != overall_mode
    relationship_strength_fuel_qualitative[column] = differing_levels.sum()

relationship_strength_fuel_qualitative_sorted = sorted(
    relationship_strength_fuel_qualitative.items(),
    key=lambda item: item[1],
    reverse=True,
)
print("Relationship strength with qualitative columns:")
for column, strength in relationship_strength_fuel_qualitative_sorted:
    print(f"{column}: {strength}")

In [None]:
# Imputation tier: fill missing fuel_type with the most common fuel_type among
# listings sharing model, make and registration year.
columns_to_group_by_fuel = ['standard_model', 'standard_make', 'year_of_registration']

def fill_missing_fuel_model_make_year(group):
    """Fill missing values of one group's fuel_type with that group's most common value.

    Returns the group unchanged when it holds no known value to take a mode from.
    """
    modes = group.mode()
    if modes.empty:
        return group
    return group.fillna(modes.iloc[0])

group_filled_fuel_type = cars.groupby(columns_to_group_by_fuel)['fuel_type'].transform(fill_missing_fuel_model_make_year)

# groupby drops rows whose key is missing, so transform gives NaN for them.
# Assigning that result directly would erase fuel types already known for such
# rows; only fill the gaps instead.
cars['fuel_type'] = cars['fuel_type'].fillna(group_filled_fuel_type)

remaining_missing_fuel_type = cars['fuel_type'].isna().sum()
print(f"Remaining missing values in 'fuel_type' after grouping by model, make, and year: {remaining_missing_fuel_type}")

In [None]:
# Imputation tier: fill missing fuel_type with the most common fuel_type among
# listings sharing model and registration year.
columns_to_group_by_fuel = ['standard_model', 'year_of_registration']

def fill_missing_fuel_model_year(group):
    """Fill missing values of one group's fuel_type with that group's most common value.

    Returns the group unchanged when it holds no known value to take a mode from.
    """
    modes = group.mode()
    if modes.empty:
        return group
    return group.fillna(modes.iloc[0])

group_filled_fuel_type = cars.groupby(columns_to_group_by_fuel)['fuel_type'].transform(fill_missing_fuel_model_year)

# groupby drops rows whose key is missing, so transform gives NaN for them.
# Assigning that result directly would erase fuel types already known for such
# rows; only fill the gaps instead.
cars['fuel_type'] = cars['fuel_type'].fillna(group_filled_fuel_type)

remaining_missing_fuel_type = cars['fuel_type'].isna().sum()
print(f"Remaining missing values in 'fuel_type' after grouping by model and year: {remaining_missing_fuel_type}")

In [None]:
# Imputation tier: fill missing fuel_type with the most common fuel_type among
# listings sharing make and registration year.
columns_to_group_by_fuel = ['standard_make', 'year_of_registration']

def fill_missing_fuel_make_year(group):
    """Fill missing values of one group's fuel_type with that group's most common value.

    Returns the group unchanged when it holds no known value to take a mode from.
    """
    modes = group.mode()
    if modes.empty:
        return group
    return group.fillna(modes.iloc[0])

group_filled_fuel_type = cars.groupby(columns_to_group_by_fuel)['fuel_type'].transform(fill_missing_fuel_make_year)

# groupby drops rows whose key is missing, so transform gives NaN for them.
# Assigning that result directly would erase fuel types already known for such
# rows; only fill the gaps instead.
cars['fuel_type'] = cars['fuel_type'].fillna(group_filled_fuel_type)

remaining_missing_fuel_type = cars['fuel_type'].isna().sum()
print(f"Remaining missing values in 'fuel_type' after grouping by make and year: {remaining_missing_fuel_type}")


In [None]:
# Imputation tier: fill missing fuel_type with the most common fuel_type among
# listings sharing registration year and vehicle category.
columns_to_group_by_fuel = ['year_of_registration', 'vehicle_category']

def fill_missing_fuel_year_category(group):
    """Fill missing values of one group's fuel_type with that group's most common value.

    Returns the group unchanged when it holds no known value to take a mode from.
    """
    modes = group.mode()
    if modes.empty:
        return group
    return group.fillna(modes.iloc[0])

group_filled_fuel_type = cars.groupby(columns_to_group_by_fuel)['fuel_type'].transform(fill_missing_fuel_year_category)

# groupby drops rows whose key is missing, so transform gives NaN for them.
# Assigning that result directly would erase fuel types already known for such
# rows; only fill the gaps instead.
cars['fuel_type'] = cars['fuel_type'].fillna(group_filled_fuel_type)

remaining_missing_fuel_type = cars['fuel_type'].isna().sum()
print(f"Remaining missing fuel_type values after grouping by year and vehicle_category: {remaining_missing_fuel_type}")

In [None]:
# Last imputation tier: fill missing fuel_type with the most common fuel_type of
# the same vehicle category.
columns_to_group_by_fuel = ['vehicle_category']

def fill_missing_fuel_category(group):
    """Fill missing values of one group's fuel_type with that group's most common value.

    Returns the group unchanged when it holds no known value to take a mode from.
    """
    modes = group.mode()
    if modes.empty:
        return group
    return group.fillna(modes.iloc[0])

group_filled_fuel_type = cars.groupby(columns_to_group_by_fuel)['fuel_type'].transform(fill_missing_fuel_category)

# Only fill the gaps: a row whose group key were missing would get NaN from
# transform, and a direct assignment would then erase its known fuel type.
cars['fuel_type'] = cars['fuel_type'].fillna(group_filled_fuel_type)

remaining_missing_fuel_type = cars['fuel_type'].isna().sum()
print(f"Remaining missing fuel_type values after grouping by vehicle_category: {remaining_missing_fuel_type}")

In [None]:
# Report missing standard_colour entries: absolute count, then share of all rows.
missing_colour_count = cars['standard_colour'].isnull().sum()
print(missing_colour_count)
null_percentage = (missing_colour_count / len(cars)) * 100
print(f"Percentage of missing standard_colour: {null_percentage:.2f}%")

In [None]:
# Frequency of each standard_colour label, the column this section imputes.
# The cell previously printed fuel_type counts again, a copy-paste slip from the section above.
print(cars['standard_colour'].value_counts())

In [None]:
# Rank numeric columns by how far their per-colour means spread around the
# column's overall mean (sum of squared deviations; not scale-normalised).
numerical_columns = cars.select_dtypes(include=['float64', 'int64']).columns

relationship_strength_numeric_colour = {}
for column in numerical_columns:
    values = cars[column]
    if values.notnull().sum() <= 0:
        continue  # an all-missing column has no means to compare
    deviations = cars.groupby('standard_colour')[column].mean() - values.mean()
    relationship_strength_numeric_colour[column] = (deviations ** 2).sum()

relationship_strength_numeric_colour_sorted = sorted(
    relationship_strength_numeric_colour.items(),
    key=lambda item: item[1],
    reverse=True,
)
print("Relationship strength with numerical columns:")
for column, strength in relationship_strength_numeric_colour_sorted:
    print(f"{column}: {strength}")

In [None]:
# Rank categorical columns by how many of their levels have a most-common
# colour that differs from the dataset-wide most-common colour.
qualitative_columns = cars.select_dtypes(include=['object', 'bool']).columns

relationship_strength_colour_qualitative = {}
for column in qualitative_columns:
    if cars[column].nunique() <= 1:
        continue  # a constant column cannot separate colours
    category_modes = cars.groupby(column)['standard_colour'].apply(
        lambda labels: next(iter(labels.mode()), None)
    )
    overall_mode = cars['standard_colour'].mode()[0]
    differing_levels = category_modes != overall_mode
    relationship_strength_colour_qualitative[column] = differing_levels.sum()

relationship_strength_colour_qualitative_sorted = sorted(
    relationship_strength_colour_qualitative.items(),
    key=lambda item: item[1],
    reverse=True,
)
print("Relationship strength with qualitative columns:")
for column, strength in relationship_strength_colour_qualitative_sorted:
    print(f"{column}: {strength}")

In [None]:
# Imputation tier: fill missing standard_colour with the most common colour among
# listings sharing model, make and body type.
columns_to_group_by_colour = ['standard_model', 'standard_make', 'body_type']

def fill_missing_colour(group):
    """Fill missing values of one group's colour with that group's most common value.

    Returns the group unchanged when it holds no known value to take a mode from.
    """
    modes = group.mode()
    if modes.empty:
        return group
    return group.fillna(modes.iloc[0])

group_filled_colour = cars.groupby(columns_to_group_by_colour)['standard_colour'].transform(fill_missing_colour)

# groupby drops rows whose key is missing, so transform gives NaN for them.
# Assigning that result directly would erase colours already known for such
# rows; only fill the gaps instead.
cars['standard_colour'] = cars['standard_colour'].fillna(group_filled_colour)

remaining_missing_colour = cars['standard_colour'].isna().sum()
print(f"Remaining missing values in 'standard_colour' after grouping by model, make, and body type: {remaining_missing_colour}")

In [None]:
# Imputation tier: fill missing standard_colour with the most common colour among
# listings sharing model and make.
columns_to_group_by_colour = ['standard_model', 'standard_make']

def fill_missing_colour(group):
    """Fill missing values of one group's colour with that group's most common value.

    Returns the group unchanged when it holds no known value to take a mode from.
    """
    modes = group.mode()
    if modes.empty:
        return group
    return group.fillna(modes.iloc[0])

group_filled_colour = cars.groupby(columns_to_group_by_colour)['standard_colour'].transform(fill_missing_colour)

# groupby drops rows whose key is missing, so transform gives NaN for them.
# Assigning that result directly would erase colours already known for such
# rows; only fill the gaps instead.
cars['standard_colour'] = cars['standard_colour'].fillna(group_filled_colour)

remaining_missing_colour = cars['standard_colour'].isna().sum()
print(f"Remaining missing values in 'standard_colour' after grouping by model and make: {remaining_missing_colour}")

In [None]:
# Last imputation tier: fill missing standard_colour with the most common colour
# of the same body type.
columns_to_group_by_colour = ['body_type']

def fill_missing_colour(group):
    """Fill missing values of one group's colour with that group's most common value.

    Returns the group unchanged when it holds no known value to take a mode from.
    """
    modes = group.mode()
    if modes.empty:
        return group
    return group.fillna(modes.iloc[0])

group_filled_colour = cars.groupby(columns_to_group_by_colour)['standard_colour'].transform(fill_missing_colour)

# groupby drops rows whose key is missing, so transform gives NaN for them.
# Assigning that result directly would erase colours already known for such
# rows; only fill the gaps instead.
cars['standard_colour'] = cars['standard_colour'].fillna(group_filled_colour)

remaining_missing_colour = cars['standard_colour'].isna().sum()
print(f"Remaining missing values in 'standard_colour' after grouping by body type: {remaining_missing_colour}")

# Feature Engineering

In [None]:
# Show the columns available at the start of feature engineering.
cars.columns

In [None]:
# Drop columns that are not carried forward as features: the binned columns and
# vehicle_category were helpers for the imputation / outlier steps above.
# NOTE(review): public_reference and reg_code are presumably identifier-like
# fields — confirm. Re-running this cell raises KeyError because the columns are gone.
cars = cars.drop(columns=['public_reference', 'reg_code', 'mileage_bins', 'price_bins', 'vehicle_category'])

In [None]:
# Fold the crossover flag into body_type: flagged listings become 'Crossover',
# after which the separate flag column is no longer needed.
is_crossover = cars['crossover_car_and_van'] == True
cars.loc[is_crossover, 'body_type'] = 'Crossover'

cars = cars.drop(columns=['crossover_car_and_van'])

print(cars['body_type'].value_counts())

In [None]:
# Number of listings registered before 2000 (the cut-off used for the is_old flag).
registered_before_2000 = cars['year_of_registration'] < 2000
records_before_2000 = len(cars[registered_before_2000])

print(f"Number of records before 2000: {records_before_2000}")

In [None]:
# 1 for listings registered before 2000, else 0 (a missing year compares False, so it maps to 0).
cars['is_old'] = (cars['year_of_registration'] < 2000).astype('int64')

In [None]:
# Compare the average price of pre-2000 listings with all other listings.
old_cars_mean_price = cars.loc[cars['is_old'] == 1, 'price'].mean()
new_cars_mean_price = cars.loc[cars['is_old'] == 0, 'price'].mean()
print(f"Average price for old cars: {old_cars_mean_price}")
print(f"Average price for new cars: {new_cars_mean_price}")

In [None]:
# Correlations between the numeric columns, with price singled out and the full matrix drawn as a heatmap.
numeric_columns = cars.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_columns.corr()

correlation_with_price = correlation_matrix['price'].sort_values(ascending=False)

print(correlation_with_price)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA of price across the levels of each categorical column.
categorical_columns = ['vehicle_condition', 'standard_colour', 'standard_make', 'standard_model', 'fuel_type', 'body_type']

for column in categorical_columns:
    # groupby splits the prices in one pass and skips missing category labels.
    # Comparing against each unique value instead turns a NaN label into an
    # empty group (NaN never equals itself), which makes the F-statistic NaN.
    groups = [prices for _, prices in cars.groupby(column)['price'] if len(prices) > 0]
    f_stat, p_value = f_oneway(*groups)
    print(f"Column: {column}, F-Statistic: {f_stat:.2f}, P-Value: {p_value:.4f}")

In [None]:
# Density plots of raw mileage and price (price shown in thousands) to inspect skew.
features_to_check = ['mileage', 'price']

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for axis, feature in zip(axes, features_to_check):
    if feature == 'price':
        series, legend_label = cars[feature] / 1000, 'Price (in thousands)'
    else:
        series, legend_label = cars[feature], 'Mileage'
    sns.kdeplot(series, shade=True, ax=axis, label=legend_label)

    axis.set_title(f"Distribution of {feature}")
    axis.ticklabel_format(style='plain', axis='both')
    axis.legend()

plt.tight_layout()
plt.show()

In [None]:
# Replace mileage and price with their log1p transforms (log(1 + x), defined at 0).
# Re-running this cell raises KeyError because the raw columns are dropped here.
for raw_column in ('mileage', 'price'):
    cars[f'{raw_column}_log'] = np.log1p(cars[raw_column])

cars = cars.drop(columns=['mileage', 'price'])

print(cars.head())

In [None]:
# Density plots of the log-transformed features, to check the skew after the transform.
features_to_check = ['mileage_log', 'price_log']

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for i, feature in enumerate(features_to_check):
    # Plot the log values as they are. Dividing price_log by 1000 and labelling
    # it "Price (in thousands)" was carried over from the raw-price plot and
    # misrepresents the log scale.
    sns.kdeplot(cars[feature], shade=True, ax=axes[i], label=feature)

    axes[i].set_title(f"Distribution of {feature}")
    axes[i].ticklabel_format(style='plain', axis='both')
    axes[i].legend()

plt.tight_layout()
plt.show()

In [None]:
# Level frequencies of vehicle_condition, checked before it is one-hot encoded.
cars['vehicle_condition'].value_counts()

In [None]:
# Class balance of the is_luxury flag.
cars['is_luxury'].value_counts()

# Encodings


In [None]:
# Show the columns available before encoding.
cars.columns

In [None]:
# Apply One-Hot Encoding
# Each listed column is replaced by one indicator column per level, named
# "<column>_<level>". The binary flags therefore become two columns each.
one_hot_columns = ['vehicle_condition', 'is_luxury', 'is_old']

vehicle_condition_encoded, is_luxury_encoded, is_old_encoded = (
    pd.get_dummies(cars[name], prefix=name) for name in one_hot_columns
)

cars = pd.concat(
    [cars.drop(one_hot_columns, axis=1), vehicle_condition_encoded, is_luxury_encoded, is_old_encoded],
    axis=1,
)

cars.head()

In [None]:
# Apply Target Encoding
# The split happens first so that category means can be computed from the training rows only.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
train_data, test_data = train_test_split(cars, test_size=0.2, random_state=42)

def target_encode(feature, target, train, full_data):
    """
    Target-encode one categorical column and store the result in ``full_data``.

    Each category is replaced by the mean of ``target`` observed for that
    category in ``train``. Values of ``feature`` that have no mean in ``train``
    (including missing values) receive the overall mean of ``target`` in
    ``train``; a warning line is printed when that happens.

    Parameters:
        feature: str, name of the column to encode
        target: str, name of the target column
        train: DataFrame, training set used to compute the category means
        full_data: DataFrame that receives the new ``{feature}_encoded`` column

    Returns:
        None. ``full_data`` is modified in place.
    """
    category_means = train.groupby(feature)[target].mean()

    encoded = full_data[feature].map(category_means)

    if encoded.isnull().any():
        print(f"Warning: Unknown categories in feature '{feature}' handled with global mean.")

    # Fill first, then assign the finished Series in one step. Calling
    # fillna(inplace=True) on a column selected from the frame is chained
    # assignment and does not update the frame under pandas copy-on-write.
    full_data[f"{feature}_encoded"] = encoded.fillna(train[target].mean())

features_to_encode = ['standard_colour', 'fuel_type', 'standard_make', 'standard_model', 'body_type']
encoding_target = 'price_log'

# Category means always come from the training split; they are applied to the
# training split first and then to the test split, one feature at a time.
for feature in features_to_encode:
    for frame in (train_data, test_data):
        target_encode(feature, encoding_target, train_data, frame)

# The raw categorical columns are replaced by their *_encoded counterparts.
train_data, test_data = (
    frame.drop(columns=features_to_encode) for frame in (train_data, test_data)
)

# Separate the predictors from the log-price target.
X_train, y_train = train_data.drop(columns=[encoding_target]), train_data[encoding_target]
X_test, y_test = test_data.drop(columns=[encoding_target]), test_data[encoding_target]

print("Training data shape:", X_train.shape)
print("Test data shape:", X_test.shape)

In [None]:
# Count, per encoded feature, how many categories occur in the test split but
# never in the training split. The comparison uses the raw category values
# still held in `cars`: the encoded columns contain category means, with every
# unseen category collapsed to one global mean, so comparing encoded values
# cannot count distinct unseen categories.
unknown_counts = {}
for raw_feature in ['standard_colour', 'fuel_type',
                    'standard_make', 'standard_model',
                    'body_type']:
    encoded_feature = f"{raw_feature}_encoded"
    if encoded_feature not in test_data.columns or raw_feature not in cars.columns:
        continue
    # The splits keep the row index of `cars`, so the index selects their rows.
    train_categories = set(cars.loc[train_data.index, raw_feature].dropna().unique())
    test_categories = set(cars.loc[test_data.index, raw_feature].dropna().unique())
    unknown_counts[encoded_feature] = len(test_categories - train_categories)

print("Count of unknown categories in test data:")
print(unknown_counts)

# **Part 1**

# Automated Feature Selection


In [None]:
# Predictor columns available to the feature-selection step.
X_train.columns

In [None]:
# RFECV ranks features with a LinearRegression estimator; a RandomForestRegressor
# is then trained on the selected subset and scored on the test split.
lr_model = LinearRegression()

# Recursive elimination, one feature per step, scored by 5-fold CV (negated MAE).
rfecv = RFECV(estimator=lr_model, step=1, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
rfecv.fit(X_train, y_train)

selected_features = X_train.columns[rfecv.support_]
print("Selected features by RFECV:", selected_features.tolist())
print("Number of features selected:", rfecv.n_features_)

# Keep only the selected columns (the result is a NumPy array, not a DataFrame).
X_train_rfecv = rfecv.transform(X_train)
X_test_rfecv = rfecv.transform(X_test)

final_model = RandomForestRegressor(random_state=42)
final_model.fit(X_train_rfecv, y_train)
y_pred_rfecv = final_model.predict(X_test_rfecv)
mae_rfecv = mean_absolute_error(y_test, y_pred_rfecv)

# The target is log1p(price), so exp(MAE) approximates the typical
# multiplicative error on the original price scale.
error_factor = np.exp(mae_rfecv)
relative_error_percent = (error_factor - 1) * 100

print(f"MAE with RFECV-selected features (log scale): {mae_rfecv:.2f}")
print(f"Approx. real-world price error: {relative_error_percent:.1f}%")

# CV scores are negated MAE, so the sign is flipped for plotting.
# NOTE(review): the curve is the linear model's CV MAE, while the dashed line is
# the random forest's test MAE, so the two are not directly comparable.
plt.figure(figsize=(7, 5))
plt.plot(range(1, len(rfecv.cv_results_["mean_test_score"]) + 1), -rfecv.cv_results_["mean_test_score"], label='CV MAE')
plt.axhline(y=mae_rfecv, color='red', linestyle='--', label='Test MAE = {:.2f}'.format(mae_rfecv))
plt.xlabel("Number of features selected")
plt.ylabel("Mean Absolute Error")
plt.title("RFECV Cross-Validation vs Test Performance")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Compare a default random forest trained on every feature with one trained on
# the RFECV-selected subset; both are scored by test MAE on the log-price scale.
rf_full = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_rfecv = RandomForestRegressor(random_state=42).fit(X_train_rfecv, y_train)

y_pred_full = rf_full.predict(X_test)
y_pred_rfecv = rf_rfecv.predict(X_test_rfecv)

mae_full = mean_absolute_error(y_test, y_pred_full)
mae_rfecv = mean_absolute_error(y_test, y_pred_rfecv)

# exp(MAE) - 1 expresses the log-scale error as an approximate percentage.
percent_full, percent_rfecv = ((np.exp(mae) - 1) * 100 for mae in (mae_full, mae_rfecv))

print(f"MAE with all features (log scale): {mae_full:.3f} → approx. error: {percent_full:.1f}%")
print(f"MAE with RFECV-selected features:    {mae_rfecv:.3f} → approx. error: {percent_rfecv:.1f}%")

In [None]:
# RFECV rank of every feature, best ranks first.
rank_table = pd.DataFrame({'Feature': X_train.columns, 'Rank': rfecv.ranking_})
feature_ranks = rank_table.sort_values(by='Rank')

print(feature_ranks)

# Tree Ensembles


In [None]:
# Bagging (RandomForestRegressor)
# Exhaustive 5-fold grid search over forest size and tree-shape parameters,
# selecting the combination with the lowest cross-validated MAE.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid={
        'n_estimators': [100, 200],
        'max_depth': [10, 15],
        'min_samples_split': [5, 10],
        'min_samples_leaf': [1, 2],
        'max_features': ['sqrt'],
    },
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_

y_train_pred = best_rf_model.predict(X_train)
y_test_pred = best_rf_model.predict(X_test)

# Separate 5-fold R² estimate for the selected configuration.
mean_cv_r2 = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='r2').mean()

print("Results of Random Forest Regressor Model")
print("Best Parameters:", grid_search.best_params_)
print(f"Mean CV R² Score: {mean_cv_r2:.3f}")

# Same four metrics for the training and the test split.
for header, y_true, y_hat in (
    ("\nTrain Metrics:", y_train, y_train_pred),
    ("\nTest Metrics:", y_test, y_test_pred),
):
    mae = mean_absolute_error(y_true, y_hat)
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_hat)
    print(header)
    print(f"MAE: {mae:.3f} | MSE: {mse:.3f} | RMSE: {rmse:.3f} | R²: {r2:.3f}")

In [None]:
# Gradient Boosting Regressor
# Exhaustive 5-fold grid search over ensemble size, learning rate and tree
# depth, selecting the combination with the lowest cross-validated MAE.
grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid={
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5],
    },
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)
best_gbr_model = grid_search.best_estimator_

y_train_pred = best_gbr_model.predict(X_train)
y_test_pred = best_gbr_model.predict(X_test)

# Separate 5-fold R² estimate for the selected configuration.
mean_cv_r2 = cross_val_score(best_gbr_model, X_train, y_train, cv=5, scoring='r2').mean()

print("Results of GradientBoostingRegressor Model")
print("Best Parameters:", grid_search.best_params_)
print(f"Mean CV R² Score: {mean_cv_r2:.3f}")

# Same four metrics for the training and the test split.
for header, y_true, y_hat in (
    ("\nTrain Metrics:", y_train, y_train_pred),
    ("\nTest Metrics:", y_test, y_test_pred),
):
    mae = mean_absolute_error(y_true, y_hat)
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_hat)
    print(header)
    print(f"MAE: {mae:.3f} | MSE: {mse:.3f} | RMSE: {rmse:.3f} | R²: {r2:.3f}")

# Ensemble of Tree Ensembles

In [None]:
# Voting (VotingRegressor)
# Combines the tuned random forest and gradient boosting models.
voting = VotingRegressor(
    estimators=[('rf', best_rf_model), ('gb', best_gbr_model)]
).fit(X_train, y_train)

y_train_pred = voting.predict(X_train)
y_test_pred = voting.predict(X_test)

print("Results of Voting Regressor")

# Same four metrics for the training and the test split.
for header, y_true, y_hat in (
    ("\nTrain Metrics:", y_train, y_train_pred),
    ("\nTest Metrics:", y_test, y_test_pred),
):
    mae = mean_absolute_error(y_true, y_hat)
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_hat)
    print(header)
    print(f"MAE: {mae:.3f} | MSE: {mse:.3f} | RMSE: {rmse:.3f} | R²: {r2:.3f}")

In [None]:
# Stacking (StackingRegressor)
# The tuned forest and boosting models are the base learners; a linear
# regression combines their predictions (passthrough=False, so the original
# features are not handed to the final estimator).
stacking = StackingRegressor(
    estimators=[('rf', best_rf_model), ('gb', best_gbr_model)],
    final_estimator=LinearRegression(),
    passthrough=False,
    n_jobs=-1,
)
stacking.fit(X_train, y_train)

y_train_pred = stacking.predict(X_train)
y_test_pred = stacking.predict(X_test)

print("Results of Stacking Regressor")

# Same four metrics for the training and the test split.
for header, y_true, y_hat in (
    ("\nTrain Metrics:", y_train, y_train_pred),
    ("\nTest Metrics:", y_test, y_test_pred),
):
    mae = mean_absolute_error(y_true, y_hat)
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_hat)
    print(header)
    print(f"MAE: {mae:.3f} | MSE: {mse:.3f} | RMSE: {rmse:.3f} | R²: {r2:.3f}")

In [None]:
# Show wide frames on a single line with every column visible.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)

models = {
    "Random Forest": best_rf_model,
    "Gradient Boosting": best_gbr_model,
    "Voting Regressor": voting,
    "Stacking Regressor": stacking
}

# One row of rounded train/test metrics per already-fitted model.
results = {}
for name, model in models.items():
    fitted, held_out = model.predict(X_train), model.predict(X_test)
    results[name] = {
        "MAE Train": round(mean_absolute_error(y_train, fitted), 3),
        "MAE Test": round(mean_absolute_error(y_test, held_out), 3),
        "RMSE Train": round(np.sqrt(mean_squared_error(y_train, fitted)), 3),
        "RMSE Test": round(np.sqrt(mean_squared_error(y_test, held_out)), 3),
        "R² Train": round(r2_score(y_train, fitted), 3),
        "R² Test": round(r2_score(y_test, held_out), 3)
    }

# Models become rows, metrics become columns.
results_df = pd.DataFrame(results).T.rename_axis("Model")

print("\n Final Comparison Table (Train vs Test):\n")
print(results_df)

# Feature Importance

In [None]:
# Built-in feature importances of both tuned models, each sorted ascending.
rf_importances = pd.Series(best_rf_model.feature_importances_, index=X_train.columns).sort_values()
gb_importances = pd.Series(best_gbr_model.feature_importances_, index=X_train.columns).sort_values()

fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)

panels = (
    (rf_importances, "Random Forest Feature Importance", 'skyblue'),
    (gb_importances, "Gradient Boosting Feature Importance", 'salmon'),
)
for panel_ax, (importances, panel_title, bar_colour) in zip(axes, panels):
    panel_ax.barh(importances.index, importances.values, color=bar_colour)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Importance Score")
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Permutation importance of both tuned models on the test split
# (10 shuffles per feature, fixed seed).
rf_perm = permutation_importance(best_rf_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
gb_perm = permutation_importance(best_gbr_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)

fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)

panels = (
    (rf_perm, "Permutation Importance – Random Forest", 'lightblue'),
    (gb_perm, "Permutation Importance – Gradient Boosting", 'lightcoral'),
)
for panel_ax, (perm_result, panel_title, bar_colour) in zip(axes, panels):
    # Order the features by their mean importance, ascending.
    ascending_order = perm_result.importances_mean.argsort()
    panel_ax.barh(
        X_test.columns[ascending_order],
        perm_result.importances_mean[ascending_order],
        color=bar_colour,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Mean Importance")
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

# SHAP

In [None]:
# Fixed-seed subsample of 1000 test rows.
X_test_sample = X_test.sample(n=1000, random_state=42)

In [None]:
# Cast every boolean column of the sample to int64.
bool_columns = X_test_sample.select_dtypes(include='bool').columns
X_test_sample = X_test_sample.astype({column: 'int64' for column in bool_columns})

In [None]:
# Build a SHAP explainer for each tuned model and evaluate it on the sample.
explainer_rf = shap.TreeExplainer(best_rf_model)
shap_values_rf = explainer_rf(X_test_sample)

explainer_gb = shap.Explainer(best_gbr_model)
shap_values_gb = explainer_gb(X_test_sample)

# Render each summary plot off-screen: show=False leaves the figure open so it
# can be titled and saved into an in-memory PNG, after which it is closed.
plt.figure()
shap.summary_plot(shap_values_rf, X_test_sample, show=False)
plt.title("Random Forest", fontsize=14)
plt.tight_layout()
buf1 = io.BytesIO()
plt.savefig(buf1, format='png')
plt.close()

plt.figure()
shap.summary_plot(shap_values_gb, X_test_sample, show=False)
plt.title("Gradient Boosting", fontsize=14)
plt.tight_layout()
buf2 = io.BytesIO()
plt.savefig(buf2, format='png')
plt.close()

# Reload the two PNGs and show them side by side in one figure.
img1 = Image.open(buf1)
img2 = Image.open(buf2)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].imshow(img1)
axes[0].axis('off')
axes[0].set_title("SHAP – Random Forest")

axes[1].imshow(img2)
axes[1].axis('off')
axes[1].set_title("SHAP – Gradient Boosting")

plt.tight_layout()
plt.show()

In [None]:
# Global importance per feature: mean absolute SHAP value over the sample rows.
shap_importance_rf = pd.Series(
    np.abs(shap_values_rf.values).mean(axis=0), index=X_test_sample.columns
).sort_values()
shap_importance_gb = pd.Series(
    np.abs(shap_values_gb.values).mean(axis=0), index=X_test_sample.columns
).sort_values()

fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)

panels = (
    (shap_importance_rf, "Mean SHAP Value – Random Forest", 'skyblue'),
    (shap_importance_gb, "Mean SHAP Value – Gradient Boosting", 'salmon'),
)
for panel_ax, (importance, panel_title, bar_colour) in zip(axes, panels):
    panel_ax.barh(importance.index, importance.values, color=bar_colour)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Average |SHAP value|")
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Explain one prediction (the first row of the SHAP sample) with waterfall
# plots, rendered off-screen into in-memory PNG buffers.
instance = X_test_sample.iloc[0:1]

# ========== RANDOM FOREST ==========
explainer_rf = shap.TreeExplainer(best_rf_model)
# Stored under its own name so the sample-wide shap_values_rf is not
# overwritten; re-running the global SHAP plots keeps working.
instance_shap_rf = explainer_rf(instance)

plt.figure(figsize=(12, 10))  # enough room for the feature names
shap.plots.waterfall(instance_shap_rf[0], max_display=13, show=False)
buf1 = io.BytesIO()
plt.savefig(buf1, format='png', bbox_inches='tight')
plt.close()

# ========== GRADIENT BOOSTING ==========
explainer_gb = shap.TreeExplainer(best_gbr_model)
# Same reasoning: leave the sample-wide shap_values_gb untouched.
instance_shap_gb = explainer_gb(instance)

plt.figure(figsize=(12, 10))
shap.plots.waterfall(instance_shap_gb[0], max_display=13, show=False)
buf2 = io.BytesIO()
plt.savefig(buf2, format='png', bbox_inches='tight')
plt.close()

In [None]:
# Reload the two rendered PNG buffers and show them side by side.
img1, img2 = Image.open(buf1), Image.open(buf2)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (img1, "Waterfall – Random Forest"),
    (img2, "Waterfall – Gradient Boosting"),
)
for panel_ax, (image, panel_title) in zip(axes, panels):
    panel_ax.imshow(image)
    panel_ax.axis('off')
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# PDP

In [None]:
# Average partial dependence of the tuned gradient boosting model on
# registration year and log-mileage.
features = ['year_of_registration', 'mileage_log']

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
PartialDependenceDisplay.from_estimator(
    best_gbr_model,
    X_test,
    features=features,
    kind='average',
    ax=ax,
    grid_resolution=100
)

# Relabel the log-mileage axis with real mileage values. mileage_log was
# built with np.log1p, so tick positions use np.log1p as well.
mileage_ax = ax[1]
custom_mileage_vals = [1000, 5000, 10000, 20000, 50000, 100000, 200000]
custom_ticks_log = np.log1p(custom_mileage_vals)
custom_labels = [str(value) for value in custom_mileage_vals]

mileage_ax.set_xticks(custom_ticks_log)
mileage_ax.set_xticklabels(custom_labels)
mileage_ax.set_xlabel("Mileage (real km)")

fig.suptitle("Partial Dependence Plots – Gradient Boosting", fontsize=14)
for axis in ax:
    axis.tick_params(axis='x', labelrotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Partial dependence (red line) with ICE curves for the tuned gradient
# boosting model.
features = ['year_of_registration', 'mileage_log']

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Bound to pdp_display rather than `display`, which would shadow IPython's
# built-in display() for the rest of the notebook session.
pdp_display = PartialDependenceDisplay.from_estimator(
    best_gbr_model,
    X_test,
    features=features,
    kind='both',
    ax=ax,
    grid_resolution=100,
    pd_line_kw={'color': 'red'}
)

# Relabel the log-mileage axis with real mileage values. mileage_log was
# built with np.log1p, so tick positions use np.log1p as well.
mileage_ax = ax[1]
custom_mileage_vals = [1000, 5000, 10000, 20000, 50000, 100000, 200000]
custom_ticks_log = np.log1p(custom_mileage_vals)
custom_labels = [str(value) for value in custom_mileage_vals]

mileage_ax.set_xticks(custom_ticks_log)
mileage_ax.set_xticklabels(custom_labels)
mileage_ax.set_xlabel("Mileage")

fig.suptitle("Partial Dependence Plots with ICE – Gradient Boosting", fontsize=14)

# Rotate the labels before tight_layout so the layout accounts for them.
for axis in ax:
    axis.tick_params(axis='x', labelrotation=45)
plt.tight_layout()

plt.show()

In [None]:
# Average partial dependence of the tuned random forest on registration year
# and log-mileage.
features = ['year_of_registration', 'mileage_log']

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
PartialDependenceDisplay.from_estimator(
    best_rf_model,
    X_test,
    features=features,
    kind='average',
    ax=ax,
    grid_resolution=100
)

# Relabel the log-mileage axis with real mileage values. mileage_log was
# built with np.log1p, so tick positions use np.log1p as well.
mileage_ax = ax[1]
custom_mileage_vals = [1000, 5000, 10000, 20000, 50000, 100000, 200000]
custom_ticks_log = np.log1p(custom_mileage_vals)
custom_labels = [str(value) for value in custom_mileage_vals]

mileage_ax.set_xticks(custom_ticks_log)
mileage_ax.set_xticklabels(custom_labels)
mileage_ax.set_xlabel("Mileage")

fig.suptitle("Partial Dependence Plots – Random Forest", fontsize=14)
for axis in ax:
    axis.tick_params(axis='x', labelrotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Partial dependence (red line) with ICE curves for the tuned random forest.
features = ['year_of_registration', 'mileage_log']

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Bound to pdp_display rather than `display`, which would shadow IPython's
# built-in display() for the rest of the notebook session.
pdp_display = PartialDependenceDisplay.from_estimator(
    best_rf_model,
    X_test,
    features=features,
    kind='both',
    ax=ax,
    grid_resolution=100,
    pd_line_kw={'color': 'red'}
)

# Relabel the log-mileage axis with real mileage values. mileage_log was
# built with np.log1p, so tick positions use np.log1p as well.
mileage_ax = ax[1]
custom_mileage_vals = [1000, 5000, 10000, 20000, 50000, 100000, 200000]
custom_ticks_log = np.log1p(custom_mileage_vals)
custom_labels = [str(value) for value in custom_mileage_vals]

mileage_ax.set_xticks(custom_ticks_log)
mileage_ax.set_xticklabels(custom_labels)
mileage_ax.set_xlabel("Mileage")

fig.suptitle("Partial Dependence Plots with ICE – Random Forest", fontsize=14)

# Rotate the labels before tight_layout so the layout accounts for them.
for axis in ax:
    axis.tick_params(axis='x', labelrotation=45)
plt.tight_layout()

plt.show()

# **Part 2**

# Dimensionality Reduction (Linear)

In [None]:
# Two-column overview: each predictor name next to its dtype.
pd.DataFrame({
    'Column Name': X_train.columns,
    'Data Type': X_train.dtypes.tolist(),
})

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Fit PCA with every component kept to see how explained variance accumulates.
pca = PCA()
X_pca = pca.fit_transform(X_train_scaled)

explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Cumulative explained-variance ratio against the number of components.
plt.figure(figsize=(6, 4))
plt.plot(range(1, len(cumulative_variance)+1), cumulative_variance, marker='o')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by PCA Components')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Project the scaled training data onto the first 8 principal components.
# random_state fixes the result when scikit-learn selects a randomized SVD
# solver, so re-running the notebook reproduces the same components.
pca = PCA(n_components=8, random_state=42)
X_pca = pca.fit_transform(X_train_scaled)

print("Original shape:", X_train_scaled.shape)
print("Transformed shape:", X_pca.shape)

In [None]:
# Loadings matrix: one row per original feature, one column per component.
component_names = [f'PC{i+1}' for i in range(pca.n_components_)]
loadings = pd.DataFrame(pca.components_.T, columns=component_names, index=X_train.columns)

# For each component, list the five features with the largest absolute loading.
for i, (_, component_loadings) in enumerate(loadings.items()):
    print(f"\n Top features for PC{i+1}:")
    print(component_loadings.abs().sort_values(ascending=False).head(5))

In [None]:
# Refit the 8-component PCA on the scaled training data and project both
# splits. random_state fixes the result when scikit-learn selects a randomized
# SVD solver.
pca = PCA(n_components=8, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

results_pca = {}

# NOTE(review): these models use default hyperparameters, and the stacking
# final estimator is a random forest, whereas the pre-PCA rows in results_df
# come from grid-searched models with a linear final estimator. The combined
# table therefore mixes the effect of PCA with the effect of tuning.
models_pca = {
    "Random Forest (PCA)": RandomForestRegressor(random_state=42),
    "Gradient Boosting (PCA)": GradientBoostingRegressor(random_state=42),
    "Voting Regressor (PCA)": VotingRegressor(estimators=[
        ('rf', RandomForestRegressor(random_state=42)),
        ('gbr', GradientBoostingRegressor(random_state=42))
    ]),
    "Stacking Regressor (PCA)": StackingRegressor(estimators=[
        ('rf', RandomForestRegressor(random_state=42)),
        ('gbr', GradientBoostingRegressor(random_state=42))
    ], final_estimator=RandomForestRegressor(random_state=42))
}

# Fit each model on the PCA features and record rounded train/test metrics.
for name, model in models_pca.items():
    model.fit(X_train_pca, y_train)
    y_train_pred = model.predict(X_train_pca)
    y_test_pred = model.predict(X_test_pca)

    results_pca[name] = {
        "MAE Train": round(mean_absolute_error(y_train, y_train_pred), 3),
        "MAE Test": round(mean_absolute_error(y_test, y_test_pred), 3),
        "RMSE Train": round(np.sqrt(mean_squared_error(y_train, y_train_pred)), 3),
        "RMSE Test": round(np.sqrt(mean_squared_error(y_test, y_test_pred)), 3),
        "R² Train": round(r2_score(y_train, y_train_pred), 3),
        "R² Test": round(r2_score(y_test, y_test_pred), 3)
    }

results_pca_df = pd.DataFrame(results_pca).T
results_pca_df.index.name = "Model"

# Stack the PCA rows underneath the earlier (non-PCA) comparison table.
combined_results = pd.concat([results_df, results_pca_df])
print("\n📊 Final Comparison – Before and After PCA:\n")
print(combined_results)

# Dimensionality Reduction (Non-Linear)

# t-SNE

In [None]:
# Fixed-seed subsample of the training predictors.
sample_size = 10000
X_sample = X_train.sample(n=sample_size, random_state=42)

# Standardise the sample; the scaler is fit on the sample itself.
scaler = StandardScaler()
X_sample_scaled = scaler.fit_transform(X_sample)

In [None]:
# 2-D t-SNE embedding of the scaled sample. The iteration count is left at the
# library default (1000): the `n_iter` keyword was renamed to `max_iter` in
# newer scikit-learn releases, so omitting it works across versions.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_sample_scaled)

plt.figure(figsize=(8, 6))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], s=2)
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('t-SNE Projection of Car Dataset')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Attach registration year to the embedding; .values copies by position
# because tsne_df has a fresh index that differs from X_sample's.
tsne_df = pd.DataFrame(X_tsne, columns=['t-SNE 1', 't-SNE 2'])
tsne_df['year_of_registration'] = X_sample['year_of_registration'].values

plt.figure(figsize=(8, 5))
sns.scatterplot(
    data=tsne_df,
    x='t-SNE 1', y='t-SNE 2',
    hue='year_of_registration',
    palette='coolwarm',
    s=60,
    alpha=0.8
)
plt.title('t-SNE Visualization Colored by Year of Registration')
# Legend is placed outside the axes, to the right.
plt.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Colour the t-SNE embedding by binned real mileage.
tsne_df = pd.DataFrame(X_tsne, columns=['t-SNE 1', 't-SNE 2'])

# mileage_log was created with np.log1p, so np.expm1 is its exact inverse.
mileage_real = np.expm1(X_sample['mileage_log'].values)

# The last bin is open-ended to match its '200000+' label, and
# include_lowest=True keeps zero-mileage cars in the first bin.
bins = [0, 5000, 20000, 50000, 100000, 200000, np.inf]
labels = ['<5000', '5000–20000', '20000–50000', '50000–100000', '100000–200000', '200000+']
mileage_group = pd.cut(mileage_real, bins=bins, labels=labels, include_lowest=True)

tsne_df['mileage_group'] = mileage_group

plt.figure(figsize=(8, 5))
sns.scatterplot(
    data=tsne_df,
    x='t-SNE 1', y='t-SNE 2',
    hue='mileage_group',
    palette='viridis',
    s=60,
    alpha=0.8
)
plt.title('t-SNE Colored by Mileage Groups')
plt.legend(title='Mileage Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Isomap

In [None]:
# Fixed-seed subsample of the training predictors, standardised on its own.
# StandardScaler is already imported in the notebook's import cell, so it is
# not re-imported here.
sample_size = 10000
X_sample = X_train.sample(n=sample_size, random_state=42)

scaler = StandardScaler()
X_sample_scaled = scaler.fit_transform(X_sample)

# 2-D Isomap embedding built from a 10-nearest-neighbour graph.
isomap = Isomap(n_neighbors=10, n_components=2)
X_isomap = isomap.fit_transform(X_sample_scaled)

plt.figure(figsize=(6,4))
plt.scatter(X_isomap[:, 0], X_isomap[:, 1], s=2)
plt.xlabel('Isomap Component 1')
plt.ylabel('Isomap Component 2')
plt.title('Isomap Projection of Car Dataset')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Attach registration year to the embedding; .values copies by position
# because isomap_df has a fresh index that differs from X_sample's.
isomap_df = pd.DataFrame(X_isomap, columns=['Isomap 1', 'Isomap 2'])
isomap_df['year_of_registration'] = X_sample['year_of_registration'].values

plt.figure(figsize=(8, 5))
sns.scatterplot(
    data=isomap_df,
    x='Isomap 1',
    y='Isomap 2',
    hue='year_of_registration',
    palette='coolwarm',
    s=50,
    alpha=0.8
)
plt.title('Isomap Visualization Colored by Year of Registration')
plt.xlabel('Isomap 1')
plt.ylabel('Isomap 2')
# Legend is placed outside the axes, to the right.
plt.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Refit Isomap on the scaled sample and colour the points by the is_old_1 flag.
n_components = 2

# NOTE(review): n_neighbors is not passed here, unlike the earlier
# Isomap(n_neighbors=10, ...) fit, and X_isomap is overwritten — confirm that a
# different neighbourhood size is intended for this and the following plot.
isomap = Isomap(n_components=n_components)
X_isomap = isomap.fit_transform(X_sample_scaled)

plt.figure(figsize=(6, 4))
sns.scatterplot(
    x=X_isomap[:, 0],
    y=X_isomap[:, 1],
    hue=X_sample['is_old_1'],
    palette='plasma',
    s=30
)
plt.title("Isomap Visualization Colored by is_old_1")
plt.xlabel("Isomap 1")
plt.ylabel("Isomap 2")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Colour the Isomap embedding by binned real mileage.
isomap_df = pd.DataFrame(X_isomap, columns=['Isomap 1', 'Isomap 2'])

# mileage_log was created with np.log1p, so np.expm1 is its exact inverse.
mileage_real = np.expm1(X_sample['mileage_log'].values)

# include_lowest=True keeps zero-mileage cars in the first bin.
bins = [0, 5000, 10000, 20000, 50000, 100000, 200000, np.inf]
labels = ['<5000', '5000–10000', '10000–20000', '20000–50000', '50000–100000', '100000–200000', '200000+']
isomap_df['Mileage Category'] = pd.cut(mileage_real, bins=bins, labels=labels, include_lowest=True)

plt.figure(figsize=(8, 5))
sns.scatterplot(data=isomap_df, x='Isomap 1', y='Isomap 2', hue='Mileage Category', palette='viridis', s=50)
plt.title('Isomap Visualization Colored by Mileage (Binned)')
plt.xlabel('Isomap 1')
plt.ylabel('Isomap 2')
plt.legend(title='Mileage', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Polynomial Regression

In [None]:
# Polynomial regression of log-price on log-mileage alone, degree 2 vs 4.
# All metrics below are computed on the training data.
mileage_train = X_train['mileage_log'].values.reshape(-1, 1)

poly2 = PolynomialFeatures(degree=2)
X_train_poly2 = poly2.fit_transform(mileage_train)
model2 = LinearRegression()
model2.fit(X_train_poly2, y_train)

poly4 = PolynomialFeatures(degree=4)
X_train_poly4 = poly4.fit_transform(mileage_train)
model4 = LinearRegression()
model4.fit(X_train_poly4, y_train)

y_train_pred2 = model2.predict(X_train_poly2)
y_train_pred4 = model4.predict(X_train_poly4)

print("Polynomial Regression (Degree 2):")
print(f"MAE: {mean_absolute_error(y_train, y_train_pred2):.4f}")
print(f"MSE: {mean_squared_error(y_train, y_train_pred2):.4f}")
print(f"R2 Score: {r2_score(y_train, y_train_pred2):.4f}")

print("\n Polynomial Regression (Degree 4):")
print(f"MAE: {mean_absolute_error(y_train, y_train_pred4):.4f}")
print(f"MSE: {mean_squared_error(y_train, y_train_pred4):.4f}")
print(f"R2 Score: {r2_score(y_train, y_train_pred4):.4f}")

# Evaluate both fitted curves on an evenly spaced grid over the observed range.
mileage_range = np.linspace(X_train['mileage_log'].min(), X_train['mileage_log'].max(), 500).reshape(-1, 1)
mileage_range_poly2 = poly2.transform(mileage_range)
mileage_range_poly4 = poly4.transform(mileage_range)

y_pred_poly2 = model2.predict(mileage_range_poly2)
y_pred_poly4 = model4.predict(mileage_range_poly4)

plt.figure(figsize=(6, 4))
plt.scatter(X_train['mileage_log'], y_train, s=10, label="Data Points")
plt.plot(mileage_range, y_pred_poly2, color='red', label="Polynomial Regression (Degree 2)")
plt.plot(mileage_range, y_pred_poly4, color='green', label="Polynomial Regression (Degree 4)")
plt.xlabel('Mileage (log)')
# y_train is price_log, so the axis is labelled as a log scale like the x-axis.
plt.ylabel('Price (log)')
plt.title('Polynomial Regression Fit (Degree 2 vs Degree 4)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Polynomial regression on three predictors, degree 2 vs degree 4.
# Metrics are computed on the training data only.
selected_features = ['year_of_registration', 'standard_model_encoded', 'mileage_log']
X_selected = X_train[selected_features]

# Feature expansions without the constant column (the regression fits its own intercept).
poly2, poly4 = (PolynomialFeatures(degree=degree, include_bias=False) for degree in (2, 4))
X_train_poly2 = poly2.fit_transform(X_selected)
X_train_poly4 = poly4.fit_transform(X_selected)

model2 = LinearRegression().fit(X_train_poly2, y_train)
model4 = LinearRegression().fit(X_train_poly4, y_train)

y_train_pred2 = model2.predict(X_train_poly2)
y_train_pred4 = model4.predict(X_train_poly4)

for header, fitted in (
    ("Polynomial Regression with 3 Features (Degree 2):", y_train_pred2),
    ("\n Polynomial Regression with 3 Features (Degree 4):", y_train_pred4),
):
    print(header)
    print(f"MAE: {mean_absolute_error(y_train, fitted):.4f}")
    print(f"MSE: {mean_squared_error(y_train, fitted):.4f}")
    print(f"R2 Score: {r2_score(y_train, fitted):.4f}")

In [None]:
# Polynomial regression on five predictors, degree 2 vs degree 4.
# Metrics are computed on the training data only.
selected_features = ['year_of_registration', 'standard_model_encoded', 'mileage_log',
                     'standard_make_encoded', 'body_type_encoded']
X_selected = X_train[selected_features]

# Feature expansions without the constant column (the regression fits its own intercept).
poly2, poly4 = (PolynomialFeatures(degree=degree, include_bias=False) for degree in (2, 4))
X_train_poly2 = poly2.fit_transform(X_selected)
X_train_poly4 = poly4.fit_transform(X_selected)

model2 = LinearRegression().fit(X_train_poly2, y_train)
model4 = LinearRegression().fit(X_train_poly4, y_train)

y_train_pred2 = model2.predict(X_train_poly2)
y_train_pred4 = model4.predict(X_train_poly4)

for header, fitted in (
    ("Polynomial Regression with 5 Features (Degree 2):", y_train_pred2),
    ("\n Polynomial Regression with 5 Features (Degree 4):", y_train_pred4),
):
    print(header)
    print(f"MAE: {mean_absolute_error(y_train, fitted):.4f}")
    print(f"MSE: {mean_squared_error(y_train, fitted):.4f}")
    print(f"R2 Score: {r2_score(y_train, fitted):.4f}")

# Clustering for Feature Engineering

In [None]:
# K-means (k=3) on standardised registration year and log-mileage, using a
# fixed-seed sample of 10000 training rows.
X_selected = X_train[['year_of_registration', 'mileage_log']].sample(n=10000, random_state=42)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)

df_kmeans = X_selected.copy()
# mileage_log was created with np.log1p, so np.expm1 recovers the real mileage.
df_kmeans['mileage'] = np.expm1(df_kmeans['mileage_log'])
df_kmeans['cluster_label'] = kmeans_labels

plt.figure(figsize=(6, 4))
sns.scatterplot(
    data=df_kmeans,
    x='mileage',
    y='year_of_registration',
    hue='cluster_label',
    palette='tab10',
    s=30
)
plt.title('KMeans Clusters Visualization (k=3)')
plt.xlabel('Mileage')
plt.ylabel('Year of Registration')
plt.legend(title='Cluster')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Choose k for K-means: inertia (elbow) and silhouette score for k = 2..10 on
# standardised registration year and log-mileage.
X_sample = X_train[['year_of_registration', 'mileage_log']].sample(n=10000, random_state=42)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_sample)

k_range = range(2, 11)
inertias, silhouette_scores = [], []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, labels))

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

panels = (
    (inertias, 'blue', 'Elbow Method (Inertia)', 'Inertia'),
    (silhouette_scores, 'red', 'Silhouette Score', 'Silhouette Score'),
)
for panel_ax, (scores, line_colour, panel_title, y_label) in zip(axes, panels):
    panel_ax.plot(k_range, scores, marker='o', color=line_colour)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Number of Clusters (k)')
    panel_ax.set_ylabel(y_label)
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Fixed-seed subsample of all training predictors, standardised on its own.
sample_size = 10000
X_sample = X_train.sample(n=sample_size, random_state=42)

scaler = StandardScaler()
X_sample_scaled = scaler.fit_transform(X_sample)

# Agglomerative clustering with Ward linkage on the scaled sample.
linked = linkage(X_sample_scaled, method='ward')

# Show only the last 30 merges so the dendrogram stays readable.
plt.figure(figsize=(8, 4))
dendrogram(
    linked,
    truncate_mode='lastp',
    p=30,
    leaf_rotation=90,
    leaf_font_size=10,
    show_contracted=True
)
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('Cluster Size')
plt.ylabel('Distance')
plt.tight_layout()
plt.show()

In [None]:
# Cut the hierarchical linkage into at most 3 flat clusters and plot them.
n_clusters = 3
cluster_labels = fcluster(linked, n_clusters, criterion='maxclust')

X_sample_with_clusters = X_sample.copy()
# Convert log-mileage back to real mileage; mileage_log was created with
# np.log1p, so np.expm1 is the exact inverse.
X_sample_with_clusters['mileage'] = np.expm1(X_sample_with_clusters['mileage_log'])
X_sample_with_clusters['cluster_label'] = cluster_labels

print(X_sample_with_clusters['cluster_label'].value_counts())

plt.figure(figsize=(6, 4))
plt.scatter(
    X_sample_with_clusters['mileage'],
    X_sample_with_clusters['year_of_registration'],
    c=X_sample_with_clusters['cluster_label'],
    cmap='viridis',
    s=10
)
plt.xlabel('Mileage')
plt.ylabel('Year of Registration')
plt.title('Clusters Visualization')
plt.colorbar(label='Cluster')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# DBSCAN on standardised registration year and log-mileage, using a fixed-seed
# sample of 10000 training rows.
X_selected = X_train[['year_of_registration', 'mileage_log']].sample(n=10000, random_state=42)

scaler = StandardScaler()
X_selected_scaled = scaler.fit_transform(X_selected)

dbscan = DBSCAN(eps=0.7, min_samples=10)
dbscan_labels = dbscan.fit_predict(X_selected_scaled)

df_dbscan = X_selected.copy()
# mileage_log was created with np.log1p, so np.expm1 recovers the real mileage.
df_dbscan['mileage'] = np.expm1(df_dbscan['mileage_log'])
df_dbscan['cluster_label'] = dbscan_labels

plt.figure(figsize=(6, 4))
sns.scatterplot(
    data=df_dbscan,
    x='mileage',
    y='year_of_registration',
    hue='cluster_label',
    palette='tab10',
    s=30
)
plt.title('DBSCAN Clusters Visualization')
plt.xlabel('Mileage')
plt.ylabel('Year of Registration')
plt.legend(title='Cluster')
plt.grid(True)
plt.tight_layout()
plt.show()