# Vehicle Population Prediction
This notebook processes vehicle registration data from 2019 to 2024 to predict the vehicle population for 2025. Key steps include data preprocessing, feature engineering, handling missing values, and balancing data for modeling.

## Data Loading and Initial Exploration
We begin by loading the dataset and performing initial exploratory data analysis (EDA).

In [None]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Load the training dataset
data = pd.read_csv('training.csv')

# Load testing dataset (if needed)
# data = pd.read_csv('scoring.csv')

# Only the last bare expression of a notebook cell is rendered (and none are
# when run as a script), so every inspection step is printed explicitly.

# Display column names
print(data.columns)

# Display the first few rows of the dataset
print(data.head())

# Count occurrences of values in 'Electric Mile Range'
print(data['Electric Mile Range'].value_counts())

# Display dataset information (data types, missing values, etc.);
# DataFrame.info() writes its report itself, so no print is needed
data.info()


## Data Cleaning
We replace unknown, not applicable, and empty values with `NaN` for easier handling.

In [None]:

# Placeholder strings that the raw data uses to mean "no value"
MISSING_MARKERS = ['Unknown', 'Not Applicable']

# Convert placeholder strings (and already-missing entries) to NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for col in data.columns:
    is_missing = data[col].isna() | data[col].isin(MISSING_MARKERS)
    data.loc[is_missing, col] = np.nan

# Calculate the percentage of missing values for each column
missing_percentage = data.isnull().sum() * 100 / len(data)
missing_percentage = missing_percentage.rename('Missing Percentage (%)')
missing_percentage.sort_values(ascending=False)


## Splitting Data by Year
The dataset is divided into one subset per registration year using the `Date` column. The code below builds subsets for 2019 through 2023 (`range(2019, 2024)` stops before 2024).

In [None]:

# Build one subset per value of the 'Date' column, keyed as "data_<year>"
data_dict = {f"data_{year}": data[data['Date'] == year] for year in range(2019, 2024)}

# Print, for every year, the share of missing values in each column
for year in range(2019, 2024):
    yearly_subset = data_dict[f"data_{year}"]
    missing_percentage = yearly_subset.isnull().sum() * 100 / len(yearly_subset)
    missing_percentage = missing_percentage.rename(f'Missing Percentage in {year} (%)')
    print(missing_percentage.sort_values(ascending=False))
    print('=' * 90)


## Feature Engineering
- Drop the 'Electric Mile Range' and 'Region' columns.
- Compute the vehicle's age by subtracting 'Model Year' from 'Date'.
- Standardize categorical values: the '≥4' label in 'Number of Vehicles Registered at the Same Address' is replaced with the string '4' (no numeric type conversion is performed in this step).

In [None]:

# Remove the columns that are not kept as features
data.drop(columns=['Electric Mile Range', 'Region'], inplace=True)

# Vehicle age: value of 'Date' minus 'Model Year'
data["Year Diff"] = data['Date'] - data['Model Year']

# Fold the open-ended '≥4' label into the plain label '4'
address_col = 'Number of Vehicles Registered at the Same Address'
data.loc[data[address_col] == '≥4', [address_col]] = '4'


## Exploratory Data Analysis (EDA)
### 1. Distribution of Fuel Type
Visualizing the count of vehicles by fuel type.

In [None]:

# Number of rows per fuel type, sorted by count in descending order
fuel_type_counts = data.groupby('Fuel Type').size().sort_values(ascending=False)
fuel_type_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))

# Style the axes that the plot call just made current
ax = plt.gca()
ax.spines[['top', 'right']].set_visible(False)
ax.set_title('Fuel Type Count')
ax.set_xlabel('Count')
ax.set_ylabel('Fuel Type')
plt.show()


### 2. Average Vehicle Population by Fuel Type

In [None]:

# Average 'Vehicle Population' per fuel type, sorted in descending order
mean_population = data.groupby('Fuel Type')['Vehicle Population'].mean()
mean_population = mean_population.sort_values(ascending=False)
mean_population.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))

# Style the axes that the plot call just made current
ax = plt.gca()
ax.spines[['top', 'right']].set_visible(False)
ax.set_title('Mean Vehicle Population by Fuel Type')
ax.set_xlabel('Mean Vehicle Population')
ax.set_ylabel('Fuel Type')
plt.show()


### 3. Distribution of Vehicles Registered at the Same Address

In [None]:

# Number of rows per 'Number of Vehicles Registered at the Same Address' value
same_address_counts = data.groupby('Number of Vehicles Registered at the Same Address').size()
same_address_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))

# Style the axes that the plot call just made current
ax = plt.gca()
ax.spines[['top', 'right']].set_visible(False)
ax.set_xlabel('Count')
ax.set_ylabel('Number of Vehicles Registered at the Same Address')
ax.set_title('Number of Vehicles Registered at the Same Address')
plt.show()


## One-Hot Encoding
Convert categorical features to numerical using one-hot encoding.

In [None]:

def Onehotencoding(df):
    """One-hot encode the non-numeric columns of ``df`` and split off the target.

    Args:
        df: DataFrame holding the feature columns and a numeric
            'Vehicle Population' column.

    Returns:
        A tuple ``(X, y)``: ``X`` is the encoded feature DataFrame without
        the target column and ``y`` is the 'Vehicle Population' Series.

    Raises:
        KeyError: If the encoded frame has no 'Vehicle Population' column.
    """
    # Every non-numeric column is treated as categorical
    categorical_cols = df.select_dtypes(exclude=np.number).columns

    # dtype=bool is pinned because the default dummy dtype differs between
    # pandas versions (uint8 before 2.0, bool from 2.0 on), and KNNimputation
    # identifies the dummy columns by their being non-numeric.
    # drop_first=True drops the first level of each column; a missing value
    # yields an all-False row for that column's dummies.
    df_encoded = pd.get_dummies(
        df,
        columns=categorical_cols,
        drop_first=True,
        dtype=bool,
    )

    # Separate features and target variable
    X = df_encoded.drop(columns='Vehicle Population')
    y = df_encoded['Vehicle Population']
    return X, y


## KNN Imputation
Handle missing values using K-Nearest Neighbors (KNN) imputation.

In [None]:
def KNNimputation(X_encoded):
    """Fill missing values in an encoded feature frame with KNN imputation.

    Numeric columns are standardized before the neighbour search so that
    large-magnitude columns do not dominate the distance, and are mapped back
    to their original units afterwards. Non-numeric columns (the boolean
    dummies) enter the imputer as 0.0/1.0 and are rounded back to integers.

    Args:
        X_encoded: Feature DataFrame whose columns are all numeric or boolean.

    Returns:
        A new DataFrame with the same columns as ``X_encoded`` and a fresh
        ``RangeIndex``. An empty input is returned as-is apart from the
        index reset.
    """
    # Nothing to impute; the scaler and imputer both reject empty input
    if X_encoded.empty:
        return X_encoded.reset_index(drop=True)

    # Separate numerical and categorical (non-numeric) columns
    numerical_cols = X_encoded.select_dtypes(include=np.number).columns
    categorical_cols = [col for col in X_encoded.columns if col not in numerical_cols]

    # Work on an all-float copy so scaled values and 0/1 dummies share one dtype
    X_scaled_df = X_encoded.astype(float)

    scaler = StandardScaler()
    has_numeric = len(numerical_cols) > 0
    if has_numeric:
        # The scaled values must be what the imputer sees; otherwise the
        # inverse_transform below would be applied to unscaled data.
        X_scaled_df[numerical_cols] = scaler.fit_transform(X_encoded[numerical_cols])

    # Never ask for more neighbours than there are rows
    imputer = KNNImputer(n_neighbors=min(3, X_encoded.shape[0]))
    # NOTE(review): KNNImputer drops columns with no observed values by
    # default, which makes the DataFrame construction below fail — confirm
    # no column can be entirely missing.
    X_imputed_array = imputer.fit_transform(X_scaled_df)
    X_imputed_df = pd.DataFrame(X_imputed_array, columns=X_encoded.columns)

    if has_numeric:
        # Undo the standardization so numeric columns are back in original units
        X_imputed_df[numerical_cols] = scaler.inverse_transform(X_imputed_df[numerical_cols])

    if categorical_cols:
        # Dummy columns must stay 0/1 integers
        X_imputed_df[categorical_cols] = X_imputed_df[categorical_cols].round().astype(int)

    return X_imputed_df

## Data Processing Pipeline
Combines one-hot encoding and KNN imputation into a single function.

In [None]:

def Processing(df):
    """Encode, impute and reassemble one DataFrame.

    Runs ``Onehotencoding`` to obtain features and target, fills missing
    feature values with ``KNNimputation``, and returns the imputed features
    with the 'Vehicle Population' column appended as the last column. Both
    parts are re-indexed from zero before being joined side by side.
    """
    features, target = Onehotencoding(df)

    # Give both pieces a fresh positional index so the column-wise concat
    # lines rows up by position
    imputed_features = KNNimputation(features).reset_index(drop=True)
    target_frame = pd.DataFrame(
        target.reset_index(drop=True),
        columns=['Vehicle Population'],
    )

    return pd.concat([imputed_features, target_frame], axis=1)


In [None]:
# Slice each year from the current `data` frame rather than from `data_dict`:
# the `data_dict` subsets were created before the feature-engineering cell
# ran, so they still carry the dropped columns and lack 'Year Diff'.
processed_df_2019 = Processing(data[data['Date'] == 2019])
processed_df_2020 = Processing(data[data['Date'] == 2020])
processed_df_2021 = Processing(data[data['Date'] == 2021])
processed_df_2022 = Processing(data[data['Date'] == 2022])
processed_df_2023 = Processing(data[data['Date'] == 2023])
processed__dataset = pd.concat(
    [processed_df_2019, processed_df_2020, processed_df_2021, processed_df_2022, processed_df_2023],
    axis=0,
    ignore_index=True,
)

# Each year is one-hot encoded on its own, so a category that never occurs in
# a given year has no dummy column there and concat fills it with NaN.
# A missing dummy column means "category absent", i.e. 0.
# NOTE(review): drop_first may also drop a different level per year; encoding
# once over all years would avoid that — confirm whether it matters here.
numeric_source_cols = set(data.select_dtypes(include=np.number).columns)
dummy_cols = [col for col in processed__dataset.columns if col not in numeric_source_cols]
if dummy_cols:
    processed__dataset[dummy_cols] = processed__dataset[dummy_cols].fillna(0)

In [None]:
# Write the combined dataset to a CSV file, leaving out the row index
output_csv = 'processed_dataset.csv'
processed__dataset.to_csv(output_csv, index=False)

## Handling Data Imbalance
SMOTE is intended for balancing the dataset before modeling. It is not applied yet: the cell below is a commented-out placeholder.

In [None]:

# Placeholder for SMOTE handling (not yet implemented in the provided code)
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X, y)
