# Title of ML project : Telangana Regional Transport Authority Vehicle Online Sales Data 01-01-2025 to 31-01-2025

# Name : Renjitha E R

# Organization : Entri Elevate

![Bus Image](/full/path/to/86419624.webp)




# Overview of Problem Statement

In 2025, a significant number of vehicles will require insurance renewal. The goal of this project is to predict the number of vehicles that need insurance renewal in 2025 using machine learning techniques. This prediction is crucial for insurance companies, government agencies, and vehicle owners to plan ahead, allocate resources efficiently, and ensure seamless insurance renewal processes.

# Objective

Predicting the number of vehicles that need insurance renewal in 2025

# Data Description

Source Of Data: The dataset collected from Telengana Government Site

# Feature

slno                   :	Serial number of the record.
modelDesc              :	Description of the vehicle model.
fuel                   :	Type of fuel used (e.g., Petrol, Diesel, Electric).
colour                 :	Vehicle color.
vehicleClass           :	Category of the vehicle (e.g., Motor Cycle, Goods Carriage).
makeYear               :	The manufacturing year of the vehicle.
seatCapacity           :	Number of seats in the vehicle.
insuranceValidity      :	The expiry date of the vehicle's insurance.
secondVehicle          :	Indicates whether this is the owner's second vehicle (Y or N).
tempRegistrationNumber :	Temporary registration number of the vehicle.
category               :	Specifies if the vehicle is Transport or Non-Transport.
makerName              : 	Name of the vehicle manufacturer (e.g., Honda, Bajaj, Ashok Leyland).
OfficeCd               :	Code of the registering office (RTO).
fromdate               :	Registration start date.
to date                :	Registration end date.

# Data Collection

In [117]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import skew
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib

In [118]:
# Load the dataset
df =pd.read_csv('ts_transport_online_sales_01_01_2025to31_01_2025.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'ts_transport_online_sales_01_01_2025to31_01_2025.csv'

In [None]:
df.head(5)

In [None]:
df.tail(5)

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.drop(columns=['fuel', 'colour', 'seatCapacity'], inplace=True)

In [None]:
df.shape

# Data Preprocessing - Data Cleaning

In [None]:
# Convert 'insuranceValidity' to datetime
df['insuranceValidity'] = pd.to_datetime(df['insuranceValidity'],errors='coerce')

In [None]:
# Extract the year of insurance validity
df['insuranceYear'] = df['insuranceValidity'].dt.year

In [None]:
print(df)

In [None]:
# Count vehicles needing renewal in 2025
renewals_2025 = df[df['insuranceYear'] == 2025].shape[0]
print("Number of vehicles needing renewal in 2025:", renewals_2025)

In [None]:
df.isnull().sum()

In [None]:
df = df.dropna(subset=['insuranceValidity'])

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
# Box Plot for visualize outliers
columns = df.select_dtypes(include = ['number'])

for col in columns.columns:
    plt.figure(figsize=(10,6))
    columns.boxplot(column = col)
    plt.title(f" Box Plot for {col}")
    plt.ylabel('Values')
    plt.show()

In [None]:
# Outlier's removel with IQR
def handle_outlier(df):
    for col in df.select_dtypes(include = ['int64','float64']).columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)

        IQR = Q3 - Q1

        lower = Q1 - 1.5 *IQR
        upper = Q3 + 1.5 *IQR

        df[col] = df[col].apply(lambda x: 
        lower if x<lower else 
        upper if x>upper else
        x)
    return df
df1 = handle_outlier(df)

In [None]:
# Box-Plot after removing outliers
column = df1.select_dtypes(include=['int64','float64'])

for col in column.columns:
    plt.figure()
    column.boxplot(column = col)
    plt.title(f"Box Plot for {col} after IQR")
    plt.ylabel('Value')
    plt.show()

# Exploratory Data Analysis (EDA)

In [None]:
print(df.columns)

In [None]:
# Pie Chart - Distribution of vehicle categories
plt.figure(figsize=(8, 8))
df['category'].value_counts().plot.pie(autopct='%1.1f%%', cmap='coolwarm')
plt.title("Vehicle Category Distribution")
plt.ylabel("")
plt.show()

In [None]:
# Bar Plot - Top 10 vehicle makers
top_makers = df['makerName'].value_counts().head(10)
top_makers.plot(kind='bar', color='skyblue')
plt.title("Top 10 Vehicle Makers")
plt.xlabel("Maker Name")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Histogram for Insurance Year Distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['insuranceYear'].dropna(), bins=20, kde=False, color='blue')
plt.title("Insurance Year Distribution (Histogram)")
plt.xlabel("Insurance Year")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Kernel Density Estimation (KDE) for Insurance Year
plt.figure(figsize=(10, 6))
sns.kdeplot(df['insuranceYear'].dropna(), fill=True, color='green')
plt.title("Kernel Density Estimation (KDE) for Insurance Year")
plt.xlabel("Insurance Year")
plt.ylabel("Density")
plt.show()

In [None]:
# Generate heatmap for numerical columns only
numerical_df = df.select_dtypes(include=['number'])  # Select only numeric columns
plt.figure(figsize=(10, 6))
sns.heatmap(numerical_df.corr(), annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()

# Feature Selection

In [None]:
# Splitting the dataset into training and testing sets
X = df1.drop('insuranceYear', axis=1)
y = df1['insuranceYear']

In [None]:
from sklearn.preprocessing import LabelEncoder

categorical_columns = ['vehicleClass', 'secondVehicle', 'category', 'OfficeCd', 'fromdate', 'todate', 'makerName', 'modelDesc', 'makeYear', 'tempRegistrationNumber']  # List your categorical columns

# Apply Label Encoding to each categorical column
encoder = LabelEncoder()
for col in categorical_columns:
    X[col] = encoder.fit_transform(X[col])

# Check transformed data
print(X.head())

In [None]:
# Convert datetime columns to numerical values
def preprocess_data(X):
    for col in X.select_dtypes(include=['datetime64']):  # Identify datetime columns
        X[col] = X[col].astype('int64') // 10**9  # Convert to Unix timestamp (seconds)
    return X

# Function to select best K features for regression
def select_best_features(X, y, K=10):
    X = preprocess_data(X)  # Convert datetime columns
    selector = SelectKBest(score_func=f_regression, k=K)  # Use f_regression for regression problems
    X_new = selector.fit_transform(X, y)  # Apply feature selection
    
    # Get feature scores
    feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': selector.scores_})
    feature_scores = feature_scores.sort_values(by='Score', ascending=False)
    
    print("Top Selected Features:\n", feature_scores.head(K))  # Print selected features
    
    return X_new, feature_scores  # Return both selected features and scores

# Set K (number of top features to select)
K = 10

# Preprocess and Apply feature selection
X = preprocess_data(X)
X_selected, feature_scores = select_best_features(X, y, K)

# Print all feature scores (optional)
print("\nAll Feature Scores:\n", feature_scores)

# Split Data into Training and Testing Sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Check the shapes of the resulting datasets
print("Training feature set shape:", X_train.shape)
print("Testing feature set shape:", X_test.shape)
print("Training target set shape:", y_train.shape)
print("Testing target set shape:", y_test.shape)

# Feature Scaling

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.fit_transform(X_test)

# Build the ML Model

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [None]:
models = {
    '1. Linear Regression' : LinearRegression(),
    '2. Decision Tree Regressor' : DecisionTreeRegressor(),
    '3. Random Forest Regressor' : RandomForestRegressor(),
    '4. Gradient Boosting Regressor' : GradientBoostingRegressor(),
    '5. Support Vector Regressor' : SVR(),
    '6. Lasso Regression' : Lasso(),
    '7. AdaBoost Regressor' : AdaBoostRegressor()
}

# Model Evaluation

In [None]:
# Train and evaluate models
results = {}
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    results[model_name] = {'MSE': mse, 'MAE': mae, 'R² Score': r2, 'RMSE': rmse}

# Convert results to DataFrame
import pandas as pd
results_df = pd.DataFrame(results).T
print(results_df)

In [None]:
# Finding The Best Model
best_model = results_df['R² Score'].idxmax()
print(f'The best Model based on R2 Score is:\n {best_model}')
print(results_df.loc[best_model])

# Hyperparameter Tuning

In [None]:
# Define the parameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10,],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

In [None]:
# Initialize the Grid Search
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid, 
                           cv=5, scoring='neg_mean_squared_error',verbose =1, n_jobs=-1)

In [None]:
# Fit the Grid Search to the data
grid_search.fit(X_train_scaled, y_train)

best_param = grid_search.best_params_
best_score = grid_search.best_score_

# Get the best parameters and score
print(f"Best Parameters: {best_param}")
print(f"Best Cross-Validation MSE Score: {-best_score}")

In [None]:
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Predict on the test data
y_test_pred = best_model.predict(X_test_scaled)

# Evaluate the model
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Printing the results
print(f"Test MSE: {test_mse}")
print(f"Test MAE: {test_mae}")
print(f"Test R2: {test_r2}")
print(f"Test RMSE: {test_rmse}")

In [None]:
# Evaluating Model Performance with Residual Analysis

# Predicting values  
y_train_pred = best_model.predict(X_train_scaled)  
residuals = y_train - y_train_pred

In [None]:
# Cross-validate the model  
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')  
print(f'Cross-Validation MSE: {-cv_scores.mean()}')

# Pipeline Creation

In [None]:
# spliting data into X and y
X = df1.drop(['insuranceYear'], axis=1)
y = df1['insuranceYear']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.compose import ColumnTransformer

# Separate numeric and non-numeric columns
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Preprocessing for numeric data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Final pipeline with the regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=50,              # Best number of estimators from GridSearchCV
        min_samples_split=5,          # Best min_samples_split from GridSearchCV
        min_samples_leaf=2,           # Best min_samples_leaf from GridSearchCV
        max_depth=10,                 # Best max_depth from GridSearchCV
        random_state=42))])           # Ensure reproducibility

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Saving the Model

In [None]:
# Save the entire pipeline
joblib.dump(pipeline,'random_forest_pipeline.joblib')

In [None]:
loaded_pipeline = joblib.load('random_forest_pipeline.joblib')

In [172]:
# Select first 5,000 rows from original dataset
unseen_data = df.sample(n=5000, random_state=42).reset_index(drop=True)

# Save unseen data to CSV
unseen_data.to_csv("unseen_data.csv", index=False)

print("Unseen dataset created and saved as 'unseen_data.csv'")

Unseen dataset created and saved as 'unseen_data.csv'


In [176]:
# Load unseen dataset 
unseen_data_df = pd.read_csv("unseen_data.csv")
print("Unseen data successfully loaded!")

Unseen data successfully loaded!


In [178]:
# Check Unseen data to confirm data is loaded
print("Preview of Unseen Data:")
print(unseen_data_df.head())

Preview of Unseen Data:
    slno                              modelDesc vehicleClass    makeYear  \
0  60769             TVS - JUPITER 125 BSVI-PH2  MOTOR CYCLE  01/01/2025   
1   3188  INNOVA CRYSTA 2.4Z (MT) (7S) BSVI-PH2    MOTOR CAR  01/11/2024   
2  22289                SHINE 125 DISK BSVI-PH2  MOTOR CYCLE  01/08/2024   
3  41692              ACTIVA 125 DISC. BSVI-PH2  MOTOR CYCLE  01/01/2025   
4  40024              SPLENDOR+ -(DRS) BSVI-PH2  MOTOR CYCLE  01/11/2024   

  insuranceValidity secondVehicle tempRegistrationNumber       category  \
0        2030-01-27             N            TG02CTR6572  Non Transport   
1        2028-01-01             Y            TG03BTR0063  Non Transport   
2        2030-01-08             N            TG02CTR5145  Non Transport   
3        2030-01-19             Y            TG12ATR8873  Non Transport   
4        2030-01-18             N             TG31TR3822  Non Transport   

                        makerName          OfficeCd    fromdate     

In [182]:
# Drop target column if present (ensure we are not using labels)
if 'insuranceYear' in unseen_data_df.columns:
    unseen_X = unseen_data_df.drop(columns=['insuranceYear'])
else:
    unseen_X = unseen_data_df.copy()

In [184]:
# Load the saved pipeline (Check if the file exists)
try:
    loaded_pipeline = joblib.load("random_forest_pipeline.joblib")
    print("Pipeline successfully loaded!")
except FileNotFoundError:
    print("Error: The file 'risk_malware_pipeline.pkl' was not found. Please check the file path.")
    exit()

Pipeline successfully loaded!


In [186]:
#  Make predictions on unseen data
unseen_predictions = loaded_pipeline.predict(unseen_X)

In [190]:
# Add predictions to the dataset
unseen_data_df['Predicted_Insurance_Year'] = unseen_predictions

In [192]:
# Save the results
unseen_data_df.to_csv("unseen_predictions.csv", index=False)

print("unseen data saved as 'unseen_predictions.csv'")

unseen data saved as 'unseen_predictions.csv'


In [198]:
from sklearn.metrics import accuracy_score, regression_report

# Check if actual labels are present in unseen dataset
if 'insuranceYear' in unseen_data_df.columns:
    # Extract actual labels
    actual_labels = unseen_data_df['insuranceYear']
    
    # Evaluate model performance
    accuracy = accuracy_score(actual_labels, unseen_predictions)
    print(f"Accuracy on Unseen Data: {accuracy:.4f}\n")

    print("Regression Report:")
    print(regression_report(actual_labels, unseen_predictions))

else:
    print("No actual labels found in the unseen dataset. Evaluation cannot be performed.")

ImportError: cannot import name 'regression_report' from 'sklearn.metrics' (C:\Users\user\anaconda3\Lib\site-packages\sklearn\metrics\__init__.py)