# 2026 Vehicle Renewal Prediction

This notebook builds a machine learning model to predict the likelihood of vehicle renewals in 2026 using 2025 renewal data.

In [None]:
# STEP 0: IMPORT LIBRARIES

import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
## Step 1: Load Data
## The dataset is loaded from a CSV file exported from Excel and cleaned for analysis.

# Location of the 2025 renewals export. The machine-specific path is kept as
# the default so existing runs behave the same; set the RENEWALS_CSV
# environment variable to load the file from anywhere else without editing
# the notebook.
DATA_PATH = os.environ.get(
    "RENEWALS_CSV",
    "C:/Users/REGINALD/Documents/2026 Projects/2025_RENEWALS.csv",
)
df_2025 = pd.read_csv(DATA_PATH)

In [None]:
# STEP 2: CLEAN COLUMN NAMES
# Remove leading/trailing whitespace from every header so later cells can
# refer to columns by their exact names.

stripped_headers = df_2025.columns.str.strip()
df_2025.columns = stripped_headers


In [None]:
## Step 3: Data Cleaning
## Dates are cleaned and converted into a proper datetime format to enable feature extraction.
## Invalid rows are removed.
# NOTE: this cell replaces the text DATE column with parsed datetimes, so it
# must run exactly once after loading; the .str accessor below needs the
# original string values.

# Strip ordinal suffixes only when they directly follow the day digit
# ("21st" -> "21"). Anchoring on the digit leaves the same letters inside
# other words (e.g. the "st" in "August") untouched.
ORDINAL_SUFFIX = r'(?<=\d)(?:st|nd|rd|th)'
df_2025['DATE_CLEAN'] = df_2025['DATE'].str.replace(ORDINAL_SUFFIX, '', regex=True)

# The cleaned text is parsed as day + abbreviated month + year, so the year is appended here.
df_2025['DATE_CLEAN'] = df_2025['DATE_CLEAN'] + ' 2025'

# Values that do not match the format become NaT instead of raising.
df_2025['DATE'] = pd.to_datetime(df_2025['DATE_CLEAN'], format='%d %b %Y', errors='coerce')

# Drop rows whose date could not be parsed. The explicit copy makes the result
# an independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
df_2025 = df_2025.dropna(subset=['DATE']).copy()


In [None]:
## Step 4: Feature Engineering
## Three model inputs are derived: registration month (seasonality),
## contact availability, and Sacco size.

# Calendar month of the parsed registration date.
registration_dates = df_2025['DATE']
df_2025['Reg_Month'] = registration_dates.dt.month

# 1 when a CONTACT value is present, 0 when it is missing.
contact_present = df_2025['CONTACT'].notna()
df_2025['Has_Contact'] = contact_present.astype(int)

# Count of non-null REGISTRATION values within each row's SACCO group,
# broadcast back onto every row of that group.
registrations_per_sacco = df_2025.groupby('SACCO')['REGISTRATION']
df_2025['Sacco_Size'] = registrations_per_sacco.transform('count')

In [None]:
# STEP 5: SIMULATE TARGET
# The labels below are drawn at random (20% zeros, 80% ones) and do not depend
# on any feature, so every metric and prediction produced from them reflects
# this simulation rather than observed renewals.

np.random.seed(42)  # fixed seed so the simulated labels are reproducible
row_count = len(df_2025)
label_values = [0, 1]  # presumably 0 = not renewed, 1 = renewed - confirm
label_weights = [0.2, 0.8]
df_2025['Renewed_2026'] = np.random.choice(label_values, size=row_count, p=label_weights)

In [None]:
# STEP 6: FEATURES AND TARGET
# X holds the three engineered input columns; y holds the simulated label.

features = ['Reg_Month', 'Has_Contact', 'Sacco_Size']
target_column = 'Renewed_2026'
X, y = df_2025[features], df_2025[target_column]

In [None]:
## Step 7: Model Training
## A Random Forest classifier is trained to learn renewal patterns.

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# fit() returns the fitted estimator, so construction and training are chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = model.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_report = classification_report(y_test, y_pred)

print("===== MODEL EVALUATION =====")
print("Accuracy:", holdout_accuracy)
print(holdout_report)


In [None]:
## Step 8: 2026 Renewal Prediction
## The model generates renewal probabilities and predictions for each customer.
# NOTE: X contains every row, including those the model was fitted on, so
# these scores are not an out-of-sample estimate.

# predict_proba returns one column per class, ordered like model.classes_.
# Look up the column for class 1 instead of assuming it sits at position 1,
# which raises IndexError when the model was fitted on a single class.
class_probabilities = model.predict_proba(X)
fitted_classes = list(model.classes_)
if 1 in fitted_classes:
    df_2025['Renewal_Prob_2026'] = class_probabilities[:, fitted_classes.index(1)]
else:
    # Class 1 never appeared in the training labels, so its probability is 0.
    df_2025['Renewal_Prob_2026'] = 0.0

df_2025['Predicted_Renewal_2026'] = model.predict(X)

In [None]:
# Keep two decimal places on the stored probability
rounded_probability = df_2025['Renewal_Prob_2026'].round(2)
df_2025['Renewal_Prob_2026'] = rounded_probability


# STEP 9: DISPLAY ALL DATA WITH STYLING


# Lift pandas' row and column display limits for the rest of the session
for display_limit in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_limit, None)

# Select the reporting columns and order rows from highest to lowest probability
result_columns = ['REGISTRATION', 'SACCO', 'NAME', 'CONTACT',
                  'Renewal_Prob_2026', 'Predicted_Renewal_2026']
df_results = df_2025[result_columns].sort_values(by='Renewal_Prob_2026', ascending=False)

# Use Pandas styling to highlight high probabilities
def highlight_high_prob(val, high=0.8, medium=0.5):
    """Return the CSS background style for one probability cell.

    Parameters
    ----------
    val : float
        The probability shown in the cell.
    high : float, optional
        Values at or above this threshold are shaded light green (default 0.8).
    medium : float, optional
        Values at or above this threshold, but below ``high``, are shaded
        light yellow (default 0.5).

    Returns
    -------
    str
        A CSS declaration, or an empty string when no shading applies.
        NaN compares False against both thresholds and so gets no shading.
    """
    if val >= high:
        return 'background-color: lightgreen'
    if val >= medium:
        return 'background-color: lightyellow'
    return ''

# Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name is
# deprecated; use the new method when it exists and fall back on older pandas.
results_styler = df_results.style
apply_cellwise = getattr(results_styler, 'map', None) or results_styler.applymap

# Shade each probability cell, and render the column with two decimals
# (the Styler's default float precision would otherwise show six).
styled_table = apply_cellwise(
    highlight_high_prob, subset=['Renewal_Prob_2026']
).format({'Renewal_Prob_2026': '{:.2f}'})

# Display the styled table
styled_table
