# Random Forest Model Prediction 

In [1]:
import joblib
from datetime import datetime
import pandas as pd 

In [2]:
X_master = pd.read_csv('../data/processed/master_features.csv')
tracking = pd.read_csv('../data/processed/tracking_labels.csv')
loyalty_numbers = tracking['Loyalty Number'].copy()
y = tracking['Churn']

In [3]:
rf_features = [
    'Gender', 'Education', 'Marital Status', 'Salary',
    'Loyalty Card', 'CLV', 'Enrollment Type',
    'Province',
    'Total Flights', 'Distance',
    'Points Accumulated', 'Points Redeemed', 'Dollar Cost Points Redeemed',
    'Activity 1 Month Before', 'Activity 2 Months Before', 'Activity 3 Months Before',
    'Points Most Recent', 'Overall Trend', 'Avg Monthly Points', 'Activity Volatility',
    'Customer Age (Years)']

X = X_master[rf_features].copy()

print("=== FEATURES SELECTED ===")
print(f"Total features: {X.columns}")

=== FEATURES SELECTED ===
Total features: Index(['Gender', 'Education', 'Marital Status', 'Salary', 'Loyalty Card',
       'CLV', 'Enrollment Type', 'Province', 'Total Flights', 'Distance',
       'Points Accumulated', 'Points Redeemed', 'Dollar Cost Points Redeemed',
       'Activity 1 Month Before', 'Activity 2 Months Before',
       'Activity 3 Months Before', 'Points Most Recent', 'Overall Trend',
       'Avg Monthly Points', 'Activity Volatility', 'Customer Age (Years)'],
      dtype='str')


In [4]:
categorical_cols = ['Gender', 'Education', 'Marital Status', 'Loyalty Card', 'Enrollment Type', 'Province']
X_all_encoded = pd.get_dummies(X, columns= categorical_cols, drop_first= True)
X_all_encoded.head()

Unnamed: 0,Salary,CLV,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed,Activity 1 Month Before,Activity 2 Months Before,Activity 3 Months Before,...,Province_British Columbia,Province_Manitoba,Province_New Brunswick,Province_Newfoundland,Province_Nova Scotia,Province_Ontario,Province_Prince Edward Island,Province_Quebec,Province_Saskatchewan,Province_Yukon
0,92552.0,7919.2,46,81190,81190.0,1513,272,5367.0,1973.0,4212.0,...,False,False,False,False,False,False,False,False,False,False
1,73522.0,2887.74,51,68918,68918.0,1195,215,2204.0,4003.0,4655.0,...,False,False,False,False,False,True,False,False,False,False
2,73522.0,2838.07,47,72856,72856.0,593,107,2728.0,5960.0,7232.0,...,True,False,False,False,False,False,False,False,False,False
3,63253.0,4170.57,22,38236,38236.0,861,155,5289.0,0.0,6032.0,...,True,False,False,False,False,False,False,False,False,False
4,91163.0,6622.05,37,54997,54997.0,1007,182,377.0,2297.0,12195.0,...,False,False,False,False,False,True,False,False,False,False


In [5]:
rf_model = joblib.load('../models/churn_model.pkl')
predictions = rf_model.predict_proba(X_all_encoded)[:, 1]


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    0.2s finished


In [6]:
# Create risk categories based on churn probability
def categorize_risk(prob):
    if prob >= 0.7:
        return 'Critical'
    elif prob >= 0.5:
        return 'High'
    elif prob >= 0.35:
        return 'Medium'
    else:
        return 'Low'
    
# 3. Create risk categories
risk_categories_all = [categorize_risk(prob) for prob in predictions]

In [7]:
predictions_df = pd.DataFrame({
    'Loyalty Number': loyalty_numbers,
    'Churn Risk Score': predictions.round(4),
    'Churn Risk %': (predictions * 100).round(2),
    'Risk Category': risk_categories_all,
    'Prediction Date': datetime.now().strftime('%Y-%m-%d')
})

if predictions_df.count().nunique() != 1:
    print('Error: Mismatching Number of Values!')
else:
    print('Perfect File Created!')

print('===== Predictions File =====')
print('Value      |      Count')
print(predictions_df.count())
          
predictions_df.head(10)

Perfect File Created!
===== Predictions File =====
Value      |      Count
Loyalty Number      15176
Churn Risk Score    15176
Churn Risk %        15176
Risk Category       15176
Prediction Date     15176
dtype: int64


Unnamed: 0,Loyalty Number,Churn Risk Score,Churn Risk %,Risk Category,Prediction Date
0,100018,0.3138,31.38,Low,2026-02-04
1,100102,0.2512,25.12,Low,2026-02-04
2,100140,0.233,23.3,Low,2026-02-04
3,100214,0.5754,57.54,High,2026-02-04
4,100272,0.4079,40.79,Medium,2026-02-04
5,100301,0.1372,13.72,Low,2026-02-04
6,100364,0.3532,35.32,Medium,2026-02-04
7,100380,0.4926,49.26,Medium,2026-02-04
8,100428,0.3202,32.02,Low,2026-02-04
9,100504,0.7639,76.39,Critical,2026-02-04


In [9]:
predictions_df.to_csv('../predictions/churn_predictions.csv')