# Model Training on Processed data
>Also we will do the model evalution

In [1]:
import pickle
import numpy as np
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Load the data

In [2]:
with open('./data/X_smote.pkl', 'rb') as file:
    X_train_smote = pickle.load(file)
with open('./data/y_smote.pkl', 'rb') as file:
    y_train_smote = pickle.load(file)

In [3]:
X_train_smote.head()

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled
0,109.0,13.222629,54.614678,0.0,0,0,1,1,3,16.911603,108.727123,41.0,0,1.365587,8.0,0,15.0,0,1
1,7.0,15.282497,13.065042,1.0,2,1,1,0,0,6.513209,5.6148,36.0,3,4.716851,0.0,1,23.0,1,0
2,72.0,7.908776,32.771247,2.0,3,1,0,1,0,29.005506,83.297789,14.0,3,4.531153,4.0,1,12.0,0,0
3,10.0,12.991161,14.538057,0.0,1,1,2,0,3,13.256101,63.559086,45.0,1,1.193549,9.0,0,0.0,0,1
4,112.0,15.747117,61.028191,1.0,3,0,0,0,3,4.688291,5.807534,40.0,0,4.128238,4.0,1,12.0,0,1


## Model Training with default hyperparameters

In [4]:
# dictionary of models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "RandomForestClassifier": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

In [6]:
# dictionary to store the cv results
cv_scores = {}

for model_name, model in models.items():
    print(f"Training {model_name} with default parameters")
    scores = cross_val_score(model, X_train_smote, y_train_smote, cv = 5, scoring='accuracy')
    cv_scores[model_name] = scores
    print(f"{model_name} cv accuracy: {np.mean(scores):.2f}")
    print("="*21)

Training Decision Tree with default parameters
Decision Tree cv accuracy: 0.77
Training RandomForestClassifier with default parameters
RandomForestClassifier cv accuracy: 0.85
Training XGBoost with default parameters
XGBoost cv accuracy: 0.86


`XGBClassifier` gives the highest accuracy than other models with default parameters.

In [27]:
xgb = XGBClassifier(random_state=42)

In [28]:
xgb.fit(X_train_smote, y_train_smote)

### Model Evalution

In [29]:
with open('./data/X_test.pkl', 'rb') as file:
    X_test = pickle.load(file)
with open('./data/y_test.pkl', 'rb') as file:
    y_test = pickle.load(file)

In [30]:
# make prediction
y_pred = xgb.predict(X_test)

In [31]:
# Evaluate
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [32]:
# Print Results
print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Accuracy: 0.81840928668116

Confusion Matrix:
 [[38470  1498]
 [ 7356  1434]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.96      0.90     39968
         1.0       0.49      0.16      0.24      8790

    accuracy                           0.82     48758
   macro avg       0.66      0.56      0.57     48758
weighted avg       0.78      0.82      0.78     48758



- `Strong Performance for Class 0.0`: The model performs well for class 0.0, with high precision and recall.
- `Poor Performance for Class 1.0`: The model struggles significantly with class 1.0, exhibiting low precision and very low recall. 
- Accuracy: 0.82 (82%)
>Overall, the model correctly predicted 82% of the instances.

### Save the model for future use:

In [33]:
with open('./data/xgb_model.pkl', 'wb') as file:
    pickle.dump(xgb, file)

### Load the saved model and build the predictive system

In [34]:
with open('./data/xgb_model.pkl', 'rb') as file:
    xgb_model = pickle.load(file)

### lets take a raw data and make predciction

In [57]:
customer_data = {
    "AccountAge": 20,
    "MonthlyCharges": 11.055215098286784,
    "TotalCharges": 221.10430196573566,
    "SubscriptionType": "Premium",
    "PaymentMethod": "Mailed check",
    "PaperlessBilling": "No",
    "ContentType": "Both",
    "MultiDeviceAccess": "No",
    "DeviceRegistered": "Mobile",
    "ViewingHoursPerWeek": 36.75810391025656,
    "AverageViewingDuration": 63.53137733399087,
    "ContentDownloadsPerMonth": 10,
    "GenrePreference": "Sci-Fi",
    "UserRating": 2.1764975145384615,
    "SupportTicketsPerMonth": 4,
    "Gender": "Male",
    "WatchlistSize": 3,
    "ParentalControl": "No",
    "SubtitlesEnabled": "No",
}

In [58]:
import pandas as pd

In [116]:
new_df = pd.DataFrame([customer_data])

In [60]:
new_df

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled
0,20,11.055215,221.104302,Premium,Mailed check,No,Both,No,Mobile,36.758104,63.531377,10,Sci-Fi,2.176498,4,Male,3,No,No


In [73]:
import sys
import os
notebook_dir = os.getcwd()
src_dir = os.path.join(notebook_dir, '..', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)  # Add the src directory to sys.path

In [76]:
from utils import extract_num_columns, extract_cat_columns, transform_column, make_float

In [63]:
num = extract_num_columns(new_df)[:-1]
cat =  extract_cat_columns(new_df)
print(num,"\n", cat)

Index(['AccountAge', 'MonthlyCharges', 'TotalCharges', 'ViewingHoursPerWeek',
       'AverageViewingDuration', 'ContentDownloadsPerMonth', 'UserRating',
       'SupportTicketsPerMonth'],
      dtype='object') 
 Index(['SubscriptionType', 'PaymentMethod', 'PaperlessBilling', 'ContentType',
       'MultiDeviceAccess', 'DeviceRegistered', 'GenrePreference', 'Gender',
       'ParentalControl', 'SubtitlesEnabled'],
      dtype='object')


In [64]:
new_df = make_float(new_df, num) # converts into float

In [65]:
new_df['TotalCharges'] = transform_column(new_df['TotalCharges'])

In [108]:
new_df

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled
0,20,11.055215,221.104302,1.0,Mailed check,No,Both,No,Mobile,36.758104,63.531377,10,Sci-Fi,2.176498,4,Male,3,No,No


In [67]:
with open('./data/encoders.pkl', 'rb') as file:
    encoders = pickle.load(file)

In [111]:
def encode_transform(df, columns, encoders):
    # Apply the encoding to the training data.
    for column in columns:
        encoder = encoders[column]
        if column != "SubscriptionType":
            df[column] = encoder.transform(df[column])
        else:
            df[column] = encoder.transform(df[[column]]) #fit transform expects a 2d array.
    
    return df 

In [117]:
# from utils import encode_transform
new_df = encode_transform(new_df, cat, encoders)

In [118]:
new_df.shape

(1, 19)

### make the prediction

In [124]:
result = xgb_model.predict(new_df)[0]
pred_prob = xgb_model.predict_proba(new_df)

In [125]:
print(f"Prediction: {'Churn' if result == 0 else 'No Churn'}")
print(f"Prediction probability: {pred_prob}")

Prediction: Churn
Prediction probability: [[0.67893076 0.32106924]]
