In [2]:
import os
import pickle
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.cluster import KMeans
import kagglehub

path = kagglehub.dataset_download("jpacse/datasets-for-churn-telecom")
files = os.listdir(path)
train_file = os.path.join(path, files[0])
df = pd.read_csv(train_file)


numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()


num_imputer = SimpleImputer(strategy="median")
df[numerical_features] = num_imputer.fit_transform(df[numerical_features])
cat_imputer = SimpleImputer(strategy="most_frequent")
df[categorical_features] = cat_imputer.fit_transform(df[categorical_features])


label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

mm_scaler = MinMaxScaler()
df[numerical_features] = mm_scaler.fit_transform(df[numerical_features])

excluded_columns = ["Churn", "ServiceArea", "CustomerID"]
X_all = df.drop(columns=excluded_columns, errors='ignore')
y = df['Churn']
x_all_columns = X_all.columns.tolist()

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_all, y)
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Standard scaling
scaler_xgb = StandardScaler()
X_train_scaled = scaler_xgb.fit_transform(X_train)
X_test_scaled = scaler_xgb.transform(X_test)

# XGBoost Training
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5,
                          subsample=0.8, colsample_bytree=0.8, random_state=42,
                          use_label_encoder=False, eval_metric="logloss")
xgb_model.fit(X_train_scaled, y_train)
y_pred = xgb_model.predict(X_test_scaled)
print("XGB Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))

# KMeans clustering
important_features = [
    'CurrentEquipmentDays', 'RetentionCalls', 'MadeCallToRetentionTeam',
    'Handsets', 'TotalRecurringCharge', 'HandsetModels', 'RetentionOffersAccepted',
    'ReceivedCalls', 'AgeHH1', 'MonthlyMinutes', 'HandsetRefurbished',
    'RespondsToMailOffers', 'HandsetWebCapable', 'OffPeakCallsInOut',
    'PeakCallsInOut', 'CreditRating'
]
X_cluster = df[important_features]
scaler_kmeans = StandardScaler()
X_cluster_scaled = scaler_kmeans.fit_transform(X_cluster)
kmeans_model = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans_model.fit(X_cluster_scaled)

# Save all preprocessing and models
for obj, name in [
    (xgb_model, "xgb_model.pkl"),
    (scaler_xgb, "scaler_xgb.pkl"),
    (kmeans_model, "kmeans_model.pkl"),
    (scaler_kmeans, "scaler_kmeans.pkl"),
    (num_imputer, "num_imputer.pkl"),
    (cat_imputer, "cat_imputer.pkl"),
    (label_encoders, "label_encoders.pkl"),
    (x_all_columns, "x_all_columns.pkl"),
    (important_features, "important_features.pkl"),
    (numerical_features, "numerical_features.pkl"),
    (categorical_features, "categorical_features.pkl")
]:
    with open(name, "wb") as f:
        pickle.dump(obj, f)

print("Models and preprocessing objects have been pickled.")


# Load all the models and preprocessors
def load_assets():
    with open("xgb_model.pkl", "rb") as f: xgb_model = pickle.load(f)
    with open("scaler_xgb.pkl", "rb") as f: scaler_xgb = pickle.load(f)
    with open("kmeans_model.pkl", "rb") as f: kmeans_model = pickle.load(f)
    with open("scaler_kmeans.pkl", "rb") as f: scaler_kmeans = pickle.load(f)
    with open("num_imputer.pkl", "rb") as f: num_imputer = pickle.load(f)
    with open("cat_imputer.pkl", "rb") as f: cat_imputer = pickle.load(f)
    with open("label_encoders.pkl", "rb") as f: label_encoders = pickle.load(f)
    with open("x_all_columns.pkl", "rb") as f: x_all_columns = pickle.load(f)
    with open("important_features.pkl", "rb") as f: important_features = pickle.load(f)
    with open("numerical_features.pkl", "rb") as f: numerical_features = pickle.load(f)
    with open("categorical_features.pkl", "rb") as f: categorical_features = pickle.load(f)

    return {
        'xgb_model': xgb_model,
        'scaler_xgb': scaler_xgb,
        'kmeans_model': kmeans_model,
        'scaler_kmeans': scaler_kmeans,
        'num_imputer': num_imputer,
        'cat_imputer': cat_imputer,
        'label_encoders': label_encoders,
        'x_all_columns': x_all_columns,
        'important_features': important_features,
        'numerical_features': numerical_features,
        'categorical_features': categorical_features
    }

# Function to predict on a single row (pass as numpy array)
def predict_single_row(row_array, column_names, assets=None):
    if assets is None:
        assets = load_assets()

    row_df = pd.DataFrame([row_array], columns=column_names)

    numerical = assets['numerical_features']
    categorical = assets['categorical_features']

    num_features_present = [col for col in numerical if col in row_df.columns]
    if num_features_present:
        row_df[num_features_present] = assets['num_imputer'].transform(row_df[num_features_present])

    cat_features_present = [col for col in categorical if col in row_df.columns]
    if cat_features_present:
        row_df[cat_features_present] = assets['cat_imputer'].transform(row_df[cat_features_present])

        for col in cat_features_present:
            if col in assets['label_encoders']:
                row_df[col] = assets['label_encoders'][col].transform(row_df[col])

    for col in assets['x_all_columns']:
        if col not in row_df.columns:
            row_df[col] = 0

    X_input = row_df[assets['x_all_columns']]
    print("Input Final: ", X_input)
    X_input_scaled = assets['scaler_xgb'].transform(X_input)
    churn_pred = assets['xgb_model'].predict(X_input_scaled)[0]

    for col in assets['important_features']:
        if col not in row_df.columns:
            row_df[col] = 0

    X_cluster_input = row_df[assets['important_features']]
    X_cluster_scaled = assets['scaler_kmeans'].transform(X_cluster_input)
    cluster_pred = assets['kmeans_model'].predict(X_cluster_scaled)[0] + 1  # 1-indexed

    return {"PredictedChurn": churn_pred, "ClusterGroup": cluster_pred}


test_file = os.path.join(path, files[1])
df_test = pd.read_csv(train_file)

random_index = np.random.randint(0, len(df_test))
random_row = df_test.iloc[random_index]
print(f"Testing with row at index {random_index}")

row_array = random_row.values
row_columns = random_row.index.tolist()

assets = load_assets()
prediction = predict_single_row(row_array, row_columns, assets)

print("\nInput row sample (first 5 columns):")
print(random_row.head())
print("\nPrediction result:")
print(f"Predicted Churn: {prediction['PredictedChurn']}")
print(f"Cluster Group: {prediction['ClusterGroup']}")

Parameters: { "use_label_encoder" } are not used.



XGB Accuracy: 79.07%
Models and preprocessing objects have been pickled.
Testing with row at index 3501
Input Final:     MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0           38.55          1050.0                  45.0   

   DirectorAssistedCalls  OverageMinutes  RoamingCalls  PercChangeMinutes  \
0                    0.0             0.0           0.0              -33.0   

   PercChangeRevenues  DroppedCalls  BlockedCalls  ...  \
0                -0.3           2.0           4.3  ...   

   ReferralsMadeBySubscriber  IncomeGroup  OwnsMotorcycle  \
0                        0.0          0.0               0   

   AdjustmentsToCreditRating  HandsetPrice  MadeCallToRetentionTeam  \
0                        0.0            15                        0   

   CreditRating  PrizmCode  Occupation  MaritalStatus  
0             0          2           3              1  

[1 rows x 55 columns]

Input row sample (first 5 columns):
CustomerID              3027326
Churn                

In [3]:
random_index = np.random.randint(0, len(df_test))
random_row = df_test.iloc[random_index]
print(f"Testing with row at index {random_index}")

row_array = random_row.values
row_columns = random_row.index.tolist()
print(row_array)
assets = load_assets()
prediction = predict_single_row(row_array, row_columns, assets)

print("\nInput row sample (first 5 columns):")
print(random_row.head())
print("\nPrediction result:")
print(f"Predicted Churn: {prediction['PredictedChurn']}")
print(f"Cluster Group: {prediction['ClusterGroup']}")

Testing with row at index 37285
[np.int64(3297450) 'No' np.float64(76.63) np.float64(570.0)
 np.float64(55.0) np.float64(8.41) np.float64(50.0) np.float64(5.6)
 np.float64(-216.0) np.float64(-27.7) np.float64(0.0) np.float64(0.0)
 np.float64(0.0) np.float64(0.0) np.float64(0.0) np.float64(0.0)
 np.float64(0.0) np.float64(0.0) np.float64(0.0) np.float64(0.0)
 np.float64(0.0) np.float64(0.0) np.float64(0.0) np.int64(10) np.int64(1)
 np.int64(1) 'BOSBOS781' np.float64(1.0) np.float64(1.0) np.float64(295.0)
 np.float64(44.0) np.float64(20.0) 'Yes' 'No' 'Yes' 'No' 'No' 'Known'
 'Yes' 'Yes' 'No' 'No' 'No' 'Yes' np.int64(0) np.int64(0) 'No' 'No'
 np.int64(0) np.int64(3) 'No' np.int64(0) 'Unknown' 'No' '5-Low'
 'Suburban' 'Other' 'Unknown']
Input Final:     MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0           76.63           570.0                  55.0   

   DirectorAssistedCalls  OverageMinutes  RoamingCalls  PercChangeMinutes  \
0                   8.41            50.0       

In [4]:
random_index = np.random.randint(0, len(df_test))
random_row = df_test.iloc[random_index]
print(f"Testing with row at index {random_index}")

row_array = random_row.values
row_columns = random_row.index.tolist()

assets = load_assets()
prediction = predict_single_row(row_array, row_columns, assets)

print("\nInput row sample (first 5 columns):")
print(random_row.head())
print("\nPrediction result:")
print(f"Predicted Churn: {prediction['PredictedChurn']}")
print(f"Cluster Group: {prediction['ClusterGroup']}")

Testing with row at index 37666
Input Final:     MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0           33.99           244.0                  40.0   

   DirectorAssistedCalls  OverageMinutes  RoamingCalls  PercChangeMinutes  \
0                    0.0             0.0           0.0              -47.0   

   PercChangeRevenues  DroppedCalls  BlockedCalls  ...  \
0                 0.0           9.3           0.0  ...   

   ReferralsMadeBySubscriber  IncomeGroup  OwnsMotorcycle  \
0                        0.0          8.0               0   

   AdjustmentsToCreditRating  HandsetPrice  MadeCallToRetentionTeam  \
0                        0.0            15                        0   

   CreditRating  PrizmCode  Occupation  MaritalStatus  
0             1          3           0              0  

[1 rows x 55 columns]

Input row sample (first 5 columns):
CustomerID              3300334
Churn                        No
MonthlyRevenue            33.99
MonthlyMinutes            244

In [5]:
random_index = np.random.randint(0, len(df_test))
random_row = df_test.iloc[random_index]
print(f"Testing with row at index {random_index}")

row_array = random_row.values
row_columns = random_row.index.tolist()

assets = load_assets()
prediction = predict_single_row(row_array, row_columns, assets)

print("\nInput row sample (first 5 columns):")
print(random_row.head())
print("\nPrediction result:")
print(f"Predicted Churn: {prediction['PredictedChurn']}")
print(f"Cluster Group: {prediction['ClusterGroup']}")

Testing with row at index 47431
Input Final:     MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0          120.41           729.0                  40.0   

   DirectorAssistedCalls  OverageMinutes  RoamingCalls  PercChangeMinutes  \
0                   3.71           184.0           1.3              329.0   

   PercChangeRevenues  DroppedCalls  BlockedCalls  ...  \
0               105.5           4.7           0.0  ...   

   ReferralsMadeBySubscriber  IncomeGroup  OwnsMotorcycle  \
0                        0.0          5.0               0   

   AdjustmentsToCreditRating  HandsetPrice  MadeCallToRetentionTeam  \
0                        0.0             5                        0   

   CreditRating  PrizmCode  Occupation  MaritalStatus  
0             1          0           0              2  

[1 rows x 55 columns]

Input row sample (first 5 columns):
CustomerID              3374110
Churn                        No
MonthlyRevenue           120.41
MonthlyMinutes            729

In [6]:
random_index = np.random.randint(0, len(df_test))
random_row = df_test.iloc[random_index]
print(f"Testing with row at index {random_index}")

row_array = random_row.values
row_columns = random_row.index.tolist()

assets = load_assets()
prediction = predict_single_row(row_array, row_columns, assets)

print("\nInput row sample (first 5 columns):")
print(random_row.head())
print("\nPrediction result:")
print(f"Predicted Churn: {prediction['PredictedChurn']}")
print(f"Cluster Group: {prediction['ClusterGroup']}")

Testing with row at index 30427
Input Final:     MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0           50.24           902.0                  60.0   

   DirectorAssistedCalls  OverageMinutes  RoamingCalls  PercChangeMinutes  \
0                   0.25             0.0           0.0              -24.0   

   PercChangeRevenues  DroppedCalls  BlockedCalls  ...  \
0                -0.2           8.0           1.0  ...   

   ReferralsMadeBySubscriber  IncomeGroup  OwnsMotorcycle  \
0                        0.0          0.0               0   

   AdjustmentsToCreditRating  HandsetPrice  MadeCallToRetentionTeam  \
0                        0.0            15                        0   

   CreditRating  PrizmCode  Occupation  MaritalStatus  
0             2          0           3              1  

[1 rows x 55 columns]

Input row sample (first 5 columns):
CustomerID              3241954
Churn                        No
MonthlyRevenue            50.24
MonthlyMinutes            902

In [7]:
random_index = np.random.randint(0, len(df_test))
random_row = df_test.iloc[random_index]
print(f"Testing with row at index {random_index}")

row_array = random_row.values
row_columns = random_row.index.tolist()

assets = load_assets()
prediction = predict_single_row(row_array, row_columns, assets)

print("\nInput row sample (first 5 columns):")
print(random_row.head())
print("\nPrediction result:")
print(f"Predicted Churn: {prediction['PredictedChurn']}")
print(f"Cluster Group: {prediction['ClusterGroup']}")

Testing with row at index 49821
Input Final:     MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0          102.79           505.0                  68.0   

   DirectorAssistedCalls  OverageMinutes  RoamingCalls  PercChangeMinutes  \
0                   3.22           104.0           0.0              469.0   

   PercChangeRevenues  DroppedCalls  BlockedCalls  ...  \
0               133.7           1.3           6.7  ...   

   ReferralsMadeBySubscriber  IncomeGroup  OwnsMotorcycle  \
0                        0.0          5.0               0   

   AdjustmentsToCreditRating  HandsetPrice  MadeCallToRetentionTeam  \
0                        0.0             8                        0   

   CreditRating  PrizmCode  Occupation  MaritalStatus  
0             4          3           3              0  

[1 rows x 55 columns]

Input row sample (first 5 columns):
CustomerID              3390622
Churn                        No
MonthlyRevenue           102.79
MonthlyMinutes            505

In [8]:
random_index = np.random.randint(0, len(df_test))
random_row = df_test.iloc[random_index]
print(f"Testing with row at index {random_index}")

row_array = random_row.values
row_columns = random_row.index.tolist()

assets = load_assets()
prediction = predict_single_row(row_array, row_columns, assets)

print("\nInput row sample (first 5 columns):")
print(random_row.head())
print("\nPrediction result:")
print(f"Predicted Churn: {prediction['PredictedChurn']}")
print(f"Cluster Group: {prediction['ClusterGroup']}")

Testing with row at index 42091
Input Final:     MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0           59.99           466.0                  70.0   

   DirectorAssistedCalls  OverageMinutes  RoamingCalls  PercChangeMinutes  \
0                    0.0             0.0           0.0               33.0   

   PercChangeRevenues  DroppedCalls  BlockedCalls  ...  \
0                 0.0          16.3           1.0  ...   

   ReferralsMadeBySubscriber  IncomeGroup  OwnsMotorcycle  \
0                        0.0          0.0               0   

   AdjustmentsToCreditRating  HandsetPrice  MadeCallToRetentionTeam  \
0                        0.0            15                        0   

   CreditRating  PrizmCode  Occupation  MaritalStatus  
0             1          0           3              1  

[1 rows x 55 columns]

Input row sample (first 5 columns):
CustomerID              3334934
Churn                        No
MonthlyRevenue            59.99
MonthlyMinutes            466