In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# load the csv data to a pandas dataframe
from google.colab import files
uploaded = files.upload()

df = pd.read_csv("/content/Merged_Churn_Dataset.csv")
user_df = pd.read_csv("/content/testing.csv")

Saving Merged_Churn_Dataset.csv to Merged_Churn_Dataset.csv


In [3]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [4]:
df = df.drop(columns=['churn_score', 'churn_label', 'churn_category',
                     'churn_reason', 'customer_status', 'satisfaction_score', 'cltv'])

high_corr_cols = ['married', 'referred_a_friend', 'total_revenue']
df = df.drop(columns=high_corr_cols)

In [5]:
df['offer'] = df['offer'].fillna('No Offer')
df['internet_type'] = df['internet_type'].fillna('Unknown')

In [6]:
binary_cols = [col for col in df.columns if df[col].dtypes == "O" and df[col].nunique() == 2]

In [7]:
df[binary_cols] = df[binary_cols].replace({"Yes": 1, "No": 0})

  df[binary_cols] = df[binary_cols].replace({"Yes": 1, "No": 0})


In [8]:
numerical_features_list = ["tenure", "monthly_charges", "total_charges"]

# Loại bỏ các cột có 1 giá trị duy nhất
columns_to_drop = []

for col in df.columns:
    if col not in numerical_features_list:
        # Kiểm tra số lượng giá trị duy nhất trong cột
        if df[col].nunique() == 1:
            columns_to_drop.append(col)

# Loại bỏ các cột có 1 giá trị duy nhất
df = df.drop(columns=columns_to_drop)

In [9]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """

    Returns the names of categorical, numeric and categorical but cardinal variables in the data set.
    Note Categorical variables include categorical variables with numeric appearance.

    Parameters
    ------
        dataframe: dataframe
                Variable names of the dataframe to be taken
        cat_th: int, optional
                class threshold for numeric but categorical variables
        car_th: int, optinal
                class threshold for categorical but cardinal variables

    Returns
    ------
        cat_cols: list
                Categorical variable list
        num_cols: list
                Numeric variable list
        cat_but_car: list
                List of cardinal variables with categorical appearance

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))


    Notes
    ------
        cat_cols + num_cols + cat_but_car = total number of variables
        num_but_cat is inside cat_cols.
        The sum of the 3 return lists equals the total number of variables: cat_cols + num_cols + cat_but_car = number of variables

    """

    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]

    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]

    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]

    cat_cols = cat_cols + num_but_cat

    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]

    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')


    return cat_cols, num_cols, cat_but_car, num_but_cat

cat_cols, num_cols, cat_but_car,  num_but_cat = grab_col_names(df)


cat_cols = [col for col in cat_cols if col not in ["churn_value"]]

Observations: 7043
Variables: 39
cat_cols: 22
num_cols: 15
cat_but_car: 2
num_but_cat: 17


In [10]:
# Phân chia thời gian theo năm
bins = [0, 12, 24, 36, 48, 60, 72]
labels = [1, 2, 3, 4, 5, 6]
df["New_tenure_year"] = pd.cut(df["tenure"], bins=bins, labels=labels, right=True)

df["New_contract_type"] = df["contract"].apply(
    lambda x: 2 if x == "Two Year" else (1 if x == "One Year" else 0))

df["New_family_size"] = df["partner"] + df["dependents"] + 1
df['New_total_services'] = (df[['online_security', 'online_backup', 'device_protection', 'premium_tech_support',
                                       'streaming_tv', 'streaming_movies', 'streaming_music']]== 'Yes').sum(axis=1) \
                                        + df["internet_service"] + df["phone_service"]
df["New_flag_auto_payment"] = df["payment_method"].apply(lambda x: 1 if
                                                       x in ["Bank transfer (automatic)","Credit card (automatic)"]
                                                       else 0)
df["New_avg_service_fee"] = df.apply(
    lambda x: x["monthly_charges"] / (x["New_total_services"] if x["New_total_services"] > 0 else 1),
    axis=1
)

df["New_no_protection"] = df.apply(lambda x: 1 if (x["online_backup"] != "Yes") or (x["device_protection"] != "Yes") or (x["premium_tech_support"] != "Yes") else 0, axis=1)


In [11]:
cat_cols, num_cols, cat_but_car,  num_but_cat = grab_col_names(df)

Observations: 7043
Variables: 46
cat_cols: 28
num_cols: 16
cat_but_car: 2
num_but_cat: 23


In [12]:
def label_encoder(dataframe, binary_col):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe

In [13]:
binary_cols2 = [col for col in df.columns if df[col].dtypes == "O" or df[col].dtypes == "int" and df[col].nunique() == 2]
binary_cols2 = [col for col in binary_cols2 if col != 'customer_id']
for col in binary_cols2:
    df = label_encoder(df, col)

In [14]:
cat_cols = [col for col in cat_cols if col not in binary_cols2 and col not in ["churn_value", "New_total_services"]]
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
    return dataframe

df = one_hot_encoder(df, cat_cols, drop_first=True)

In [15]:
df = df.replace([np.inf, -np.inf], 0)
num_cols = [col for col in num_cols if col != 'tenure']

In [16]:
outlier_cols = ['number_of_referrals', 'number_of_dependents', 'avg_monthly_gb_download',
                'total_refunds', 'total_extra_data_charges', 'total_long_distance_charges',
                'total_population']


normal_cols = [col for col in num_cols if col not in outlier_cols]

# Khởi tạo các bộ chuẩn hóa
standard_scaler = StandardScaler()
robust_scaler = RobustScaler()


df[normal_cols] = standard_scaler.fit_transform(df[normal_cols])

df[outlier_cols] = robust_scaler.fit_transform(df[outlier_cols])

scaler = RobustScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [17]:
df

Unnamed: 0,customer_id,gender,age,under_30,senior_citizen,partner,dependents,number_of_dependents,tenure,internet_service,phone_service,multiple_lines,avg_monthly_gb_download,unlimited_data,offer,number_of_referrals,online_security,online_backup,device_protection,premium_tech_support,streaming_tv,streaming_movies,streaming_music,internet_type,contract,paperless_billing,payment_method,monthly_charges,avg_monthly_long_distance_charges,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,churn_value,city,zip_code,total_population,latitude,longitude,New_total_services,New_flag_auto_payment,New_avg_service_fee,New_tenure_year_2,New_tenure_year_3,New_tenure_year_4,New_tenure_year_5,New_tenure_year_6,New_contract_type_1,New_contract_type_2,New_family_size_2,New_family_size_3
0,0002-ORFBO,0,-0.321,0,0,1,0,0.000,9,1,1,0,-0.042,1,0,0.667,0,1,0,1,1,0,0,0,1,1,3,-0.087,0.717,-0.237,0.000,0.000,-0.018,0,346,-0.091,-0.386,-0.330,0.156,1,0,-0.224,False,False,False,False,False,True,False,True,False
1,0003-MKNFE,1,0.000,0,0,0,0,0.000,9,1,1,1,-0.292,0,0,0.000,0,0,0,0,0,1,1,0,0,0,3,-0.192,-0.449,-0.252,38.330,10.000,-0.272,0,368,-0.716,0.407,-0.490,0.364,1,0,-0.359,False,False,False,False,False,False,False,False,False
2,0004-TLHLJ,1,0.143,0,0,0,0,0.000,4,1,1,0,0.542,1,5,0.000,0,0,1,0,0,0,0,2,0,1,2,0.065,0.396,-0.329,0.000,0.000,-0.238,1,222,-0.276,1.318,-0.614,0.438,1,0,-0.028,False,False,False,False,False,False,False,False,False
3,0011-IGKFF,1,1.143,0,1,1,0,0.000,13,1,1,0,-0.542,1,4,0.333,0,1,1,0,1,1,0,2,0,1,2,0.509,0.181,-0.046,0.000,0.000,-0.036,1,587,0.321,0.862,0.434,-0.660,1,0,0.540,True,False,False,False,False,False,False,True,False
4,0013-EXCHZ,0,1.036,0,1,1,0,0.000,3,1,1,0,-0.250,1,0,1.000,0,0,0,1,1,0,0,2,0,1,3,0.249,-0.571,-0.333,0.000,0.000,-0.338,1,139,-0.157,0.749,-0.474,0.135,1,0,0.208,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,9987-LUTYD,0,-0.929,1,0,0,0,0.000,13,1,1,0,1.750,1,4,0.000,1,0,0,1,0,0,1,1,1,0,3,-0.280,0.875,-0.192,0.000,0.000,0.183,0,490,-0.489,0.802,-0.826,0.680,1,0,-0.471,True,False,False,False,False,True,False,False,False
7039,9992-RRAMN,1,-0.214,0,0,1,0,0.000,22,1,1,1,0.000,1,4,0.333,0,0,0,0,0,1,1,2,0,1,2,0.271,-0.246,0.141,0.000,0.000,-0.040,1,827,0.573,-0.030,0.367,-0.356,1,0,0.236,True,False,False,False,False,False,False,True,False
7040,9992-UJOEL,1,-0.857,1,0,0,0,0.000,2,1,1,0,1.417,1,5,0.000,0,1,0,0,0,0,0,1,0,1,3,-0.369,-0.157,-0.384,0.000,0.000,-0.325,0,296,0.593,-0.508,0.696,-1.061,1,0,-0.585,False,False,False,False,False,False,False,False,False
7041,9993-LHIEB,1,-0.893,1,0,1,0,0.000,67,1,1,0,1.708,1,1,1.667,1,0,1,1,0,1,1,0,2,0,3,-0.046,-0.764,0.955,0.000,0.000,-0.231,0,930,-0.447,-0.159,-0.768,0.611,1,0,-0.171,False,False,False,False,True,False,True,True,False


In [18]:
user_df = user_df[['customer_id']]
user_df_merged = pd.merge(user_df, df, on="customer_id", how="left")

In [19]:
user_df_merged

Unnamed: 0,customer_id,gender,age,under_30,senior_citizen,partner,dependents,number_of_dependents,tenure,internet_service,phone_service,multiple_lines,avg_monthly_gb_download,unlimited_data,offer,number_of_referrals,online_security,online_backup,device_protection,premium_tech_support,streaming_tv,streaming_movies,streaming_music,internet_type,contract,paperless_billing,payment_method,monthly_charges,avg_monthly_long_distance_charges,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,churn_value,city,zip_code,total_population,latitude,longitude,New_total_services,New_flag_auto_payment,New_avg_service_fee,New_tenure_year_2,New_tenure_year_3,New_tenure_year_4,New_tenure_year_5,New_tenure_year_6,New_contract_type_1,New_contract_type_2,New_family_size_2,New_family_size_3
0,0111-KLBQG,1,0.893,0,1,1,1,1.0,32,1,1,0,0.25,1,3,3.0,0,1,0,0,1,1,1,2,1,1,3,0.434,0.33,0.433,0.0,0.0,0.552,0,638,-0.391,-0.416,-0.505,0.786,1,0,0.445,False,True,False,False,False,True,False,False,True
1,0112-QAWRZ,1,0.107,0,0,1,1,2.0,16,1,1,1,-0.208,1,4,2.667,0,0,1,0,1,0,0,2,0,1,0,0.376,0.95,0.014,0.0,0.0,0.337,0,727,-0.388,-0.511,-0.691,1.267,1,1,0.371,True,False,False,False,False,False,False,False,True
2,0112-QWPNC,1,0.536,0,0,1,0,0.0,49,1,1,1,-0.292,1,2,0.333,1,0,1,1,1,1,1,0,1,0,2,0.258,0.613,0.787,0.0,0.0,1.371,1,1033,0.014,-0.507,-0.434,0.487,1,0,0.218,False,False,False,True,False,True,False,True,False
3,0114-IGABW,0,-0.214,0,0,1,0,0.0,71,1,0,0,0.5,1,1,0.667,0,1,1,1,1,1,1,0,2,0,0,-0.223,-0.842,0.812,0.0,0.0,-0.358,0,140,-0.028,-0.326,-0.147,-0.376,0,1,0.977,False,False,False,False,True,False,True,True,False
4,0114-PEGZZ,0,-0.25,0,0,0,0,0.0,33,1,1,1,-0.208,1,0,0.0,0,1,1,0,1,1,1,2,0,0,2,0.684,0.986,0.665,0.0,0.0,1.105,0,837,0.795,-0.506,1.131,-0.645,1,0,0.766,False,True,False,False,False,False,False,False,False


In [20]:
!pip install lifelines

Collecting lifelines
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Collecting autograd-gamma>=0.3 (from lifelines)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines)
  Downloading formulaic-1.1.1-py3-none-any.whl.metadata (6.9 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=0.2.2->lifelines)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading lifelines-0.30.0-py3-none-any.whl (349 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.3/349.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading formulaic-1.1.1-py3-none-any.whl (115 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.7/115.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (se

In [21]:
from google.colab import files
uploaded = files.upload()

Saving cox_model.pkl to cox_model.pkl


In [22]:
import joblib

# Tải mô hình Cox đã lưu
cox_model = joblib.load('cox_model.pkl')

# Giả sử user_df_merged là DataFrame chứa dữ liệu của bạn (với các đặc trưng và customer_id)
# user_df_merged = pd.read_csv('your_user_data.csv')

# Kiểm tra các cột trong user_df_merged (đảm bảo nó có đầy đủ các cột cần thiết)
print(user_df_merged.columns)

# Chọn các cột cần thiết để tính toán hazard score từ user_df_merged (đảm bảo rằng các cột này có trong user_df_merged)
cox_columns = ['contract', 'number_of_referrals', 'number_of_dependents', 'monthly_charges', 'New_avg_service_fee',
               'dependents', 'age', 'latitude', 'city', 'internet_type', 'New_family_size_2', 'total_charges',
               'total_population', 'payment_method', 'longitude', 'zip_code', 'New_family_size_3', 'New_contract_type_2',
               'avg_monthly_gb_download', 'senior_citizen']

# Chọn các cột của user_df_merged mà cần thiết cho tính toán hazard score
user_df_for_cox = user_df_merged[cox_columns]

# Tính toán hazard score (linear predictor)
user_df_merged['hazard_score'] = cox_model.predict_partial_hazard(user_df_for_cox)

# Tính toán baseline hazard
baseline_hazard = cox_model.baseline_hazard_

# Định nghĩa hàm để tính cumulative hazard
def get_cumulative_hazard(tenure):
    return baseline_hazard.loc[baseline_hazard.index <= tenure, 'baseline hazard'].sum()

# Hàm tính toán survival probability
def get_survival_probability(tenure, hazard_score, baseline_hazard):
    cumulative_hazard = np.array([get_cumulative_hazard(t + tenure) for t in [3, 6, 12]])  # Các mốc 3, 6, 12 tháng
    survival_probabilities = np.exp(-cumulative_hazard)
    return survival_probabilities

# Tính các đặc trưng survival cho user_df_merged
user_df_merged['baseline_hazard'] = user_df_merged['tenure'].apply(get_cumulative_hazard)

# Xác định hazard group
user_df_merged['hazard_group'] = pd.cut(user_df_merged['hazard_score'],
                                         bins=[-np.inf, user_df_merged['hazard_score'].quantile(0.25),
                                               user_df_merged['hazard_score'].quantile(0.5),
                                               user_df_merged['hazard_score'].quantile(0.75), np.inf],
                                         labels=['Low', 'Medium-Low', 'Medium-High', 'High'])

# Tính survival probabilities cho 3, 6, và 12 tháng
user_df_merged['survival_prob_3m'] = user_df_merged.apply(
    lambda row: get_survival_probability(row['tenure'], row['hazard_score'], baseline_hazard)[0], axis=1)
user_df_merged['survival_prob_6m'] = user_df_merged.apply(
    lambda row: get_survival_probability(row['tenure'], row['hazard_score'], baseline_hazard)[1], axis=1)
user_df_merged['survival_prob_12m'] = user_df_merged.apply(
    lambda row: get_survival_probability(row['tenure'], row['hazard_score'], baseline_hazard)[2], axis=1)

# Kiểm tra kết quả
print(user_df_merged[['customer_id', 'hazard_score', 'hazard_group', 'survival_prob_3m', 'survival_prob_6m', 'survival_prob_12m']].head())

# Lưu kết quả vào tệp CSV (nếu cần)
user_df_merged.to_csv('user_with_survival_features.csv', index=False)

Index(['customer_id', 'gender', 'age', 'under_30', 'senior_citizen', 'partner',
       'dependents', 'number_of_dependents', 'tenure', 'internet_service',
       'phone_service', 'multiple_lines', 'avg_monthly_gb_download',
       'unlimited_data', 'offer', 'number_of_referrals', 'online_security',
       'online_backup', 'device_protection', 'premium_tech_support',
       'streaming_tv', 'streaming_movies', 'streaming_music', 'internet_type',
       'contract', 'paperless_billing', 'payment_method', 'monthly_charges',
       'avg_monthly_long_distance_charges', 'total_charges', 'total_refunds',
       'total_extra_data_charges', 'total_long_distance_charges',
       'churn_value', 'city', 'zip_code', 'total_population', 'latitude',
       'longitude', 'New_total_services', 'New_flag_auto_payment',
       'New_avg_service_fee', 'New_tenure_year_2', 'New_tenure_year_3',
       'New_tenure_year_4', 'New_tenure_year_5', 'New_tenure_year_6',
       'New_contract_type_1', 'New_contract_type

In [23]:
from sklearn.preprocessing import OneHotEncoder, RobustScaler

In [24]:
user_df_merged = pd.get_dummies(user_df_merged, columns=['hazard_group'], drop_first=False)

# Sử dụng RobustScaler cho các cột số liệu
columns_to_scale = ['hazard_score', 'baseline_hazard', 'survival_prob_3m', 'survival_prob_6m', 'survival_prob_12m']

# Khởi tạo RobustScaler
scaler = RobustScaler()

# Chuẩn hóa các cột cho cả train và test
user_df_merged[columns_to_scale] = scaler.fit_transform(user_df_merged[columns_to_scale])
user_df_merged[columns_to_scale] = scaler.transform(user_df_merged[columns_to_scale])

In [25]:
user_df_merged

Unnamed: 0,customer_id,gender,age,under_30,senior_citizen,partner,dependents,number_of_dependents,tenure,internet_service,phone_service,multiple_lines,avg_monthly_gb_download,unlimited_data,offer,number_of_referrals,online_security,online_backup,device_protection,premium_tech_support,streaming_tv,streaming_movies,streaming_music,internet_type,contract,paperless_billing,payment_method,monthly_charges,avg_monthly_long_distance_charges,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,churn_value,city,zip_code,total_population,latitude,longitude,New_total_services,New_flag_auto_payment,New_avg_service_fee,New_tenure_year_2,New_tenure_year_3,New_tenure_year_4,New_tenure_year_5,New_tenure_year_6,New_contract_type_1,New_contract_type_2,New_family_size_2,New_family_size_3,hazard_score,baseline_hazard,survival_prob_3m,survival_prob_6m,survival_prob_12m,hazard_group_Low,hazard_group_Medium-Low,hazard_group_Medium-High,hazard_group_High
0,0111-KLBQG,1,0.893,0,1,1,1,1.0,32,1,1,0,0.25,1,3,3.0,0,1,0,0,1,1,1,2,1,1,3,0.434,0.33,0.433,0.0,0.0,0.552,0,638,-0.391,-0.416,-0.505,0.786,1,0,0.445,False,True,False,False,False,True,False,False,True,-0.293,-0.21,-2.825,-1.95,-1.179,True,False,False,False
1,0112-QAWRZ,1,0.107,0,0,1,1,2.0,16,1,1,1,-0.208,1,4,2.667,0,0,1,0,1,0,0,2,0,1,0,0.376,0.95,0.014,0.0,0.0,0.337,0,727,-0.388,-0.511,-0.691,1.267,1,1,0.371,True,False,False,False,False,False,False,False,True,0.119,-0.56,-2.388,-1.612,-0.869,False,False,True,False
2,0112-QWPNC,1,0.536,0,0,1,0,0.0,49,1,1,1,-0.292,1,2,0.333,1,0,1,1,1,1,1,0,1,0,2,0.258,0.613,0.787,0.0,0.0,1.371,1,1033,0.014,-0.507,-0.434,0.487,1,0,0.218,False,False,False,True,False,True,False,True,False,-0.216,3.053,-5.896,-4.145,-2.651,False,True,False,False
3,0114-IGABW,0,-0.214,0,0,1,0,0.0,71,1,0,0,0.5,1,1,0.667,0,1,1,1,1,1,1,0,2,0,0,-0.223,-0.842,0.812,0.0,0.0,-0.358,0,140,-0.028,-0.326,-0.147,-0.376,0,1,0.977,False,False,False,False,True,False,True,True,False,-0.305,133.683,-11.737,-6.373,-2.987,True,False,False,False
4,0114-PEGZZ,0,-0.25,0,0,0,0,0.0,33,1,1,1,-0.208,1,0,0.0,0,1,1,0,1,1,1,2,0,0,2,0.684,0.986,0.665,0.0,0.0,1.105,0,837,0.795,-0.506,1.131,-0.645,1,0,0.766,False,True,False,False,False,False,False,False,False,1.064,-0.147,-2.883,-1.994,-1.208,False,False,False,True


In [27]:
from google.colab import files
uploaded = files.upload()

Saving kmeans_model.pkl to kmeans_model.pkl


In [28]:
user_df_merged = user_df_merged.drop(columns=['customer_id', 'churn_value'])

In [29]:
user_df_merged

Unnamed: 0,gender,age,under_30,senior_citizen,partner,dependents,number_of_dependents,tenure,internet_service,phone_service,multiple_lines,avg_monthly_gb_download,unlimited_data,offer,number_of_referrals,online_security,online_backup,device_protection,premium_tech_support,streaming_tv,streaming_movies,streaming_music,internet_type,contract,paperless_billing,payment_method,monthly_charges,avg_monthly_long_distance_charges,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,city,zip_code,total_population,latitude,longitude,New_total_services,New_flag_auto_payment,New_avg_service_fee,New_tenure_year_2,New_tenure_year_3,New_tenure_year_4,New_tenure_year_5,New_tenure_year_6,New_contract_type_1,New_contract_type_2,New_family_size_2,New_family_size_3,hazard_score,baseline_hazard,survival_prob_3m,survival_prob_6m,survival_prob_12m,hazard_group_Low,hazard_group_Medium-Low,hazard_group_Medium-High,hazard_group_High
0,1,0.893,0,1,1,1,1.0,32,1,1,0,0.25,1,3,3.0,0,1,0,0,1,1,1,2,1,1,3,0.434,0.33,0.433,0.0,0.0,0.552,638,-0.391,-0.416,-0.505,0.786,1,0,0.445,False,True,False,False,False,True,False,False,True,-0.293,-0.21,-2.825,-1.95,-1.179,True,False,False,False
1,1,0.107,0,0,1,1,2.0,16,1,1,1,-0.208,1,4,2.667,0,0,1,0,1,0,0,2,0,1,0,0.376,0.95,0.014,0.0,0.0,0.337,727,-0.388,-0.511,-0.691,1.267,1,1,0.371,True,False,False,False,False,False,False,False,True,0.119,-0.56,-2.388,-1.612,-0.869,False,False,True,False
2,1,0.536,0,0,1,0,0.0,49,1,1,1,-0.292,1,2,0.333,1,0,1,1,1,1,1,0,1,0,2,0.258,0.613,0.787,0.0,0.0,1.371,1033,0.014,-0.507,-0.434,0.487,1,0,0.218,False,False,False,True,False,True,False,True,False,-0.216,3.053,-5.896,-4.145,-2.651,False,True,False,False
3,0,-0.214,0,0,1,0,0.0,71,1,0,0,0.5,1,1,0.667,0,1,1,1,1,1,1,0,2,0,0,-0.223,-0.842,0.812,0.0,0.0,-0.358,140,-0.028,-0.326,-0.147,-0.376,0,1,0.977,False,False,False,False,True,False,True,True,False,-0.305,133.683,-11.737,-6.373,-2.987,True,False,False,False
4,0,-0.25,0,0,0,0,0.0,33,1,1,1,-0.208,1,0,0.0,0,1,1,0,1,1,1,2,0,0,2,0.684,0.986,0.665,0.0,0.0,1.105,837,0.795,-0.506,1.131,-0.645,1,0,0.766,False,True,False,False,False,False,False,False,False,1.064,-0.147,-2.883,-1.994,-1.208,False,False,False,True


In [30]:
# Load the trained KMeans model
kmeans = joblib.load('kmeans_model.pkl')

user_df_merged['kmeans_cluster'] = kmeans.fit_predict(user_df_merged)


In [31]:
user_df_merged

Unnamed: 0,gender,age,under_30,senior_citizen,partner,dependents,number_of_dependents,tenure,internet_service,phone_service,multiple_lines,avg_monthly_gb_download,unlimited_data,offer,number_of_referrals,online_security,online_backup,device_protection,premium_tech_support,streaming_tv,streaming_movies,streaming_music,internet_type,contract,paperless_billing,payment_method,monthly_charges,avg_monthly_long_distance_charges,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,city,zip_code,total_population,latitude,longitude,New_total_services,New_flag_auto_payment,New_avg_service_fee,New_tenure_year_2,New_tenure_year_3,New_tenure_year_4,New_tenure_year_5,New_tenure_year_6,New_contract_type_1,New_contract_type_2,New_family_size_2,New_family_size_3,hazard_score,baseline_hazard,survival_prob_3m,survival_prob_6m,survival_prob_12m,hazard_group_Low,hazard_group_Medium-Low,hazard_group_Medium-High,hazard_group_High,kmeans_cluster
0,1,0.893,0,1,1,1,1.0,32,1,1,0,0.25,1,3,3.0,0,1,0,0,1,1,1,2,1,1,3,0.434,0.33,0.433,0.0,0.0,0.552,638,-0.391,-0.416,-0.505,0.786,1,0,0.445,False,True,False,False,False,True,False,False,True,-0.293,-0.21,-2.825,-1.95,-1.179,True,False,False,False,0
1,1,0.107,0,0,1,1,2.0,16,1,1,1,-0.208,1,4,2.667,0,0,1,0,1,0,0,2,0,1,0,0.376,0.95,0.014,0.0,0.0,0.337,727,-0.388,-0.511,-0.691,1.267,1,1,0.371,True,False,False,False,False,False,False,False,True,0.119,-0.56,-2.388,-1.612,-0.869,False,False,True,False,0
2,1,0.536,0,0,1,0,0.0,49,1,1,1,-0.292,1,2,0.333,1,0,1,1,1,1,1,0,1,0,2,0.258,0.613,0.787,0.0,0.0,1.371,1033,0.014,-0.507,-0.434,0.487,1,0,0.218,False,False,False,True,False,True,False,True,False,-0.216,3.053,-5.896,-4.145,-2.651,False,True,False,False,2
3,0,-0.214,0,0,1,0,0.0,71,1,0,0,0.5,1,1,0.667,0,1,1,1,1,1,1,0,2,0,0,-0.223,-0.842,0.812,0.0,0.0,-0.358,140,-0.028,-0.326,-0.147,-0.376,0,1,0.977,False,False,False,False,True,False,True,True,False,-0.305,133.683,-11.737,-6.373,-2.987,True,False,False,False,1
4,0,-0.25,0,0,0,0,0.0,33,1,1,1,-0.208,1,0,0.0,0,1,1,0,1,1,1,2,0,0,2,0.684,0.986,0.665,0.0,0.0,1.105,837,0.795,-0.506,1.131,-0.645,1,0,0.766,False,True,False,False,False,False,False,False,False,1.064,-0.147,-2.883,-1.994,-1.208,False,False,False,True,0


In [32]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [33]:
from google.colab import files
uploaded = files.upload()

Saving catboost_model.pkl to catboost_model.pkl


In [34]:
catboost_model = joblib.load('catboost_model.pkl')

In [35]:
# Dự đoán xác suất churn
pred_probs = catboost_model.predict_proba(user_df_merged)

# Hoặc nếu bạn muốn nhãn dự đoán (0 hoặc 1):
pred_labels = catboost_model.predict(user_df_merged)

In [36]:
pred_probs

array([[0.99557181, 0.00442819],
       [0.99252009, 0.00747991],
       [0.62644672, 0.37355328],
       [0.97289246, 0.02710754],
       [0.12686656, 0.87313344]])

In [37]:
pred_labels

array([0, 0, 0, 0, 1])