In [1]:
import pandas as pd
import numpy as np
 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
 
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
# Data preparation: read the CSV file, make the column names more homogeneous, and handle the categorical and numerical values.

!pip install kagglehub



In [3]:
import kagglehub
path = kagglehub.dataset_download("blastchar/telco-customer-churn")
print("Path to dataset files:", path)

Path to dataset files: /home/codespace/.cache/kagglehub/datasets/blastchar/telco-customer-churn/versions/1


In [5]:
import os

# Build the CSV path from the directory returned by kagglehub (`path`)
# instead of a hard-coded, machine-specific cache location.
csv_file = os.path.join(path, "WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# Data preparation: normalise column names and string values, make
# `totalcharges` numeric, and encode the target `churn` as 0/1.
# The cell is written so that running it more than once gives the same result.

df.columns = df.columns.str.lower().str.replace(' ', '_')

# Every object-typed column gets the same lower-case / underscore treatment.
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

# Values that cannot be parsed as numbers become NaN and are then set to 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Encode the target only while it is still textual. Without this guard a
# second run would compare the already-encoded integers to 'yes' and
# silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

In [7]:
# Data splitting. We use the train_test_split function to divide the dataset in full_train and test data.

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)


In [8]:
# Feature columns used by train() and predict(); both lists are concatenated
# there, so the names must match the normalised DataFrame column names.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [9]:
# Train function. It takes the training dataframe, the target values y_train, and C, which is a
# LogisticRegression parameter for our model.
# The first step here is to create dictionaries from the categorical and numerical columns.
# Next we create a DictVectorizer instance and call its fit_transform function on the dictionaries,
# which gives us X_train. Then we create our model, a logistic regression, and train it (fit function) on
# the training data (X_train and y_train). To apply the model later we need to return the DictVectorizer as well as the model.

In [10]:
def train(df_train, y_train, C=1.0, max_iter=1000):
    """Fit a DictVectorizer and a logistic regression on the training data.

    Parameters
    ----------
    df_train : DataFrame
        Must contain the columns listed in the module-level `categorical`
        and `numerical` lists.
    y_train : array-like
        Target values, one per row of `df_train`.
    C : float, default 1.0
        Passed to ``LogisticRegression`` as ``C``.
    max_iter : int, default 1000
        Passed to ``LogisticRegression`` as ``max_iter``. The default keeps
        the previous hard-coded value; raise it if the solver reports that
        the iteration limit was reached.

    Returns
    -------
    (dv, model)
        The fitted DictVectorizer and the fitted LogisticRegression. Both are
        needed later to turn new rows into features and score them.
    """
    dicts = df_train[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(C=C, max_iter=max_iter)
    model.fit(X_train, y_train)

    return dv, model

In [11]:
# Predict function. Besides the DictVectorizer and the model, it takes the dataframe we want predictions for.
# The first step is the same as in the training function: we need to get the dictionaries.
# These are transformed by the DictVectorizer, which gives us the X we need to make predictions on.
# What we return here is the predicted probability of churning.

In [12]:
def predict(df, dv, model):
    """Return the model's probability for the second class for each row of `df`.

    `df` must contain the columns listed in the module-level `categorical`
    and `numerical` lists. `dv` is the already-fitted DictVectorizer and
    `model` the already-fitted classifier returned by `train`.
    """
    records = df[categorical + numerical].to_dict(orient='records')
    features = dv.transform(records)

    # predict_proba yields one column per class; keep only the second one.
    return model.predict_proba(features)[:, 1]

In [13]:
#  Setup two parameters. The first one is the C value for the Logistic Regression model, and the ‘n_splits’ parameter tells us how many splits 
# we’re going to use in K-Fold cross-validation. Here, we’re using 5 splits.

In [14]:
C = 1.0
n_splits = 5

In [15]:
# Implement K-Fold cross validation, where we use the parameters from the last code. The for loop loops over all folds and does a training for each. 
# After that we calculate the roc_auc_score and collect the values for each fold. 
# At the end the mean score and the standard deviation for all folds are printed.

In [16]:
# Shuffle with a fixed seed so the folds are the same on every run.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

# One AUC value per fold.
scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    # The split yields positional indices, hence .iloc.
    df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]
    y_train, y_val = df_train.churn.values, df_val.churn.values

    # Fit on the training part of the fold, score the held-out part.
    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

# Mean and standard deviation of the AUC across the folds.
print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

C=1.0 0.842 +- 0.007


In [17]:
scores

[0.844596557056621,
 0.845183998808105,
 0.83332129239414,
 0.8347609036260153,
 0.8517147249850961]

In [18]:
# Last step is to train the final model based on the full_train data. The steps here are similar to the steps mentioned before. 
# First is model training, then predicting the test data, and lastly calculate the roc_auc_score. 
# We see a value of 85.8%, which is a bit higher than the average of the k-folds (84.2%), but the difference is not big.

In [19]:
# Train on all of df_full_train with the same C that was cross-validated
# (and that is later used to build the model file name), so the saved file
# name always matches the model it contains. Then score the held-out test set.
dv, model = train(df_full_train, df_full_train.churn.values, C=C)
y_pred = predict(df_test, dv, model)
y_test = df_test.churn.values

auc = roc_auc_score(y_test, y_pred)
auc

0.8583463334308341

In [20]:
# Saving the model to pickle

In [21]:
import pickle

In [None]:
# We need to name our model file before we can write it to a file. The following code demonstrates two ways of naming the file.

In [22]:
output_file = 'model_C=%s.bin' % C
output_file
# Output: 'model_C=1.0.bin'
 
output_file = f'model_C={C}.bin'
output_file
# Output: 'model_C=1.0.bin'

'model_C=1.0.bin'

In [23]:
# Now we want to create a file with that file name. ‘wb’ means Write Binary. 
# We need to save DictVectorizer and the model as well, because with just the model we’ll not be able to translate a customer into a feature matrix.
# Closing the file is crucial. Otherwise, we cannot be certain whether this file truly contains the content.

In [24]:
# Open the target file for writing in binary mode.
f_out = open(output_file, 'wb')
 
# Store the vectorizer and the model together as a single tuple.
pickle.dump((dv, model), f_out)
 
# Explicit close; note that this line is never reached if pickle.dump raises.
f_out.close()

In [26]:
# To avoid accidentally forgetting to close the file, we can use the ‘with’ statement, which ensures that the file is closed automatically. 
# Everything we do inside the ‘with’ statement keeps the file open. However, once we exit this statement, the file is automatically closed.

In [27]:
# The file is closed automatically as soon as the `with` block is left.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)
    # do stuff (the file is still open here)

# do other stuff (the file is already closed here)

In [28]:
# LOAD THE MODEL

In [1]:
# Restart the kernel to start the process again without the previous variables.

In [3]:
import pickle

In [4]:
model_file = 'model_C=1.0.bin'

In [5]:
# 'rb' means open the file for reading in binary mode; if we left 'wb' here it would overwrite the file.
# NOTE: pickle.load can execute arbitrary code, so only load model files that come from a trusted source.
with open(model_file, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [6]:
dv, model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=1000))

In [7]:
# After loading the model, let’s use it to score one sample customer.

In [8]:
customer = {
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'yes',
    'dependents': 'no',
    'phoneservice': 'no',
    'multiplelines': 'no_phone_service',
    'internetservice': 'dsl',
    'onlinesecurity': 'no',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 29.85,
    'totalcharges': 29.85
}

In [9]:
# Before we can apply the predict function to this customer we need to turn it into a feature matrix. 
# The DictVectorizer expects a list of dictionaries, that’s why we create a list with one customer.

In [10]:
X = dv.transform([customer])
X

array([[ 1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,
         0.  ,  1.  ,  0.  ,  0.  , 29.85,  0.  ,  1.  ,  0.  ,  0.  ,
         0.  ,  1.  ,  1.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,
         0.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,
         0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  , 29.85]])

In [11]:
# We use the model's predict_proba function to get the probability that this particular customer is going to churn.
# It returns one row per customer with two columns. We’re interested in the second element, so we need to set row=0 and column=1.
model.predict_proba(X)

array([[0.37250474, 0.62749526]])

In [12]:
model.predict_proba(X)[0,1]

np.float64(0.6274952584954934)

In [14]:
# Download this file
# Turning our notebook into a Python script
# We can turn the Jupyter Notebook code into a Python file. One easy way of doing this is to click on “File” -> “Download as” and then “Python (.py)”.