In [1]:
#Import the Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
import pickle
from pathlib import Path

In [2]:
# Read the raw customer-churn CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='churn pelanggan.csv')

In [3]:
# Drop the columns that are not used as model inputs; every other column of
# `df` is carried over unchanged into `cleaned_df`.
# NOTE(review): 'Gender' is in this drop list, yet the recorded outputs further
# down still list a Gender column -- the saved outputs may come from an older
# version of this cell; re-run from a fresh kernel to confirm.
cleaned_df = df.drop(
    columns=[
        'Customer_ID',
        'Gender',
        'Age',
        'City',
        'Zip_Code',
        'Latitude',
        'Longitude',
        'Number_of_Referrals',
        'Offer',
        'Avg_Monthly_Long_Distance_Charges',
        'Avg_Monthly_GB_Download',
        'Device_Protection_Plan',
        'Premium_Tech_Support',
        'Total_Refunds',
        'Total_Extra_Data_Charges',
        'Total_Long_Distance_Charges',
        'Monthly_Charge',
        'Total_Charges',
        'Total_Revenue',
        'Churn_Category',
        'Churn_Reason',
    ]
)

In [4]:
# Show the (rows, columns) shape of the frame right after the column drop
cleaned_df.shape

(6589, 18)

In [5]:
# Count the missing (NaN) values in each column before any rows are removed
cleaned_df.isna().sum()

Gender                     0
Married                    0
Number_of_Dependents       0
Tenure_in_Months           0
Phone_Service              0
Multipl_Lines            644
Internet_Service           0
Internet_Type           1344
Online_Security         1344
Online_Backup           1344
Streaming_TV            1344
Streaming_Movies        1344
Streaming_Music         1344
Unlimited_Data          1344
Contract                   0
Paperless_Billing          0
Payment_Method             0
Customer_Status            0
dtype: int64

In [6]:
# Discard every row that has at least one missing value in any column
# (the recorded shapes go from 6589 rows before this step to 4601 after).
cleaned_df = cleaned_df.dropna(axis=0, how='any')

In [7]:
# Persist the NaN-free frame next to the notebook.
# NOTE(review): the row index is written as well (pandas default), so the file
# gets an extra unnamed leading column; pass index=False if consumers of this
# CSV do not need the original row numbers -- confirm before changing.
cleaned_df.to_csv(path_or_buf='churn_pelanggan_bersih.csv')

In [8]:
# Re-count missing (NaN) values per column to verify that dropna() left none
cleaned_df.isna().sum()

Gender                  0
Married                 0
Number_of_Dependents    0
Tenure_in_Months        0
Phone_Service           0
Multipl_Lines           0
Internet_Service        0
Internet_Type           0
Online_Security         0
Online_Backup           0
Streaming_TV            0
Streaming_Movies        0
Streaming_Music         0
Unlimited_Data          0
Contract                0
Paperless_Billing       0
Payment_Method          0
Customer_Status         0
dtype: int64

In [9]:
# Show the (rows, columns) shape again, now that incomplete rows are gone
cleaned_df.shape

(4601, 18)

In [10]:
# Convert the non-numeric columns to integer codes, leaving numeric columns
# untouched.
#
# The previous check, `dtype == np.number`, is not a reliable numeric test: it
# does not match integer dtypes, so integer columns were also passed through
# LabelEncoder and had their real values replaced by rank codes.
# pd.api.types.is_numeric_dtype recognises every numeric dtype.
#
# The fitted encoders are kept per column so the integer codes (in particular
# those of the target column) can be mapped back to the original labels with
# `label_encoders[column].inverse_transform(...)`.
label_encoders = {}
for column in cleaned_df.columns:
    if pd.api.types.is_numeric_dtype(cleaned_df[column]):
        continue
    encoder = LabelEncoder()
    cleaned_df[column] = encoder.fit_transform(cleaned_df[column])
    label_encoders[column] = encoder
print(cleaned_df.describe())

            Gender      Married  Number_of_Dependents  Tenure_in_Months  \
count  4601.000000  4601.000000           4601.000000       4601.000000   
mean      0.499674     0.502499              0.380352         33.631167   
std       0.500054     0.500048              0.878782         24.198487   
min       0.000000     0.000000              0.000000          0.000000   
25%       0.000000     0.000000              0.000000         11.000000   
50%       0.000000     1.000000              0.000000         31.000000   
75%       1.000000     1.000000              0.000000         57.000000   
max       1.000000     1.000000              7.000000         71.000000   

       Phone_Service  Multipl_Lines  Internet_Service  Internet_Type  \
count         4601.0    4601.000000            4601.0    4601.000000   
mean             0.0       0.563573               0.0       1.516627   
std              0.0       0.495996               0.0       0.701393   
min              0.0       0.000000 

In [11]:
# Display the encoded frame (the notebook's rich repr elides the middle rows)
cleaned_df

Unnamed: 0,Gender,Married,Number_of_Dependents,Tenure_in_Months,Phone_Service,Multipl_Lines,Internet_Service,Internet_Type,Online_Security,Online_Backup,Streaming_TV,Streaming_Movies,Streaming_Music,Unlimited_Data,Contract,Paperless_Billing,Payment_Method,Customer_Status
0,0,1,0,8,0,0,0,0,0,1,1,0,0,1,1,1,1,1
1,1,0,0,8,0,1,0,0,0,0,0,1,1,0,0,0,1,1
2,1,0,0,3,0,0,0,2,0,0,0,0,0,1,0,1,0,0
3,1,1,0,12,0,0,0,2,0,1,1,1,0,1,0,1,0,0
4,0,1,0,2,0,0,0,2,0,0,1,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6582,0,0,0,6,0,0,0,2,0,1,1,1,1,1,1,1,1,1
6583,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,1,1,0
6585,0,0,0,12,0,0,0,1,1,0,0,0,1,1,1,0,1,1
6586,1,1,0,21,0,1,0,2,0,0,0,1,1,1,0,1,0,0


In [30]:
# Show the dtype of every column after the encoding step
cleaned_df.dtypes

Gender                  int32
Married                 int32
Number_of_Dependents    int64
Tenure_in_Months        int64
Phone_Service           int32
Multipl_Lines           int32
Internet_Service        int32
Internet_Type           int32
Online_Security         int32
Online_Backup           int32
Streaming_TV            int32
Streaming_Movies        int32
Streaming_Music         int32
Unlimited_Data          int32
Contract                int32
Paperless_Billing       int32
Payment_Method          int32
Customer_Status         int32
dtype: object

In [31]:
# Separate the features from the target, then standardize the features.
x = cleaned_df.drop('Customer_Status', axis=1)  # feature columns
y = cleaned_df['Customer_Status']  # target labels

# Keep the fitted scaler in a variable instead of discarding it: new data has
# to be transformed with the same per-column mean/scale before it is passed to
# a model trained on `x`.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# done later in the notebook, so test-set statistics influence the scaling;
# consider fitting on the training rows only.
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [32]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [33]:
# Build a logistic-regression classifier with default settings and fit it on
# the training split, then print the fitted estimator.
log_model = LogisticRegression()
log_model.fit(x_train, y_train)
print('Model yan terbentuk adalah: \n', log_model)

Model yan terbentuk adalah: 
 LogisticRegression()


In [34]:
# Save the trained model to disk.
# The context manager guarantees the file handle is flushed and closed even if
# pickling fails; the bare open() used before left the handle unclosed.
with open('modelLog_churn.pkl', 'wb') as model_file:
    pickle.dump(log_model, model_file)