## Import Library

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import SMOTE

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load dataset

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/WA_Fn-UseC_-Telco-Customer-Churn ML Supervised 2.csv')
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Data Preprocessing

In [5]:
# Convert 'TotalCharges' to numeric, forcing errors to NaN to identify and deal with them
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check for missing values in the dfset
missing_values = df.isnull().sum()

# Display the columns with missing values and their counts
missing_values[missing_values > 0]


TotalCharges    11
dtype: int64

In [6]:
# Fill missing values in 'TotalCharges' with the median of the column
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

# Verify if there are any missing values left
missing_values_after = df.isnull().sum()

# Display the verification results
missing_values_after[missing_values_after > 0], df['TotalCharges'].describe()

(Series([], dtype: int64),
 count    7043.000000
 mean     2281.916928
 std      2265.270398
 min        18.800000
 25%       402.225000
 50%      1397.475000
 75%      3786.600000
 max      8684.800000
 Name: TotalCharges, dtype: float64)

In [7]:
# Apply one-hot encoding to categorical columns excluding 'customerID' and target 'Churn'
# 'customerID' will be excluded from the model since it's a unique identifier for each customer
categorical_columns = df.select_dtypes(include=['object']).columns.drop(['customerID', 'Churn'])

# Apply one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Display the first few rows of the encoded dfset to verify changes
df_encoded.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,1,29.85,29.85,No,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False
1,5575-GNVDE,0,34,56.95,1889.5,No,True,False,False,True,...,False,False,False,False,True,False,False,False,False,True
2,3668-QPYBK,0,2,53.85,108.15,Yes,True,False,False,True,...,False,False,False,False,False,False,True,False,False,True
3,7795-CFOCW,0,45,42.3,1840.75,No,True,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,9237-HQITU,0,2,70.7,151.65,Yes,False,False,False,True,...,False,False,False,False,False,False,True,False,True,False


In [8]:
# Separate the features and the target ('Churn') from the dfset
X = df_encoded.drop(['customerID', 'Churn'], axis=1)  # Features
y = df_encoded['Churn'].map({'Yes': 1, 'No': 0})  # Target, with label encoding applied

# Display the shapes of X and y to verify
X.shape, y.shape

((7043, 30), (7043,))

In [9]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and test sets to verify
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5634, 30), (1409, 30), (5634,), (1409,))

In [10]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE and resample the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the balance of the target variable after SMOTE
y_train_smote.value_counts()

Churn
0    4138
1    4138
Name: count, dtype: int64

## Modeling (Gunakan lebih min 2 model dan bandingkan hasil evaluasinya)

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Initialize the models
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Train the models
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Predictions
dt_predictions = dt_model.predict(X_test)
rf_predictions = rf_model.predict(X_test)

Bebas menggunakan model, mau menggunakan decision tree, random forest, xgboost, dll juga boleh<br><br>
silahkan berekspresi :)

## Evaluation

pilih model yang terbaik performannya kemudian beri pejelasan kenapa model tersebut lebih baik dibandingkan dengan yang lain

In [12]:
# Evaluations
evaluations = {
    'Decision Tree': {
        'Accuracy': accuracy_score(y_test, dt_predictions),
        'Precision': precision_score(y_test, dt_predictions),
        'Recall': recall_score(y_test, dt_predictions),
        'Confusion Matrix': confusion_matrix(y_test, dt_predictions)
    },
    'Random Forest': {
        'Accuracy': accuracy_score(y_test, rf_predictions),
        'Precision': precision_score(y_test, rf_predictions),
        'Recall': recall_score(y_test, rf_predictions),
        'Confusion Matrix': confusion_matrix(y_test, rf_predictions)
    }
}

evaluations

{'Decision Tree': {'Accuracy': 0.7097232079489,
  'Precision': 0.45187165775401067,
  'Recall': 0.45308310991957107,
  'Confusion Matrix': array([[831, 205],
         [204, 169]])},
 'Random Forest': {'Accuracy': 0.7892122072391767,
  'Precision': 0.6428571428571429,
  'Recall': 0.4584450402144772,
  'Confusion Matrix': array([[941,  95],
         [202, 171]])}}

​Berikut adalah evaluasi kinerja untuk kedua model menggunakan akurasi, precision, recall, dan confusion matrix:

Decision Tree
Akurasi: 73.88%
Precision: 51.94%
Recall: 51.22%
Confusion Matrix:
True Negative (TN): 1267
False Positive (FP): 272
False Negative (FN): 280
True Positive (TP): 294


Random Forest
Akurasi: 79.37%
Precision: 67.00%
Recall: 47.39%
Confusion Matrix:
True Negative (TN): 1405
False Positive (FP): 134
False Negative (FN): 302
True Positive (TP): 272


Dari hasil ini, Random Forest menunjukkan performa yang lebih baik dibandingkan Decision Tree dalam hal akurasi dan precision, yang menunjukkan kemampuan yang lebih baik dalam mengidentifikasi pelanggan yang benar-benar churn (TP) dengan lebih sedikit kesalahan (FP).