In [1]:
import pandas as pd
import numpy as np

In [2]:
filename='customer_churn.csv'
df = pd.read_csv(filename)
df.head(5)
print(df.shape)

(7043, 21)


In [3]:
# Loại bỏ cột "customerID" khỏi dữ liệu
df = df.drop("customerID", axis=1)

In [4]:
name=list(df.columns)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [5]:
types =df.dtypes
print(types)

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object


In [6]:
# Xử lý giá trị thiếu (nếu có)
df.dropna(inplace=True)

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in range(0,len(types)):
    if types[i]=='object':
            le.fit_transform(df[name[i]])
            df[name[i]]=le.transform(df[name[i]])

In [8]:
data= df.values
x= data[:,:-1]
y= data[:,-1]
print(x[0])
print(y[0])

[ 0.    0.    1.    0.    1.    0.    1.    0.    0.    2.    0.    0.
  0.    0.    0.    1.    2.   29.85 29.85]
0.0


In [9]:
from sklearn.preprocessing import MinMaxScaler
scaler =MinMaxScaler()
scaler.fit(x)
X_scaler =scaler.transform(x)
print(X_scaler[0])

[0.         0.         1.         0.         0.         0.
 0.5        0.         0.         1.         0.         0.
 0.         0.         0.         1.         0.66666667 0.11542289
 0.0012751 ]


In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=\
train_test_split(X_scaler,y,test_size=0.2,random_state=42)
print(y_train)

[1. 1. 1. ... 0. 0. 1.]


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

params = {'n_neighbors': [1,3,5,7,9]}
grid_search = GridSearchCV(KNeighborsClassifier(), params, cv=5) 
grid_search.fit(x_train, y_train)

print(grid_search.best_params_)

{'n_neighbors': 9}


In [12]:
# Initialize the models
Logistic_reg = LogisticRegression()
best_n_neighbors= grid_search.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)  # You can adjust the number of neighbors (k) as needed
tree=DecisionTreeClassifier()
gaussian_nb = GaussianNB()

# Train the models
Logistic_reg.fit(x_train, y_train)
knn.fit(x_train, y_train)
tree.fit(x_train, y_train)
gaussian_nb.fit(x_train, y_train)
# Make predictions
y_pred_log = Logistic_reg.predict(x_test)
y_pred_knn = knn.predict(x_test)
y_pred_tree= tree.predict(x_test)
y_pred_gaussian = gaussian_nb.predict(x_test)
print(y_pred_log)
print(y_pred_knn)
print(y_pred_tree)
print(y_pred_gaussian)

[0. 0. 1. ... 0. 0. 0.]
[0. 0. 1. ... 0. 0. 0.]
[0. 0. 1. ... 0. 0. 0.]
[0. 0. 1. ... 0. 0. 0.]


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def Evaluate(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")


In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate Linear Regression
print("LogisticRegression:")
Evaluate(y_test, y_pred_log)
  

# Evaluate K-Nearest Neighbors
print("\nK-Nearest Neighbors:")
Evaluate(y_test, y_pred_knn)

print("\nDecisionTree")
Evaluate(y_test, y_pred_tree)
print("\nGaussian:")
Evaluate(y_test, y_pred_gaussian)


LogisticRegression:
Accuracy: 0.7860696517412935
Precision: 0.622895622895623
Recall: 0.4946524064171123
F1-score: 0.5514157973174367

K-Nearest Neighbors:
Accuracy: 0.7562189054726368
Precision: 0.5434173669467787
Recall: 0.5187165775401069
F1-score: 0.5307797537619698

DecisionTree
Accuracy: 0.7171286425017769
Precision: 0.47029702970297027
Recall: 0.5080213903743316
F1-score: 0.4884318766066838

Gaussian:
Accuracy: 0.7377398720682303
Precision: 0.5046382189239332
Recall: 0.7272727272727273
F1-score: 0.5958378970427163


In [30]:
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(score_func=f_classif, k=16)
x_train_new = selector.fit_transform(x_train, y_train)
x_test_new = selector.transform(x_test)
selected_features = selector.get_support(indices=True)
selected_column_names = df.columns[selected_features]
print("Dữ liệu quan trọng sau khi áp dụng SelectKBest:")
print(selected_column_names)

Dữ liệu quan trọng sau khi áp dụng SelectKBest:
Index(['SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges'],
      dtype='object')


In [31]:
# Train the models
Logistic_reg.fit(x_train_new, y_train)
knn.fit(x_train_new, y_train)
tree.fit(x_train_new, y_train)
gaussian_nb.fit(x_train_new, y_train)
# Make predictions
y_pred_log = Logistic_reg.predict(x_test_new)
y_pred_knn = knn.predict(x_test_new)
y_pred_tree= tree.predict(x_test_new)
y_pred_gaussian = gaussian_nb.predict(x_test_new)


In [32]:
# Evaluate Linear Regression
print("LogisticRegression:")
Evaluate(y_test, y_pred_log)
  

# Evaluate K-Nearest Neighbors
print("\nK-Nearest Neighbors:")
Evaluate(y_test, y_pred_knn)

print("\nDecisionTree")
Evaluate(y_test, y_pred_tree)
print("\nGaussian:")
Evaluate(y_test, y_pred_gaussian)

LogisticRegression:
Accuracy: 0.7917555081734187
Precision: 0.6327868852459017
Recall: 0.516042780748663
F1-score: 0.5684830633284241

K-Nearest Neighbors:
Accuracy: 0.7505330490405118
Precision: 0.5316804407713499
Recall: 0.516042780748663
F1-score: 0.5237449118046132

DecisionTree
Accuracy: 0.7213930348258707
Precision: 0.47738693467336685
Recall: 0.5080213903743316
F1-score: 0.4922279792746114

Gaussian:
Accuracy: 0.7412935323383084
Precision: 0.5093632958801498
Recall: 0.7272727272727273
F1-score: 0.5991189427312774
