In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, cross_validate, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import pickle
from imblearn.over_sampling import SMOTE

Đọc dữ liệu và hiển thị thông tin tổng quan

In [None]:
# Load the raw customer table from the CSV file referenced by relative path.
df = pd.read_csv('customer-v2.csv')
# DataFrame.info() writes its summary directly to stdout and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

Hiển thị phân bố nhãn

In [None]:
# Count how many rows carry each distinct value of the THANHLY label column.
label_value, count = np.unique(df['THANHLY'], return_counts=True)
# np.unique returns the label values in sorted order, so the tick names are
# taken from the data itself. Hard-coded names could be attached to the wrong
# bar and would fail outright if the column held more or fewer than two values.
# NOTE(review): map the raw values to "churn"/"non_churn" here once the
# encoding of THANHLY is confirmed.
plt.bar([str(value) for value in label_value], count)
plt.xlabel('THANHLY')
plt.ylabel('count')
plt.show()

Xóa các dòng chứa dữ liệu không hợp lệ (GIADICHVU không lớn hơn 0)

In [None]:
# Keep only rows whose GIADICHVU is strictly positive. Rows where the value is
# zero, negative, or missing fail the comparison and are dropped.
df = df[df['GIADICHVU'] > 0]
# info() prints by itself and returns None; print() around it would add a
# spurious "None" line.
df.info()

Tách đặc trưng (X) và nhãn (y)

In [None]:
# Features are every column except the last one; the label is the last column.
# .copy() makes X an independent frame. Without it X is a column slice of df,
# and the per-column assignments performed on X afterwards raise pandas'
# SettingWithCopyWarning.
X = df.iloc[:, :-1].copy()
y = df.iloc[:, -1]

print(X.shape)
print(y.shape)

Chuẩn hóa dữ liệu

In [None]:
# Min-max scale every numeric feature column.
# The previous dtype check matched only 'int64', so float columns (and other
# integer widths) were left on their raw scale, which lets them dominate
# distance-based models such as k-nearest neighbours. It also refit the single
# scaler once per column, leaving it fitted on the last column only.
# Fitting once on all numeric columns keeps `scaler` reusable for new rows.
# NOTE(review): the scaler is fitted on the full dataset before
# cross-validation, so fold scores see the test-fold min/max.
scaler = MinMaxScaler()
numeric_features = X.select_dtypes(include="number").columns
if len(numeric_features) > 0:
    # Cast first so integer columns can hold the fractional scaled values
    # without a dtype-incompatibility warning.
    X = X.astype({name: "float64" for name in numeric_features})
    X[numeric_features] = scaler.fit_transform(X[numeric_features])

Mã hóa tập X với One-hot

In [None]:
# One-hot encode every object-typed column of X.
# Each such column is replaced by indicator columns named "<column>_<value>",
# appended after the remaining columns in the original column order.
object_features = [name for name in X.columns if X[name].dtype == 'object']
if object_features:
    X = pd.get_dummies(X, columns=object_features)

Cân bằng các lớp bằng SMOTE (sinh thêm mẫu tổng hợp cho lớp thiểu số)

In [None]:
# Oversample with SMOTE so the label classes are balanced.
# random_state is pinned to the same seed used by the models and the KFold
# splitter in this notebook; an unseeded SMOTE produces different synthetic
# rows, and therefore different scores, on every run.
# NOTE(review): resampling before cross-validation lets synthetic points
# derived from test-fold rows reach the training folds, which inflates the
# reported scores. Consider resampling inside each fold with an
# imblearn Pipeline.
smt = SMOTE(random_state=1)
X, y = smt.fit_resample(X, y)

In [None]:
# Count rows per label value after resampling to confirm the class balance.
label_value, count = np.unique(y, return_counts=True)
# Tick names come from the sorted label values returned by np.unique, so each
# bar is guaranteed to be labelled with the value it actually counts.
# NOTE(review): map the raw values to "churn"/"non_churn" here once the
# encoding of the label column is confirmed.
plt.bar([str(value) for value in label_value], count)
plt.xlabel('label')
plt.ylabel('count')
plt.show()

In [None]:
# Report the dimensions of the feature matrix, then of the label vector.
for dataset in (X, y):
    print(dataset.shape)

Xây dựng model và tính các chỉ số đánh giá

In [None]:
# Candidate classifiers keyed by display name. Only the one selected through
# `model_name` below is fitted and evaluated in this cell.
classifier_algorithms = {
    "K Nearest Neighbors": KNeighborsClassifier(n_neighbors=7),
    "Decision Tree": DecisionTreeClassifier(max_depth=4, random_state=1),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=1, max_depth=4),
    "Logistic Regression": LogisticRegression(random_state=1),
}
# Change this single name to evaluate a different entry of the dictionary.
model_name = "K Nearest Neighbors"
model = classifier_algorithms[model_name]

# Convert once and reuse for both the full fit and the cross-validation.
features = np.array(X)

# Fit on the complete dataset. cross_validate clones the estimator for each
# fold, so this fitted instance does not influence the scores below.
model.fit(features, y)

# 10-fold shuffled cross-validation with a fixed seed for reproducible splits.
scoring = ("accuracy", "f1", "recall", "precision")
k_fold = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_validate(model, features, y, scoring=scoring, cv=k_fold, return_indices=True)

# Report the mean of every metric that was computed. Recall and precision
# were previously calculated but never shown.
for metric in scoring:
    print(f'test_{metric}', scores[f'test_{metric}'].mean())
# Total time spent fitting and scoring across all folds.
print('run_time', scores['fit_time'].sum() + scores['score_time'].sum())

In [None]:
# Per-fold accuracy and F1 from the cross-validation results in `scores`.
# Fold numbers are derived from the number of scores actually returned, so the
# plot stays valid if the number of CV splits is changed. A hard-coded 1..10
# range would make scatter() fail on mismatched x/y lengths.
folds = list(range(1, len(scores['test_accuracy']) + 1))
plt.figure(figsize=(10, 5))
plt.title("K Nearest Neighbors")
plt.scatter(x=folds, y=scores['test_accuracy'], label='Accuracy')
plt.scatter(x=folds, y=scores['test_f1'], label='F1')
plt.xlabel("Fold")
plt.ylabel("Score")
# One tick per fold so each point can be read against its fold number.
plt.xticks(folds)
plt.legend()
plt.show()