## LOGISTIC REGRESSION ALGORITHM

In [None]:
import numpy as np
class LogisticRegressionWithMomentum:
    """Binary logistic regression fitted by full-batch gradient descent with momentum.

    The objective is the mean binary cross-entropy plus an L2 penalty on the
    weights (the bias is not penalised).

    Parameters
    ----------
    learning_rate : float
        Step size applied to the gradient in every epoch.
    epochs : int
        Maximum number of passes over the training data.
    reg_lambda : float
        L2 regularisation strength.
    momentum : float
        Fraction of the previous update carried over into the next one.
    tol : float
        Minimum decrease of the loss that counts as an improvement.
    patience : int
        Number of consecutive epochs without improvement after which early
        stopping is considered.
    min_loss_threshold : float
        Early stopping only triggers once the best loss is below this value.
    """

    def __init__(self, learning_rate=0.01, epochs=10000, reg_lambda=0.01, momentum=0.9, tol=1e-4, patience=10, min_loss_threshold=0.1):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.reg_lambda = reg_lambda
        self.momentum = momentum
        self.tol = tol  # Loss-improvement threshold
        self.patience = patience  # Consecutive non-improving epochs before stopping
        self.min_loss_threshold = min_loss_threshold  # Loss must be below this to stop early
        self.weights = None
        self.bias = None
        self.velocity_w = None
        self.velocity_b = None

    def sigmoid(self, z):
        """Return the logistic function of ``z`` without overflowing.

        ``exp`` is only ever evaluated on ``-|z|``, so large negative inputs
        no longer overflow the way ``1 / (1 + exp(-z))`` does.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        # Indexing with () turns a 0-d result back into a scalar and leaves
        # real arrays untouched.
        return out[()]

    def compute_loss(self, y, y_predicted):
        """Return the mean cross-entropy of the predictions plus the L2 penalty.

        ``y`` holds the 0/1 labels and ``y_predicted`` the predicted
        probabilities; ``self.weights`` must already be set.
        """
        y = np.asarray(y, dtype=float).ravel()
        # Clipping keeps both logarithms finite, so no extra epsilon is needed.
        probs = np.clip(np.asarray(y_predicted, dtype=float).ravel(), 1e-15, 1 - 1e-15)
        num_samples = len(y)
        loss = (-1 / num_samples) * np.sum(y * np.log(probs) + (1 - y) * np.log(1 - probs))
        clipped_weights = np.clip(self.weights, -1e5, 1e5)  # Bound the weights so the square cannot blow up
        reg_loss = (self.reg_lambda / (2 * num_samples)) * np.sum(clipped_weights ** 2)
        return loss + reg_loss

    def fit(self, X, y):
        """Fit weights and bias on ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        ``X`` and ``y`` may be any array-like (lists, NumPy arrays, pandas
        objects); ``y`` is flattened, so a column vector is accepted.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``len(y)``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        num_samples, num_features = X.shape
        if num_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != num_samples:
            raise ValueError(f"X has {num_samples} samples but y has {y.shape[0]}")

        self.weights = np.zeros(num_features)
        self.bias = 0.0

        self.velocity_w = np.zeros(num_features)
        self.velocity_b = 0.0

        best_loss = float('inf')
        patience_counter = 0

        for epoch in range(self.epochs):
            y_predicted = self.sigmoid(X @ self.weights + self.bias)
            loss = self.compute_loss(y, y_predicted)

            # Track how many consecutive epochs failed to beat the best loss by `tol`.
            if loss < best_loss - self.tol:
                best_loss = loss
                patience_counter = 0
            else:
                patience_counter += 1
            # Stop only when training has stalled AND the loss is already low enough.
            if patience_counter >= self.patience and best_loss < self.min_loss_threshold:
                print(f"Early stopping at epoch {epoch}: loss = {best_loss}")
                break

            error = y_predicted - y
            dw = (1 / num_samples) * (X.T @ error) + (self.reg_lambda * self.weights) / num_samples
            db = (1 / num_samples) * np.sum(error)

            # Momentum update: blend the previous step with the new gradient step.
            self.velocity_w = self.momentum * self.velocity_w - self.learning_rate * dw
            self.velocity_b = self.momentum * self.velocity_b - self.learning_rate * db
            self.weights += self.velocity_w
            self.bias += self.velocity_b

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        return self.sigmoid(np.dot(np.asarray(X, dtype=float), self.weights) + self.bias)

    def predict(self, X):
        """Return a list of 0/1 labels; a row is labelled 1 when its probability exceeds 0.5."""
        return (self.predict_proba(X) > 0.5).astype(int).tolist()

## Train


In [None]:
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the training set from 'train.csv' in the working directory.
data= pd.read_csv('train.csv')

In [None]:
# Show the loaded frame as the cell output.
data

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Split the column names by dtype: float64/int64 columns vs. object-dtype columns.
num_col = [name for name, kind in data.dtypes.items() if kind in ('float64', 'int64')]
cat_col = [name for name, kind in data.dtypes.items() if kind == 'object']

## Categorical columns

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode every object-dtype column. Each column gets its own fitted
# encoder, kept in `label_encoders`, so the learned mapping can be reused on
# new data or inverted later instead of being overwritten by the next column.
label_encoders = {}
for col in cat_col:
    cat_one = LabelEncoder()
    data[col] = cat_one.fit_transform(data[col])
    label_encoders[col] = cat_one

In [None]:
# Show the frame after label encoding.
data

## Numerical columns

In [None]:
# Plot the pairwise correlation matrix of all columns as an annotated heatmap.
plt.figure(figsize=(25,15))
sns.heatmap(data.corr(),annot=True,fmt=".2f")

In [None]:
# Keep the correlation matrix for the feature selection below.
macor= data.corr()

In [None]:
# Columns whose absolute correlation with the target 'loan_status' is below
# 0.015. Each column is checked exactly once; the condition never depended on
# any second column, so no inner loop over the other columns is needed.
low_corr_columns = [
    col for col in data.columns
    if abs(macor.loc['loan_status', col]) < 0.015
]

In [None]:
# Remove the weakly correlated columns from the training frame in place.
data.drop(columns=low_corr_columns, inplace=True)

In [None]:
# Recompute the dtype-based column lists for the reduced frame.
num_col = [name for name, kind in data.dtypes.items() if kind in ('float64', 'int64')]
cat_col = [name for name, kind in data.dtypes.items() if kind == 'object']
# The target must not be treated as a feature (raises ValueError if it is absent).
num_col.remove('loan_status')

In [None]:
# Draw one boxplot per column. The grid keeps 5 plots per row and at least the
# original 3 rows, and grows by whole rows when there are more than 15
# columns, so indexing `axes` can no longer run out of bounds.
plots_per_row = 5
n_rows = max(3, (len(data.columns) + plots_per_row - 1) // plots_per_row)
fig, axes = plt.subplots(nrows=n_rows, ncols=plots_per_row, figsize=(35, 20 * n_rows / 3))

for i, col in enumerate(data.columns):
    row = i // plots_per_row
    col_idx = i % plots_per_row
    sns.boxplot(y=data[col], ax=axes[row, col_idx])

In [None]:
def delete_outliers(data, column, m=3):
    """Return the (lower, upper) bounds ``mean - m*std`` and ``mean + m*std`` of ``data[column]``."""
    mean = np.mean(data[column])
    std_dev = np.std(data[column])
    lower_bound = mean - m * std_dev
    upper_bound = mean + m * std_dev
    return lower_bound, upper_bound


# Pass 1: IQR fence, one numeric column at a time. Rows outside
# [Q1 - 0.3*IQR, Q3 + 0.3*IQR] are dropped before the next column is examined.
# NOTE(review): 0.3 * IQR is far tighter than the conventional 1.5 * IQR; kept as-is.
for col in num_col:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 0.3 * IQR
    upper_bound = Q3 + 0.3 * IQR

    data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]

# Pass 2: mean +/- m standard deviations, again column by column. This pass now
# runs once after the IQR pass instead of being repeated over all columns
# inside every iteration of the IQR loop.
m = 3
for column in num_col:
    lower_bound, upper_bound = delete_outliers(data, column, m)
    data = data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

# Renumber the surviving rows from 0.
data = data.reset_index(drop=True)
data

In [None]:
# Draw one boxplot per column of the filtered data. The grid keeps 5 plots per
# row and at least the original 3 rows, and grows by whole rows when there are
# more than 15 columns, so indexing `axes` can no longer run out of bounds.
plots_per_row = 5
n_rows = max(3, (len(data.columns) + plots_per_row - 1) // plots_per_row)
fig, axes = plt.subplots(nrows=n_rows, ncols=plots_per_row, figsize=(35, 20 * n_rows / 3))

for i, col in enumerate(data.columns):
    row = i // plots_per_row
    col_idx = i % plots_per_row
    sns.boxplot(y=data[col], ax=axes[row, col_idx])

In [None]:
# Count the rows per 'loan_status' value.
data['loan_status'].value_counts()

In [None]:
# Show the frame that will be used for training.
data

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Separate the features (every column except 'loan_status') from the target.
X=data.drop(columns='loan_status')
Y= data['loan_status']

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Standardize the data: fit the scaler on the training features and transform them.
X = scaler.fit_transform(X)


In [None]:
from imblearn.over_sampling import SMOTE


# Resample the training data with SMOTE (fixed seed for reproducibility);
# X and Y are replaced by the resampled versions.
smote = SMOTE(random_state=42)
X, Y = smote.fit_resample(X, Y)

In [None]:
# Show the scaled, resampled feature matrix.
X

In [None]:
# Count the rows per class after SMOTE resampling.
Y.value_counts()

In [None]:
# Train the custom model on the scaled, resampled data for at most 100000 epochs
# (all other hyper-parameters keep their defaults).
modelCV= LogisticRegressionWithMomentum(epochs=100000)
modelCV.fit(X,Y)

## Test

In [None]:
# Load the held-out test set from 'test.csv' in the working directory.
data2= pd.read_csv('test.csv')

In [None]:
# Split the test-set column names by dtype: float64/int64 columns vs. object-dtype columns.
num_col = [name for name, kind in data2.dtypes.items() if kind in ('float64', 'int64')]
cat_col = [name for name, kind in data2.dtypes.items() if kind == 'object']

In [None]:
# Encode the test categoricals with the mapping of the TRAINING data rather
# than refitting on the test values: refitting assigns codes from whatever
# categories happen to occur in the test set, which can differ from the codes
# the model was trained on. The raw training column is re-read because the
# in-memory training frame has already been overwritten with integer codes.
# A category that never occurs in 'train.csv' now raises ValueError in
# `transform` instead of silently receiving an unrelated code.
cat_one = LabelEncoder()
train_raw = pd.read_csv('train.csv')
for col in cat_col:
    cat_one.fit(train_raw[col])
    data2[col] = cat_one.transform(data2[col])

In [None]:
# Show the test frame after label encoding.
data2

In [None]:
# Drop the same weakly correlated columns that were removed from the training frame.
data2.drop(columns=low_corr_columns, inplace=True)

In [None]:
# Recompute the dtype-based column lists for the reduced test frame.
num_col = [name for name, kind in data2.dtypes.items() if kind in ('float64', 'int64')]
cat_col = [name for name, kind in data2.dtypes.items() if kind == 'object']
# The target must not be treated as a feature (raises ValueError if it is absent).
num_col.remove('loan_status')

In [None]:
# Separate the test features (every column except 'loan_status') from the test target.
X2=data2.drop(columns='loan_status')
Y2= data2['loan_status']

In [None]:
# Show the unscaled test features.
X2

In [None]:
# Scale the test features with the mean/std the scaler learned from the
# training features. Refitting here (fit_transform) would standardize the test
# set with its own statistics, so the model would see inputs on a different
# scale than the one it was trained on.
X2 = scaler.transform(X2)

In [None]:
# Predict 0/1 labels for the scaled test features with the trained model.
predictions = modelCV.predict(X2)


In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# Print the confusion matrix and the per-class report, comparing the true
# test labels with the predictions.
print(confusion_matrix(Y2, predictions))
print(classification_report(Y2,predictions))