In [6]:
from sklearn.datasets import make_classification, make_moons
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport

import plots as pl
import metrics as met


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
def load_dataset(filename, data_columns, target_column):
    df = pd.read_csv(filename)
    X, y = df[data_columns].values, df[target_column].values
    return X, y.reshape(-1, 1)

In [8]:
class CustomRegression:
    def __init__(self, standardize=True, 
                 learning_rate=0.01, 
                 max_iter=1000,
                 tol=1e-4,
                 verbose=False):
        self.standardize = standardize
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose

    def normalize(self, X):
        mean = X.mean()
        std = X.std()
        X_new = (X - mean) / std
        return X_new, mean, std
    
    def linear(self, z):
        return z
    
    def hypothesis(self, X, theta):
        z = np.dot(X, theta)
        return self.linear(z)

    def cost_function(self, X, y, theta):
        m = X.shape[0]
        h = self.hypothesis(X, theta)
        cost = (1 / (2 * m)) * np.sum(np.square(h - y))
        return cost

    def gradient(self, X, y, theta):
        m = X.shape[0]
        grad = 1.0 / m * np.sum((self.hypothesis(X, theta) - y) * X)
        return grad

    def gradient_descent(self, X, y, theta):
        costs = []
        J = self.cost_function(X, y, theta)
        costs.append(J)

        if self.verbose:
            print(f"Iteration 0 Cost: {J}")

        for i in range(1, self.max_iter + 1):
            grad = self.gradient(X, y, theta)
            theta = theta - self.learning_rate * grad
            cost = self.cost_function(X, y, theta)
            
            costs.append(cost)

            if i % 100 == 0 and self.verbose:
                print(f"Iteration {i} Cost: {cost}")

            if np.abs(costs[i] - costs[i - 1]) < self.tol:
                print(f"Converged at iteration {i}")
                break

        return theta, costs
    
    def fit(self, X, y):
        X_new = X.copy()
        if self.standardize:
            X_new, self.mean, self.std = self.normalize(X_new)

        self.theta = np.zeros((X_new.shape[1], 1))
        self.theta, self.costs = self.gradient_descent(X_new, y, self.theta)

    def predict(self, X):
        X_new = X.copy()
        if self.standardize:
            X_new, self.mean, self.std = self.normalize(X_new)

        y_pred = np.where(self.hypothesis(X_new, self.theta) > 0.5, 1, 0)
        return y_pred

    def predict_proba(self, X):
        X_new = X.copy()
        if self.standardize:
            X_new, self.mean, self.std = self.normalize(X_new)

        h = self.hypothesis(X_new, self.theta)
        return h

In [9]:
# Load data
data_columns = ["age", "sex", "bmi", "children", "smoker", "region"]
target_column = "charges"
X, y = load_dataset(r'C:\Users\Артем\vscode_source\MLIntro\LinearRegression\IW1\insurance.csv', data_columns, target_column)

# Map data
X[:, 1] = np.where(X[:, 1] == 'female', 0, 1)
X

array([[19, 0, 27.9, 0, 'yes', 'southwest'],
       [18, 1, 33.77, 1, 'no', 'southeast'],
       [28, 1, 33.0, 3, 'no', 'southeast'],
       ...,
       [18, 0, 36.85, 0, 'no', 'southeast'],
       [21, 0, 25.8, 0, 'no', 'southwest'],
       [61, 0, 29.07, 0, 'yes', 'northwest']], dtype=object)

In [14]:
# EDA
dataframe_X = pd.DataFrame(X)
report = ProfileReport(dataframe_X, title='Data Features', correlations={
            "auto": {"calculate": True},
            "pearson": {"calculate": True},
            "spearman": {"calculate": True},
            "kendall": {"calculate": True},
            "phi_k": {"calculate": True},
            "cramers": {"calculate": True},
        })
report.to_notebook_iframe()

ModuleNotFoundError: No module named 'ipywidgets'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
lr = CustomRegression(standardize=True, learning_rate=0.01, max_iter=1000, tol=1e-4, verbose=True)
lr.fit(X_train, y_train)
y_test_pred = lr.predict(X_test)

accuracy, report, confusion = met.evaluate_classification(y_test, y_test_pred)
print(f"Accuracy: \n{accuracy}")
print(f"Report: \n{report}")
print(f"Confusion: \n{confusion}")
pl.plot_decision_boundary(lr, X_test, y_test)
plt.show()