## Linear regression without sklearn

In [2]:
import csv
import numpy as np

X = []
y = []
with open('insurance.csv', 'r') as file:
    next(file)
    for line in file:
        values = line.strip().split(',')
        age = float(values[0])
        bmi = float(values[2])
        children = float(values[3])
        smoker = 1.0 if values[4] == 'yes' else 0.0
        region = float({'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3}.get(values[5], 0))
        charges = float(values[6])
        X.append([age, bmi, children, smoker, region])
        y.append(charges)

In [3]:
X = np.array(X)
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0)
X_normalized = (X - X_mean) / X_std

In [4]:
correlations = []
for j in range(X_normalized.shape[1]):
    x_j = X_normalized[:, j]
    correlation = np.corrcoef(x_j, y)[0, 1]
    correlations.append(correlation)

In [5]:
print("Correlations between features and target (charges):")
for j, correlation in enumerate(correlations):
    print(f"Correlation_{j}: {correlation:.5f}")

Correlations between features and target (charges):
Correlation_0: 0.29901
Correlation_1: 0.19834
Correlation_2: 0.06800
Correlation_3: 0.78725
Correlation_4: 0.00621


In [6]:
correlation_threshold = 0.05
selected_features = []

for feature in range(len(correlations)):
    correlation = correlations[feature]
    if abs(correlation) >= correlation_threshold:
        selected_features.append(feature) 

X_selected = []
for row in X_normalized:
    selected_row = [row[i] for i in selected_features]
    X_selected.append(selected_row)

X_selected = np.array(X_selected)

In [7]:
learning_rate = 0.05
num_epochs = 1000

num_features = len(selected_features)
coefficients1 = [0.0] * num_features

for epoch in range(num_epochs):
    error = np.dot(X_selected, coefficients1) - y
    gradients = np.dot(X_selected.T, error) / len(X_selected)
    coefficients1 -= learning_rate * gradients
intercept1 = np.mean(y) - np.sum([coefficients1[j] * np.mean(X_selected[:, j]) for j in range(num_features)])

In [8]:
print(f"Intercept: {intercept1:.5f}")
print("Coefficients without manual libraries after correlation:")
for j, coef1 in enumerate(coefficients1):
    print(f"Coef_{j}: {coef1:.5f}")

Intercept: 13270.42227
Coefficients without manual libraries after correlation:
Coef_0: 3621.42131
Coef_1: 1961.97642
Coef_2: 570.59026
Coef_3: 9608.92735


## Linear regression with sklearn

In [10]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_normalized, y)

intercept2 = model.intercept_
coefficients2 = model.coef_

print("Coefficients using scikit-learn:")
print(f"Intercept: {intercept2:.5f}")
for j, coef2 in enumerate(coefficients2):
    print(f"Coef_{j}: {coef2:.5f}")

Coefficients using scikit-learn:
Intercept: 13270.42227
Coef_0: 3615.17881
Coef_1: 2024.09717
Coef_2: 576.54094
Coef_3: 9607.64125
Coef_4: 390.37465


## Compare intercept и coefficients

In [12]:
from tabulate import tabulate

intercept1 = 13270.42227
coef1 = [3621.11697, 1962.12906, 570.68414, 9608.45778]

intercept2 = 13270.42227
coef2 = [3615.17881, 2024.09717, 576.54094, 9607.64125, 390.37465]



coef_table = [
    ["Intercept", intercept1, intercept2],
    ["Coef_0", coef1[0], coef2[0]],
    ["Coef_1", coef1[1], coef2[1]],
    ["Coef_2", coef1[2], coef2[2]],
    ["Coef_3", coef1[3], coef2[3]],
]

headers = ["", "Manual", "Scikit-learn"]

print(tabulate(coef_table, headers=headers, floatfmt=".5f"))

                Manual    Scikit-learn
---------  -----------  --------------
Intercept  13270.42227     13270.42227
Coef_0      3621.11697      3615.17881
Coef_1      1962.12906      2024.09717
Coef_2       570.68414       576.54094
Coef_3      9608.45778      9607.64125
