Project in which I need to build a linear regression model for a simple dataset.

#  Загрузка нужных библиотек

In [None]:
import numpy as np
import pandas as pd

import plotly
import plotly.graph_objs as go
from plotly.subplots import make_subplots

from tqdm import tqdm_notebook as tqdm

# Loading data

In [None]:
path = '/content/drive/My Drive/Colab Notebooks/42 school/ft_linear_regression /data.csv'
data_v = pd.read_csv(path)

print("Data shape row_count = {} columns_count = {}".format(data_v.shape[0], data_v.shape[1]))
data_v.head()

Data shape row_count = 24 columns_count = 2


Unnamed: 0,km,price
0,240000,3650
1,139800,3800
2,150500,4400
3,185530,4450
4,176000,5250


# Data visualization

This dataset contains information about cars, their cost and the the number of kilometers they've made.

We can see just by looking at the data that their is a **linear relationship**, when the distance made diminish, the price rize.

We can try to trace an approximate line which goes *through* the data.

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=data_v['km'], y=data_v['price'], mode='markers'))
fig.update_traces(hoverinfo="all", 
                  hovertemplate="Km: %{x}<br>Price: %{y}")
fig.update_layout(legend_orientation="h", 
                  legend=dict(x=.5, xanchor="center"),
                  margin=dict(l=0, r=0, t=0, b=0))
fig.show()

# ft_linear_regression

We can use this hypothetical line to predict new data, for example predict the price of a car knowing only it's distance traveled.

The equation of a line is in the form:

$$
\boxed {
    price(mileage) = \theta_1 * mileage + \theta_0
}
$$

Where $\theta_1$ is the slope of the line and $\theta_0$ the y-intercept. If we managed to tweak those values correctly we can find the best fitting line for our data (or at least very close to it).


In [None]:
import copy

class ft_linear_regression:
    def __init__(self, learning_rate, flags, print_error=0, max_iter=100000):
        self.learning_rate = learning_rate
        self.target_column = flags["target"]
        self.parametr_column = flags["parametr"]
        self.print_error = print_error
        self.max_iter = max_iter 

        self.theta_0 = 0.0
        self.theta_1 = 0.0
        self.prev_mse = 0.0

    def estimatePrice(self, mileage) :
        return ((self.theta_0 + (self.theta_1 * float(mileage))))

    def mean_square_error(self) :
        tmp_summ = 0.0

        for i in range(self.inner_data.shape[0]):
            tmp_summ += (self.estimatePrice(self.inner_data.loc[i, self.parametr_column]) - float(self.inner_data.loc[i, self.target_column]))**2
        return (tmp_summ / self.inner_data.shape[0])
    
    def get_gradient0(self, X_batch, y_batch) :
        tmp_summ = 0.0

        for i in range(len(X_batch)):
            tmp_summ += (self.estimatePrice(X_batch[i]) - float(y_batch[i]))
        return (tmp_summ / len(X_batch))
    
    def get_gradient1(self, X_batch, y_batch) :
        tmp_summ = 0.0

        for i in range(len(X_batch)):
            tmp_summ += (self.estimatePrice(X_batch[i]) - float(y_batch[i])) * X_batch[i]
        return (tmp_summ / len(X_batch))

    def standardize(self) :
        self.min_parametr = self.inner_data[self.parametr_column].min()
        self.max_parametr = self.inner_data[self.parametr_column].max()
        self.min_target = self.inner_data[self.target_column].min()
        self.max_target = self.inner_data[self.target_column].max()

        for i in range(self.inner_data.shape[0]):
            self.inner_data.loc[i, self.parametr_column] = (self.inner_data.loc[i, self.parametr_column] - self.min_parametr) / (self.max_parametr - self.min_parametr)
            self.inner_data.loc[i, self.target_column] = (self.inner_data.loc[i, self.target_column] - self.min_target) / (self.max_target - self.min_target)

    def generate_batches(self, X, y, batch_size):
        assert len(X) == len(y)
        np.random.seed(42)
        X = np.array(X)
        y = np.array(y)
        perm = np.random.permutation(len(X))

        X_inner, y_inner = X[perm], y[perm]
        for i in range(X_inner.shape[0] // batch_size):
            yield tuple((X_inner[i * batch_size : (i + 1) * batch_size], y_inner[i * batch_size : (i + 1) * batch_size]))


    def fit(self, data, epochs=1000, tet=1e-10) :
        self.inner_data = copy.deepcopy(data)
        self.standardize()
        delta_mse = self.mean_square_error()
        cur_mse = delta_mse
        self.history = []

        for _ in tqdm(range(epochs)):
            for X_batch, y_batch in self.generate_batches(X=self.inner_data[self.parametr_column], 
                                                           y=self.inner_data[self.target_column], 
                                                           batch_size=2):
            
                self.theta_0 -= self.learning_rate * self.get_gradient0(X_batch, y_batch)
                self.theta_1 -= self.learning_rate * self.get_gradient1(X_batch, y_batch)
                self.history.append(self.mean_square_error())

                if self.print_error:
                    print(cur_mse)
          
            prev_mse = copy.deepcopy(cur_mse)
            cur_mse = self.mean_square_error()
            if cur_mse < tet:
                break

        self.theta_1 = (data[self.target_column].max() - data[self.target_column].min()) * self.theta_1 / \
          (data[self.parametr_column].max() - data[self.parametr_column].min())
        
        self.theta_0 = data[self.target_column].min() + \
          ((data[self.target_column].max() - data[self.target_column].min()) * self.theta_0) + \
          self.theta_1 * (1 - data[self.parametr_column].min())

In [2]:
a = input()
print(a)

12
12


In [None]:
lin_reg_model = ft_linear_regression(learning_rate=0.1, max_iter = 2000, flags={"target": 'price', "parametr": 'km'})

ind = 15
print("Price befor Train = {} Real price = {}".format(lin_reg_model.estimatePrice(data_v.loc[ind, 'km']), data_v.loc[ind, 'price']))

lin_reg_model.fit(data_v, tet=0.0001)
print("Price after Train = {} Real price = {}".format(lin_reg_model.estimatePrice(data_v.loc[ind, 'km']), data_v.loc[ind, 'price']))
print()
print("theta_0 = {}\ntheta_1 = {}".format(lin_reg_model.theta_0, lin_reg_model.theta_1))

Price befor Train = 0.0 Real price = 6900



This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


Price after Train = 6945.97722540742 Real price = 6900

theta_0 = 8567.410292033406
theta_1 = -0.021327629945754507


In [None]:
test_mileage = np.linspace(data_v['km'].min(), data_v['km'].max(), 100)
test_price = [lin_reg_model.estimatePrice(mel) for mel in test_mileage]

fig = make_subplots(rows=1, cols=2)

fig.add_trace(go.Scatter(x=data_v['km'], y=data_v['price'], mode='markers', name = 'Real Data'), 1, 1)
fig.add_trace(go.Scatter(x=test_mileage, y=test_price, mode='lines', name = 'My Linear Regression resulte'), 1, 1)

fig.add_trace(go.Scatter(x=list(range(len(lin_reg_model.history))), y=lin_reg_model.history, mode='lines', hovertemplate="Iteration: %{x}<br>MSE: %{y}", name='Error'), 1 ,2)

fig.update_layout(legend_orientation="h",
                  legend=dict(x=.5, xanchor="center"),
                  hovermode="x",
                  margin=dict(l=0, r=0, t=0, b=0))
fig.show()