In [1]:
import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_csv("/content/sample_data/california_housing_train.csv")
df_test = pd.read_csv("/content/sample_data/california_housing_test.csv")

In [3]:
df_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17000 non-null  float64
 1   latitude            17000 non-null  float64
 2   housing_median_age  17000 non-null  float64
 3   total_rooms         17000 non-null  float64
 4   total_bedrooms      17000 non-null  float64
 5   population          17000 non-null  float64
 6   households          17000 non-null  float64
 7   median_income       17000 non-null  float64
 8   median_house_value  17000 non-null  float64
dtypes: float64(9)
memory usage: 1.2 MB


In [5]:
X_train = df_train.drop("median_house_value", axis = 1)
y_train = df_train["median_house_value"]

In [6]:
assert (df_train.columns == df_test.columns).all()

In [7]:
X_test = df_test.drop("median_house_value", axis = 1)
y_test = df_test["median_house_value"]

## 1) Linear Regression

Il faudrait ajouter une couche pour gérer directement la transition de dataframe au numpy.

In [46]:
class LinearRegression:


    def __init__(self, resolution:int = -1):
        self.methode_resolution = resolution
        self.weights = None
        self.bias = 0
        self.X_train = None
        self.y_train = None

    def fit(self,X_train,y_train, learning_rate:float = 0.1, epochs:int = 10000, sample:int  = - 1):
        n_samples, n_features = X_train.shape
        self.X_train = X_train
        self.y_train = y_train
        self.weights = np.random.randn(n_features,1) *0.1
        self.bias = np.random.randint()* 0.01
        result = X_train.T@X_train
        if self.methode_resolution == -1:
                self.weights = np.linalg.pinv(result)@np.transpose(X_train)@y_train
        else:
            seed = np.random.seed(42)
            for _ in range(epochs):
                if sample == -1:
                    self.backward_propagation(learning_rate, sample = X_train.shape[0])
        return self.weights, self.bias



    def predict(self,X_test):
        return X_test@self.weights + self.bias


    def backward_propagation(self,learning_rate, sample):
        indices = np.arange(self.X_train.shape[0])
        np.random.shuffle(indices)
        X_train = self.X_train[indices]
        y_train = self.y_train[indices]
        X_batch = X_train[:sample]
        y_batch = y_train[:sample]
        gradient =  (2/sample) *X_batch.T@(X_batch @ self.weights + self.bias - y_batch)
        gradient_bias = (2/sample) * np.sum(X_batch @ self.weights + self.bias - y_batch)
        self.weights -= learning_rate * gradient
        self.bias -= learning_rate * gradient_bias


### A) Résolution par équation normale

In [9]:
model_linear = LinearRegression()
model_linear.fit(X_train,y_train)

(0    -2242.354677
 1    -8474.225900
 2     1786.148470
 3      -15.359703
 4       79.962291
 5      -39.904622
 6      133.170287
 7    45838.748893
 dtype: float64,
 0.0)

In [10]:
result = model_linear.predict(X_test.to_numpy())

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
model = LinearRegression()
model.fit(X_train,y_train)
result_skl = model.predict(X_test)

In [13]:
from sklearn.metrics import mean_absolute_error

In [14]:
print(f"Moyenne absolue sklearn: {mean_absolute_error(y_test, result_skl)}")
print(f"Moyenne absolue Implémentation: {mean_absolute_error(y_test, result)}")
print(f"prix max d'une habitation : {y_test.max()}")
print(f"prix min d'une habitation : {y_test.min()}")

Moyenne absolue sklearn: 50352.22825794297
Moyenne absolue Implémentation: 54747.3043198532
prix max d'une habitation : 500001.0
prix min d'une habitation : 22500.0


### B) Résolution par backward_propagation

In [47]:
class LinearRegression:


    def __init__(self, resolution:int = -1):
        self.methode_resolution = resolution
        self.weights = None
        self.bias = 0
        self.X_train = None
        self.y_train = None
        self.gradient = []

    def fit(self,X_train,y_train, learning_rate:float = 0.0001, epochs:int = 1000, gradient_limit = 2.0, sample:int  = - 1):
        n_samples, n_features = X_train.shape
        self.X_train = X_train
        self.y_train = y_train
        self.weights = np.random.randn(n_features,1) *0.1
        self.bias = np.random.randint()* 0.01
        result = X_train.T@X_train
        if self.methode_resolution == -1:
                self.weights = np.linalg.pinv(result)@np.transpose(X_train)@y_train
        else:
            for _ in range(epochs):
                if sample == -1:
                    self.backward_propagation(learning_rate, sample = X_train.shape[0], gradient_limit=gradient_limit)
                else:
                    self.backward_propagation(learning_rate, sample = sample, gradient_limit=gradient_limit)
        return self.weights, self.bias



    def predict(self,X_test):
        return X_test@self.weights + self.bias


    def backward_propagation(self,learning_rate, sample, gradient_limit = 2.0):
        seed = np.random.seed(42)
        indices = np.arange(self.X_train.shape[0])
        np.random.shuffle(indices)
        X_train = self.X_train[indices]
        y_train = self.y_train[indices]
        X_batch = X_train[:sample]
        y_batch = y_train[:sample]
        gradient =  np.clip((2/sample) *X_batch.T@(X_batch @ self.weights + self.bias - y_batch.reshape(-1,1)),-gradient_limit,gradient_limit)
        gradient_bias = np.clip((2/sample) * np.sum(X_batch @ self.weights + self.bias - y_batch.reshape(-1,1)),-gradient_limit,gradient_limit)
        self.gradient.append(gradient)
        self.weights -= learning_rate * gradient
        self.bias -= learning_rate * gradient_bias


In [42]:
model_linear = LinearRegression(resolution = 1)
model_linear.fit(X_train.to_numpy(),y_train.to_numpy())

(array([[-0.10314913],
        [ 0.21011428],
        [ 0.31633254],
        [ 0.29517817],
        [ 0.34576978],
        [ 0.21292324],
        [ 0.14675698],
        [ 0.26212335]]),
 np.float64(0.20000000000000367))

In [43]:
result_impl = model_linear.predict(X_test.to_numpy())

In [45]:
print(f"Moyenne absolue sklearn: {mean_absolute_error(y_test, result_skl)}")
print(f"Moyenne absolue Implémentation: {mean_absolute_error(y_test, result_impl)}")
print(f"prix max d'une habitation : {y_test.max()}")
print(f"prix min d'une habitation : {y_test.min()}")

Moyenne absolue sklearn: 50352.22825794297
Moyenne absolue Implémentation: 204494.96304191145
prix max d'une habitation : 500001.0
prix min d'une habitation : 22500.0
