# Петров Сергей

## Проверить всё вместе

1) Использовать One-Hot энкодинги для категориальных фичей у которых нет естественного порядка (hint: pd.get_dummies)

2) Заполнить nan - в числовых фичах средним из train выборки

3) Сделать стандартное шкалирование по train выборке (можно взять StandardScaler из sklearn или использовать свой самописный)

4) Сравнить модели (mse, r2, forward_selector):
  * Самописное аналитическое решение
  * Самописный градиентный спуск
  * Линейную регрессию из sklearn
  * Ridge регрессию из sklearn для alpha (0.01, 0.1, 1.0)
  * Lasso регрессию из sklearn для alpha (0.01, 0.1, 1.0)
  * ElasticNet регрессию из sklearn для alpha (0.01, 0.1, 1.0)

5) Выбрать лучшую модель по mse на test выборке

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [11]:
df = pd.read_csv('train.csv')

In [12]:
r_state = 42
alpha = [0.01, 0.1, 1]

## 1) One-hot энк для бинарных данных 

In [13]:
df['Street'] = df['Street'].astype('category')
df['Street'] = df['Street'].cat.codes

In [14]:
df['Street'].unique()

array([1, 0], dtype=int8)

## 1) One-hot энк для небинарных персон

In [15]:
df = pd.get_dummies(df, columns = ['Alley'])

In [7]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Alley_Grvl,Alley_Pave
0,1,60,RL,65.0,8450,1,Reg,Lvl,AllPub,Inside,...,,,0,2,2008,WD,Normal,208500,0,0
1,2,20,RL,80.0,9600,1,Reg,Lvl,AllPub,FR2,...,,,0,5,2007,WD,Normal,181500,0,0
2,3,60,RL,68.0,11250,1,IR1,Lvl,AllPub,Inside,...,,,0,9,2008,WD,Normal,223500,0,0
3,4,70,RL,60.0,9550,1,IR1,Lvl,AllPub,Corner,...,,,0,2,2006,WD,Abnorml,140000,0,0
4,5,60,RL,84.0,14260,1,IR1,Lvl,AllPub,FR2,...,,,0,12,2008,WD,Normal,250000,0,0


In [16]:
hw_train_df, hw_test_df = train_test_split(df,  test_size=0.25, random_state=r_state)

## 2) Заполнение нанов

In [17]:
for f in df.columns:
    if not (np.issubdtype(hw_train_df[f].dtype, np.number)):
        continue
        
    mean = hw_train_df[f].mean()
    hw_train_df[f] = hw_train_df[f].fillna(mean)
    hw_test_df[f] = hw_test_df[f].fillna(mean)

## 3)  Стандартное шкалирование

In [18]:
# Реализовать класс MinMaxScaler, который запоминает параметры min и range (max-min) на трейн выборке
# и использует их, чтобы отшкалировать данные (X-min)/(max-min)

class MinMaxScaler:
    def __init__(self):
        self.min = None
        self.range = None
    
    def fit(self, X_train):
        self.min = {}
        self.range = {}
        for col in X_train.columns:
            
            if not (np.issubdtype(X_train[col].dtype, np.number)):
                continue
                    
            self.min[col] = X_train[col].min()
            self.range[col] = X_train[col].max() - self.min[col]
    
    def transform(self, X):
        transformed = X[:]
        for col in self.min.keys():            
            transformed[col] = (transformed[col] - self.min[col])/self.range[col]
        return transformed
    
    def fit_transform(self, X_train):
        self.fit(X_train)
        return self.transform(X_train)
    
    def inverse_transform(self, X):
        transformed = X[:]
        for col in self.min.keys():            
            transformed[col] = transformed[col]*self.range[col] + self.min[col]
        return transformed

In [19]:
# Реализовать класс StandardScaler, который запоминает параметры mean и std на трейн выборке
# и использует их, чтобы отшкалировать данные (X-mean)/std

class StandardScaler:
    def __init__(self):
        self.mean = None
        self.std = None
    
    def fit(self, X_train):
        self.mean = {}
        self.std = {}
        for col in X_train.columns:
            
            if not (np.issubdtype(X_train[col].dtype, np.number)):
                continue
                    
            self.mean[col] = X_train[col].mean()
            self.std[col] = X_train[col].std()
    
    def transform(self, X):
        transformed = X[:]
        for col in self.mean.keys():            
            transformed[col] = (transformed[col] - self.mean[col])/self.std[col]
        return transformed
    
    def fit_transform(self, X_train):
        self.fit(X_train)
        return self.transform(X_train)
    
    def inverse_transform(self, X):
        transformed = X[:]
        for col in self.mean.keys():            
            transformed[col] = transformed[col]* self.std[col] + self.mean[col]
        return transformed

In [20]:
std_scal = StandardScaler()
hw_std_train_df = std_scal.fit_transform(hw_train_df)
hw_std_test_df = std_scal.fit_transform(hw_test_df)

## 4) ...

In [125]:
y_col = 'SalePrice'

numerical_mask = [np.issubdtype(hw_std_train_df[name].dtype, np.number) and not (name in [y_col, 'Id']) for name in hw_std_train_df.columns]
num_tr_df = hw_std_train_df[hw_std_train_df.columns[numerical_mask]]
features_to_forward_select = list(num_tr_df.columns)

In [126]:
class ForwardSelector:
    def __init__(self, model):
        self.model = model
        
    def forward_select(self, train_df, test_df, x_col, y_col):
        
        y_train = train_df[y_col]
        min_mse = np.inf
        selected = {}
        x_col = x_col[:]
        
        for i in range(len(x_col)):
            
            next_feature = np.nan
            for f in x_col:
                X_train = train_df[list(selected.keys()) + [f]]
                self.model.fit(X_train, y_train)
                cur_mse = mse(y_train, self.model.predict(X_train))
                if(min_mse > cur_mse):
                    min_mse = cur_mse
                    next_feature = f
                    
            if not(next_feature is np.nan):
                selected[next_feature] = min_mse
                x_col.remove(next_feature)
            else:
                break
        return selected

In [127]:
fs = ForwardSelector(LSM_Regressor())
dic = fs.forward_select(hw_std_train_df, hw_std_test_df, features_to_forward_select, y_col)
chosen_f = list(dic.keys())

In [129]:
print(f'Delete {len(features_to_forward_select) - len(chosen_f)} params')
for i in features_to_forward_select:
    try:
        chosen_f.index(i)
    except:
        print(i)

Delete 2 params
BsmtFinSF2
2ndFlrSF


In [130]:
def mse(x,y):
    t = x - y
    return (t*t).mean()

In [131]:
def r2(x, y):
    res = y - x
    tot = y - y.mean()
    res *= res
    tot *= tot    
    return 1 - sum(res)/sum(tot)

## Линейная регрессия

In [132]:
class LSM_Regressor:
    def __init__(self):
        self.W = None
        self.b = None
    
    def fit(self, X_train, y_train):
        x_expanded = np.append(X_train, np.ones((X_train.shape[0], 1)), axis = 1)
        temp = np.linalg.inv(x_expanded.T @ x_expanded) @ x_expanded.T @ y_train
        self.W = temp[:-1]
        self.b = temp[-1]
        
    def predict(self, X):
        return X @ self.W + self.b

In [133]:
lsm = LSM_Regressor()
lsm.fit(hw_std_train_df[chosen_f], hw_std_train_df[y_col])
lsm_pr = lsm.predict(hw_std_test_df[chosen_f])

## Градиентный спуск 

In [134]:
class GD_Regressor:
    def __init__(self, max_steps=2000, alpha=1e-10):
        self.nsteps = max_steps
        self.alpha = alpha
        
        self.W = None
#         self.b = None
    def __grad(self,x,y):
        return x.T @( (x @ self.W) - y)
        
    def fit(self, X_train, y_train):
        lamb = 0.1
        x_expanded = np.append(X_train, np.ones((X_train.shape[0], 1)), axis = 1)
        self.W = np.ones(x_expanded.shape[1])
        
        prev_m = mse(self.predict(X_train), y_train)
        for i in range(self.nsteps):
            self.W -= lamb*self.__grad(x_expanded, y_train)
            next_m = mse(self.predict(X_train), y_train)
            if(next_m > prev_m):
                lamb /= 2
            
            if(abs(next_m - prev_m) < self.alpha):
                break
            prev_m  = next_m 
            
#         self.W, self.b = self.W[:-1], self.W[-1]
        
    def predict(self, X):
        X = np.append(X, np.ones((X.shape[0], 1)), axis = 1)
        return X @ self.W

In [135]:
gd = GD_Regressor()
gd.fit(hw_std_train_df[chosen_f], hw_std_train_df[y_col])
gd_pr = gd.predict(hw_std_test_df[chosen_f])

## Sklearn линейная регрессия

In [136]:
sklsm = sk.linear_model.LinearRegression()
sklsm.fit(hw_std_train_df[chosen_f], hw_std_train_df[y_col])
sklsm_pr = sklsm.predict(hw_std_test_df[chosen_f])

## Sklearn Ridge 

In [137]:
ridge_pr = []
for a in alpha:
    ridge = sk.linear_model.Ridge(a)
    ridge.fit(hw_std_train_df[chosen_f], hw_std_train_df[y_col])
    ridge_pr += [ridge.predict(hw_std_test_df[chosen_f])]


## Sklearn Lasso

In [138]:
lasso_pr = []
for a in alpha:
    lasso = sk.linear_model.Lasso(a)
    lasso.fit(hw_std_train_df[chosen_f], hw_std_train_df[y_col])
    lasso_pr += [lasso.predict(hw_std_test_df[chosen_f])]

## Sklearn ElasticNet

In [139]:
ela_pr = []
for a in alpha:
    ela = sk.linear_model.ElasticNet(a)
    ela.fit(hw_std_train_df[chosen_f], hw_std_train_df[y_col])
    ela_pr += [ela.predict(hw_std_test_df[chosen_f])]

# Ошибк

In [140]:
print('LSM')
print("MSE",mse(hw_std_test_df[y_col], lsm_pr))
print("R2", r2(hw_std_test_df[y_col], lsm_pr))

LSM
MSE 0.17001782868745116
R2 0.7976686714809162


In [141]:
print('Grand')
print("MSE",mse(hw_std_test_df[y_col], gd_pr))
print("R2", r2(hw_std_test_df[y_col], gd_pr))

Grand
MSE 0.1700162217133981
R2 0.7976704464057448


In [142]:
print('SkLsm')
print("MSE",mse(hw_std_test_df[y_col], sklsm_pr))
print("R2", r2(hw_std_test_df[y_col], sklsm_pr))

SkLsm
MSE 0.17001782868745127
R2 0.7976686714809155


In [143]:
print('Ridge')
for i in range(len(alpha)):
    print(alpha[i])
    print("MSE",mse(hw_std_test_df[y_col], ridge_pr[i]))
    print("R2", r2(hw_std_test_df[y_col], ridge_pr[i]))

Ridge
0.01
MSE 0.17001735116148523
R2 0.7976679344287751
0.1
MSE 0.17001306290311388
R2 0.7976612958548835
1
MSE 0.16997110115495545
R2 0.7975944167947654


In [144]:
print('Lasso')
for i in range(len(alpha)):
    print(alpha[i])
    print("MSE",mse(hw_std_test_df[y_col], lasso_pr[i]))
    print("R2", r2(hw_std_test_df[y_col], lasso_pr[i]))

Lasso
0.01
MSE 0.1686476097819389
R2 0.7905462861448077
0.1
MSE 0.2065463290846978
R2 0.6436287063488944
1
MSE 0.9972602739726028
R2 -2.6255941210517018e+64


In [145]:
print('ElasticNet')
for i in range(len(alpha)):
    print(alpha[i])
    print("MSE",mse(hw_std_test_df[y_col], ela_pr[i]))
    print("R2", r2(hw_std_test_df[y_col], ela_pr[i]))

ElasticNet
0.01
MSE 0.16851543451971493
R2 0.7944100948832368
0.1
MSE 0.1808111095432974
R2 0.7313819046477348
1
MSE 0.674555127736402
R2 -13.90688284534544
