## Linear Regression model in Python

In [28]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import  LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [20]:
### read the train, test, dev dataset
#train data
train_df = pd.read_csv("../data/train.csv")
train_df = train_df.drop("Unnamed: 0",axis=1)
x_train = train_df.drop("Price",axis=1)
x_train = x_train.select_dtypes(include=['int', 'float'])  #only regression for the numeric,remove all the other type data
y_train = train_df["Price"]  #price is the final goal we need to predict
x_train.head()
y_train.head()

#test data
test_df = pd.read_csv("../data/test.csv")
test_df = test_df.drop("Unnamed: 0",axis=1)
x_test = test_df.drop("Price",axis=1)
x_test = x_test.select_dtypes(include=['int', 'float'])
y_test = test_df["Price"]
x_test.head()
y_test.head()

#dev data
dev_df = pd.read_csv("../data/dev.csv")
dev_df = dev_df.drop("Unnamed: 0",axis=1)
x_dev = dev_df.drop("Price",axis=1)
x_dev = x_dev.select_dtypes(include=['int', 'float'])
y_dev = dev_df["Price"]
x_dev.head()
y_dev.head()


0    552.808567
1    276.521454
2    251.915731
3    319.640053
4    983.291543
Name: Price, dtype: float64

In [21]:
x_train.head()

Unnamed: 0,Person.Capacity,Multiple.Rooms,Business,Cleanliness.Rating,Guest.Satisfaction,Bedrooms,City.Center..km.,Metro.Distance..km.,Attraction.Index,Normalised.Attraction.Index,Restraunt.Index,Normalised.Restraunt.Index
0,4,0,0,8,85,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928
1,2,0,1,9,87,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467
2,2,0,0,10,98,1,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677
3,4,0,0,9,96,2,3.719141,1.196112,106.226456,5.624761,133.876202,9.328686
4,2,0,0,10,96,1,1.009922,0.917115,409.858124,21.70226,555.114276,38.681161


## the basic lr model for all features

In [24]:
lr=LinearRegression() #normalized data will improve the processing speed 
lr.fit(x_train,y_train)
y_pred_lr=lr.predict(x_test)

mae_lr = metrics.mean_absolute_error(y_test, y_pred_lr)
mse_lr = metrics.mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = metrics.r2_score(y_test, y_pred_lr)

print('\nMAE of Linear Regression : ', mae_lr)
print('\nMSE of Linear Regression : ', mse_lr)
print('\nRMSE of Linear Regression: ', rmse_lr)
print('\nR2 of Linear Regression  : ', r2_lr)


MAE of Linear Regression :  105.65500109747222

MSE of Linear Regression :  61045.62467038386

RMSE of Linear Regression:  247.07412788550698

R2 of Linear Regression  :  0.17940344583563805


## Using polynomial

In [27]:
#model build and fit
poly_lr = make_pipeline(PolynomialFeatures(degree=2),LinearRegression())
poly_lr.fit(x_train,y_train)
y_pred_poly = poly_lr.predict(x_test)
#calculate the meatures of model
mae_poly = metrics.mean_absolute_error(y_test, y_pred_poly)
mse_poly = metrics.mean_squared_error(y_test, y_pred_poly)
rmse_poly = np.sqrt(mse_poly)
r2_poly = metrics.r2_score(y_test, y_pred_poly)

print('\nMAE of Polynomial Linear Regression : ', mae_poly)
print('\nMSE of Polynomial Linear Regression : ', mse_poly)
print('\nRMSE of Polynomial Linear Regression: ', rmse_poly)
print('\nR2 of Polynomial Linear Regression  : ', r2_poly)


MAE of Polynomial Linear Regression :  94.08462882381585

MSE of Polynomial Linear Regression :  54419.95727964579

RMSE of Polynomial Linear Regression:  233.2808549359458

R2 of Polynomial Linear Regression  :  0.2684679751812865


## Using L1 

In [30]:
# model build and fit
l1 = Lasso(alpha=0.1)
l1.fit(x_train,y_train)
y_pred_l1 = l1.predict(x_test)

mae_l1 = metrics.mean_absolute_error(y_test, y_pred_l1)
mse_l1 = metrics.mean_squared_error(y_test, y_pred_l1)
rmse_l1 = np.sqrt(mse_l1)
r2_l1 = metrics.r2_score(y_test, y_pred_l1)

print('\nMAE of Linear Regression with L1 : ', mae_l1)
print('\nMSE of Linear Regression  with L1: ', mse_l1)
print('\nRMSE of Linear Regression with L1: ', rmse_l1)
print('\nR2 of Linear Regression with L1  : ', r2_l1)


MAE of Linear Regression with L1 :  105.62518918465686

MSE of Linear Regression  with L1:  61044.87964395439

RMSE of Linear Regression with L1:  247.07262018271953

R2 of Linear Regression with L1  :  0.1794134607404586


## l2

In [31]:
# model build and fit
l2 = Ridge(alpha=0.1)
l2.fit(x_train,y_train)
y_pred_l2 = l2.predict(x_test)

mae_l2 = metrics.mean_absolute_error(y_test, y_pred_l2)
mse_l2 = metrics.mean_squared_error(y_test, y_pred_l2)
rmse_l2 = np.sqrt(mse_l2)
r2_l2 = metrics.r2_score(y_test, y_pred_l2)

print('\nMAE of Linear Regression with L2 : ', mae_l2)
print('\nMSE of Linear Regression  with L2: ', mse_l2)
print('\nRMSE of Linear Regression with L2: ', rmse_l2)
print('\nR2 of Linear Regression with L2  : ', r2_l2)


MAE of Linear Regression with L2 :  105.65493202360771

MSE of Linear Regression  with L2:  61045.62318186005

RMSE of Linear Regression with L2:  247.0741248732049

R2 of Linear Regression with L2  :  0.17940346584489375


## USING gridsearch

In [32]:
# 定义超参数搜索范围
param_grid = {'normalize': [True, False], 'fit_intercept': [True, False]}

# 使用网格搜索来选择最佳的超参数组合
grid = GridSearchCV(lr, param_grid=param_grid, cv=5)
grid.fit(x_train, y_train)

# 打印最佳超参数组合
print(grid.best_params_)

{'fit_intercept': False, 'normalize': True}


## minmaxscaler

In [33]:
from sklearn.preprocessing import MinMaxScaler

# 进行 MinMax 缩放
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

lr.fit(x_train_scaled,y_train)
y_pred_scaler = lr.predict(x_test_scaled)

mae_scaler = metrics.mean_absolute_error(y_test, y_pred_scaler)
mse_scaler = metrics.mean_squared_error(y_test, y_pred_scaler)
rmse_scaler = np.sqrt(mse_scaler)
r2_scaler = metrics.r2_score(y_test, y_pred_scaler)

print('\nMAE of Linear Regression with scaler : ', mae_scaler)
print('\nMSE of Linear Regression  with scaler: ', mse_scaler)
print('\nRMSE of Linear Regression with scaler: ', rmse_scaler)
print('\nR2 of Linear Regression with scaler  : ', r2_scaler)


MAE of Linear Regression with scaler :  105.65500109747228

MSE of Linear Regression  with scaler:  61045.62467038387

RMSE of Linear Regression with scaler:  247.074127885507

R2 of Linear Regression with scaler  :  0.17940344583563794
