## Linear Regression model in Python

In [17]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import  LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline

In [18]:
### read the train, test, dev dataset
#train data
train_df = pd.read_csv("../data/train.csv")
train_df = train_df.drop("Unnamed: 0",axis=1)
x_train = train_df.drop("Price",axis=1)
x_train = x_train.select_dtypes(include=['int', 'float'])  #only regression for the numeric,remove all the other type data
y_train = np.log(train_df["Price"])  #price is the final goal we need to predict
x_train.head()
y_train.head()

#test data
test_df = pd.read_csv("../data/test.csv")
test_df = test_df.drop("Unnamed: 0",axis=1)
x_test = test_df.drop("Price",axis=1)
x_test = x_test.select_dtypes(include=['int', 'float'])
y_test = np.log(test_df["Price"])
x_test.head()
y_test.head()

#dev data
dev_df = pd.read_csv("../data/dev.csv")
dev_df = dev_df.drop("Unnamed: 0",axis=1)
x_dev = dev_df.drop("Price",axis=1)
x_dev = x_dev.select_dtypes(include=['int', 'float'])
y_dev = np.log(dev_df["Price"])
x_dev.head()
y_dev.head()


0    6.315012
1    5.622288
2    5.529095
3    5.767196
4    6.890906
Name: Price, dtype: float64

In [19]:
x_train.head()

Unnamed: 0,Person.Capacity,Multiple.Rooms,Business,Cleanliness.Rating,Guest.Satisfaction,Bedrooms,City.Center..km.,Metro.Distance..km.,Attraction.Index,Normalised.Attraction.Index,Restraunt.Index,Normalised.Restraunt.Index
0,4,0,0,8,85,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928
1,2,0,1,9,87,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467
2,2,0,0,10,98,1,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677
3,4,0,0,9,96,2,3.719141,1.196112,106.226456,5.624761,133.876202,9.328686
4,2,0,0,10,96,1,1.009922,0.917115,409.858124,21.70226,555.114276,38.681161


## the basic lr model for all features

In [20]:
lr=LinearRegression() #normalized data will improve the processing speed 
lr.fit(x_train,y_train)
y_pred_lr=lr.predict(x_test)

mae_lr = metrics.mean_absolute_error(y_test, y_pred_lr)
mse_lr = metrics.mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = metrics.r2_score(y_test, y_pred_lr)

print('\nMAE of Linear Regression : ', mae_lr)
print('\nMSE of Linear Regression : ', mse_lr)
print('\nRMSE of Linear Regression: ', rmse_lr)
print('\nR2 of Linear Regression  : ', r2_lr)


MAE of Linear Regression :  0.35570350120114774

MSE of Linear Regression :  0.2156368808044554

RMSE of Linear Regression:  0.4643671831691548

R2 of Linear Regression  :  0.37105719329414844


## Using L1 

In [21]:
# model build and fit
l1 = Lasso(alpha=0.1)
l1.fit(x_train,y_train)
y_pred_l1 = l1.predict(x_test)

mae_l1 = metrics.mean_absolute_error(y_test, y_pred_l1)
mse_l1 = metrics.mean_squared_error(y_test, y_pred_l1)
rmse_l1 = np.sqrt(mse_l1)
r2_l1 = metrics.r2_score(y_test, y_pred_l1)

print('\nMAE of Linear Regression with L1 : ', mae_l1)
print('\nMSE of Linear Regression  with L1: ', mse_l1)
print('\nRMSE of Linear Regression with L1: ', rmse_l1)
print('\nR2 of Linear Regression with L1  : ', r2_l1)


MAE of Linear Regression with L1 :  0.37325401520123225

MSE of Linear Regression  with L1:  0.23831941535353018

RMSE of Linear Regression with L1:  0.48817969576123316

R2 of Linear Regression with L1  :  0.3048996005424971


## l2

In [22]:
# model build and fit
l2 = Ridge(alpha=0.1)
l2.fit(x_train,y_train)
y_pred_l2 = l2.predict(x_test)

mae_l2 = metrics.mean_absolute_error(y_test, y_pred_l2)
mse_l2 = metrics.mean_squared_error(y_test, y_pred_l2)
rmse_l2 = np.sqrt(mse_l2)
r2_l2 = metrics.r2_score(y_test, y_pred_l2)

print('\nMAE of Linear Regression with L2 : ', mae_l2)
print('\nMSE of Linear Regression  with L2: ', mse_l2)
print('\nRMSE of Linear Regression with L2: ', rmse_l2)
print('\nR2 of Linear Regression with L2  : ', r2_l2)


MAE of Linear Regression with L2 :  0.3557035022308756

MSE of Linear Regression  with L2:  0.21563690540800645

RMSE of Linear Regression with L2:  0.4643672096606375

R2 of Linear Regression with L2  :  0.37105712153357384


## USING gridsearch polynomial non linear

In [8]:
# Define a Pipeline object with two steps, PolynomialFeatures and LinearRegression
pipeline = Pipeline([
    ('poly', PolynomialFeatures()),
    ('reg', LinearRegression())
])

# Define a parameter dictionary containing two parameters, degree and fit_intercept, for the GridSearchCV method to search for the best subset
params = {
    'poly__degree': [1, 2, 3, 4, 5, 6],
    'reg__fit_intercept': [True, False]
}

# Define a GridSearchCV object to search for the best subset
grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(x_train, y_train)

# Output the best subset
print(grid_search.best_params_)

y_pred_cv = grid_search.predict(x_test)
mae_cv = metrics.mean_absolute_error(y_test, y_pred_cv)
mse_cv = metrics.mean_squared_error(y_test, y_pred_cv)
rmse_cv = np.sqrt(mse_cv)
r2_cv = metrics.r2_score(y_test, y_pred_cv)

print('\nMAE of best subset : ', mae_cv)
print('\nMSE of best subset : ', mse_cv)
print('\nRMSE of best subset: ', rmse_cv)
print('\nR2 of best subset  : ', r2_cv)

{'poly__degree': 1, 'reg__fit_intercept': True}

MAE of best subset :  105.65500109747224

MSE of best subset :  61045.624670383855

RMSE of best subset:  247.07412788550698

R2 of best subset  :  0.17940344583563816


## minmaxscaler

In [23]:
from sklearn.preprocessing import MinMaxScaler

# 进行 MinMax 缩放
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

lr.fit(x_train_scaled,y_train)
y_pred_scaler = lr.predict(x_test_scaled)

mae_scaler = metrics.mean_absolute_error(y_test, y_pred_scaler)
mse_scaler = metrics.mean_squared_error(y_test, y_pred_scaler)
rmse_scaler = np.sqrt(mse_scaler)
r2_scaler = metrics.r2_score(y_test, y_pred_scaler)

print('\nMAE of Linear Regression with scaler : ', mae_scaler)
print('\nMSE of Linear Regression  with scaler: ', mse_scaler)
print('\nRMSE of Linear Regression with scaler: ', rmse_scaler)
print('\nR2 of Linear Regression with scaler  : ', r2_scaler)


MAE of Linear Regression with scaler :  0.35570350120114774

MSE of Linear Regression  with scaler:  0.2156368808044554

RMSE of Linear Regression with scaler:  0.4643671831691548

R2 of Linear Regression with scaler  :  0.37105719329414844


## using CV to the best subset selection

In [24]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# 数据标准化
scaler = StandardScaler()
X = scaler.fit_transform(x_train)

# 递归特征消除交叉验证
estimator = LinearRegression()
selector = RFECV(estimator, step=1, cv=5)
selector.fit(X, y_train)

# 打印选择后的特征和最优特征数
print('选择后的特征：', selector.support_)
print('最优特征数：', selector.n_features_)


选择后的特征： [False False False False False False False False False  True False False]
最优特征数： 1


In [16]:
print(selector.get_feature_names_out)

<bound method SelectorMixin.get_feature_names_out of RFECV(cv=5, estimator=LinearRegression())>
