# Linear Regression Model with Boston Dataset

CRIM - per capita crime rate by town

ZN - proportion of residential land zoned for lots over 25,000 sq.ft.

INDUS - proportion of non-retail business acres per town.

CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)

NOX - nitric oxides concentration (parts per 10 million)

RM - average number of rooms per dwelling

AGE - proportion of owner-occupied units built prior to 1940

DIS - weighted distances to five Boston employment centres

RAD - index of accessibility to radial highways

TAX - full-value property-tax rate per $10,000

PTRATIO - pupil-teacher ratio by town

B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town

LSTAT - % lower status of the population

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

### Load Data


In [None]:
# Load the Boston dataset CSV from its hosted URL into a DataFrame.
df = pd.read_csv('https://rathachai.github.io/DA101/data/boston.csv')

In [None]:
df

In [None]:
# Print the column overview, then show per-column summary statistics.
df.info()
df.describe()

### Data Exploration

## Normal Linear Regression

In [None]:
sns.pairplot(df)

In [None]:
sns.pairplot(df,x_vars=df.columns,y_vars=["medv"])

In [None]:
sns.displot(df['medv'])

In [None]:
df.corr()

In [None]:
sns.heatmap(df.corr())

In [None]:
# Correlation of every column with medv, sorted in ascending order.
medv_corr = df.corr()[["medv"]]
medv_corr.sort_values("medv")

In [None]:
sns.pairplot(df,x_vars=["lstat","rm"],y_vars=["medv"])

Selected 'lstat' and 'rm' because their distribution plots look closest to a normal distribution.

In [None]:
# Features: lstat and rm. Target: medv.
X = df[["lstat","rm"]]
y = df["medv"]

In [None]:
X


In [None]:
y

In [None]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=101)

# Report the size of each partition. The target sizes are read from y_train / y_test
# themselves so the labels match what is actually measured.
print('len X_train',len(X_train),\
        '|len X_test',len(X_test))
print('len y_train',len(y_train),\
        '|len y_test',len(y_test))

In [None]:
# Fit a LinearRegression model on the training split.
model = LinearRegression()
model.fit(X_train,y_train)


In [None]:
# Print the fitted equation: one "coef * feature +" line per feature, then the intercept.
print("LM MODEL\n",\
        y.name,"=")
for coef, feature in zip(model.coef_, X.columns):
    print(coef,"*",feature,"+")
print(model.intercept_)

### Evaluation

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred

In [None]:
plt.scatter(y_test,y_pred)

In [None]:
# RMSE on the hold-out set. The `squared=False` flag of mean_squared_error was
# deprecated in scikit-learn 1.4 and is no longer accepted by newer releases,
# so take the square root of the MSE explicitly; this works on every version.
rmse_linearReg = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("RMSE = ", rmse_linearReg)

In [None]:
# Mean absolute percentage error of the hold-out predictions, expressed in percent.
abs_pct_errors = np.abs((y_test - y_pred) / y_test)
mape = np.mean(abs_pct_errors) * 100
print("MAPE = ", mape, "%")

## K-Fold Cross-Validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


### Normal K-Fold

In [None]:
X = df[["lstat","rm"]]
y = df["medv"]


In [None]:
# Number of folds, and the splitter built from it (no other KFold options are set).
k = 5
kf = KFold(n_splits = k)


In [None]:
# Per-fold RMSE values, averaged after the loop.
rmse_ = []

# enumerate() supplies the 1-based fold number, so the builtin round() is not rebound.
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print("Round",fold)
    print("TRAIN :",train_index[0:10],"...")
    print("TEST :", test_index[0:5],"...")

    # KFold.split yields positional indices, so select rows by position (.iloc)
    # rather than by label; this stays correct even if the index is not 0..n-1.
    X_train,X_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = y.iloc[train_index],y.iloc[test_index]

    model = LinearRegression()
    model.fit(X_train,y_train)

    y_pred = model.predict(X_test)

    # Square root of the MSE; avoids the `squared=False` flag that newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    print("RMSE =", rmse)
    rmse_.append(rmse)

    print('--------------------------------')

In [None]:
# Average the per-fold RMSE values into one cross-validated score.
rmse_normKFold = np.mean(rmse_)
print("RMSE =",rmse_normKFold)

### Feature Scaling with K-Fold

In [None]:
from sklearn import preprocessing

In [None]:
X = df[["lstat","rm"]]
y = df["medv"]

In [None]:
X.boxplot()

In [None]:
# Standardise both features using statistics computed over all of X.
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
print("type of X =",type(X))
print("type of X_scaled =",type(X_scaled))

In [None]:
pd.DataFrame(X_scaled, columns=["lstat","rm"]).boxplot()

In [None]:
k = 5
kf = KFold(n_splits = k)

In [None]:
# Per-fold RMSE values, averaged after the loop.
rmse_ = []

# enumerate() supplies the 1-based fold number, so the builtin round() is not rebound.
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print("Round",fold)
    print("TRAIN :",train_index[0:10],"...")
    print("TEST :", test_index[0:5],"...")

    # KFold.split yields positional indices, so select rows by position.
    X_train,X_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = y.iloc[train_index],y.iloc[test_index]

    # Actually apply the feature scaling this section is about: fit the scaler
    # on the training fold only and reuse it for the test fold, so test rows
    # never influence the scaling statistics.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train_scaled,y_train)

    y_pred = model.predict(X_test_scaled)

    # Square root of the MSE; avoids the `squared=False` flag that newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    print("RMSE =", rmse)
    rmse_.append(rmse)

    print('--------------------------------')

In [None]:
# Average the per-fold RMSE values of the scaled run.
rmse_scalingKFold = np.mean(rmse_)
print("RMSE =", rmse_scalingKFold)

### Feature Engineering

In [None]:
sns.pairplot(df[['lstat','medv']])

In [None]:
# Pair plot of log-transformed lstat against medv.
log_lstat_frame = pd.DataFrame({"lstat": np.log(df["lstat"]), "medv": y})
sns.pairplot(log_lstat_frame)

In [None]:
# Per-fold RMSE values, averaged after the loop.
rmse_ = []

# enumerate() supplies the 1-based fold number, so the builtin round() is not rebound.
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print("Round",fold)
    print("TRAIN :",train_index[0:10],"...")
    print("TEST :", test_index[0:5],"...")

    # KFold.split yields positional indices, so select rows by position.
    X_train,X_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = y.iloc[train_index],y.iloc[test_index]

    # assign() returns new frames, so the log transform never writes into a
    # selection taken from X (which would raise pandas' chained-assignment
    # warning and leave it unclear whether X itself is modified).
    X_train = X_train.assign(lstat=np.log(X_train["lstat"]))
    X_test = X_test.assign(lstat=np.log(X_test["lstat"]))

    # Scaling statistics come from the training fold only.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train_scaled,y_train)

    y_pred = model.predict(X_test_scaled)

    # Square root of the MSE; avoids the `squared=False` flag that newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    print("RMSE =", rmse)
    rmse_.append(rmse)

    print('--------------------------------')

In [None]:
rmse_FeatureEngi = np.mean(rmse_)
print("RMSE =",rmse_FeatureEngi)

In [None]:
# Print the four RMSE figures collected in the earlier cells, one per line.
print('RMSE Summary\n',\
    '- LinearReg :',rmse_linearReg,\
    '\n- NormKFold :',rmse_normKFold,\
    '\n- ScalingKFold :',rmse_scalingKFold,\
    '\n- FeatureEngi :',rmse_FeatureEngi)

# Deploy

In [4]:
def modelSummary(name,X,y,model):
    """Print a fitted linear model as a readable equation.

    Output is the heading `name`, then "<y.name> =", then one
    "<coefficient> * <column> +" line per column of X, then the intercept.

    name  -- heading printed first
    X     -- object exposing `.columns` (the feature names)
    y     -- object exposing `.name` (the target name)
    model -- object exposing an indexable `coef_` and an `intercept_`
    """
    print(name,"\n",\
        y.name,"=")
    # Walk the columns directly; the position picks the matching coefficient.
    for position, feature in enumerate(X.columns):
        print(model.coef_[position],"*",feature,"+")
    print(model.intercept_)

In [5]:
df = pd.read_csv('https://rathachai.github.io/DA101/data/boston.csv')


In [6]:
# Replace lstat with its natural logarithm, the same transform applied inside the cross-validation loop.
df["lstat"] = np.log(df["lstat"])

In [7]:
X = df[["lstat","rm"]]
y = df["medv"]


In [8]:
model = LinearRegression()
model.fit(X,y)

LinearRegression()

In [9]:
modelSummary('MODEL',X,y,model)

MODEL 
 medv =
-9.685463293900552 * lstat +
3.5977011197200057 * rm +
22.886466246629507


In [10]:
#@title Set Your Parameters { run: "auto" }
# lstat must stay above 0: the model takes log(lstat) as input and log(0) is
# -inf, which predict() rejects, so the slider starts at 0.5 instead of 0.
lstat_val =6 #@param {type:"slider", min:0.5, max:30, step:0.5}
rm_val =5.5 #@param {type:"slider", min:0, max:10, step:0.5}

# Build the query as a DataFrame carrying the training column names, so
# scikit-learn matches features by name instead of warning about a bare list.
query = pd.DataFrame({"lstat": [np.log(lstat_val)], "rm": [rm_val]})
medv_val = model.predict(query)
print("medv = ", np.round(medv_val[0],2), "x 1000 Dollas")

medv =  25.32 x 1000 Dollas
