### Simple regression model to predict house prices

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so a plain
# import fails on current releases. Use the built-in loader when it exists;
# otherwise rebuild the same data/target/feature_names structure from the original
# StatLib file, following the recipe given in scikit-learn's deprecation notice.
try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    def load_boston():
        """Fallback loader for the Boston housing data.

        Downloads the raw StatLib file (network access required) and returns a
        Bunch exposing `data`, `target` and `feature_names`, which are the
        attributes this notebook reads from the dataset object.
        """
        raw_df = pd.read_csv(
            "http://lib.stat.cmu.edu/datasets/boston",
            sep=r"\s+",
            skiprows=22,
            header=None,
        )
        # Each record is wrapped over two physical lines in the raw file:
        # even rows carry the first 11 values, odd rows the remaining
        # 2 features followed by the target.
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]
        feature_names = np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ])
        return Bunch(data=data, target=target, feature_names=feature_names)

dataset = load_boston()

In [None]:
# Show which top-level entries the loaded dataset object exposes.
dataset_keys = dataset.keys()
print(dataset_keys)

In [None]:
# Wrap the feature matrix in a DataFrame, labelling columns with the dataset's feature names.
df = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
)
df.head()

In [None]:
# Attach the target values as a MEDV column alongside the features.
df = df.assign(MEDV=dataset.target)
df.head()

##### Data preprocessing

In [None]:
## Count the missing values in each column
df.isna().sum()

##### Exploratory Data Analysis
Visualize the relationship between the target and the other features

In [None]:
# Default figure size for axes-level plots drawn later in the notebook.
sns.set(rc={'figure.figsize':(11.7,8.27)})

# displot is a figure-level function: it builds its own figure and sizes it from
# `height` and `aspect`, so the rc figure.figsize above does not apply to it.
# Pass the intended 11.7 x 8.27 size explicitly.
sns.displot(df['MEDV'], bins=30, height=8.27, aspect=11.7 / 8.27)
plt.show()

In [None]:
# Pairwise correlations between all columns, rounded to 2 decimals so the
# annotations in the heatmap stay readable.
correlation_matrix = df.corr().round(2)
sns.heatmap(data=correlation_matrix, annot=True)
# Render the figure explicitly, as the distribution plot cell does, so the cell
# does not end by echoing the Axes object's text repr.
plt.show()

Select the features that are highly correlated with the target variable.
While selecting features, check for multicollinearity: do not train on features that are strongly correlated with each other.

In [None]:
# Scatter MEDV against each selected feature, one panel per feature in a single row.
features = ['LSTAT', 'RM']
target = df['MEDV']

# squeeze=False keeps `axes` two-dimensional regardless of how many features are listed.
fig, axes = plt.subplots(1, len(features), figsize=(20, 5), squeeze=False)

for ax, col in zip(axes[0], features):
    ax.scatter(df[col], target, marker='o')
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('MEDV')

Prepare the data for training the model

In [None]:
# Select the two predictors directly from df instead of rebuilding the frame
# through np.c_: this keeps df's index and column dtypes as they are.
# .copy() makes X an independent frame, so later edits to X cannot affect df.
X = df[['LSTAT', 'RM']].copy()
Y = df['MEDV']

In [None]:
## Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=5,
)

# Print the shape of every partition, one per line.
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression on the training split. fit() returns the estimator
# itself, so construction and fitting can be chained; the trailing bare name
# keeps the fitted model as the cell's displayed output.
lin_model = LinearRegression().fit(X_train, Y_train)
lin_model

##### Model Evaluation

In [None]:
def _report_performance(header, y_true, y_pred):
    """Print RMSE and R2 for one data split under `header`; return them as (rmse, r2)."""
    split_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    split_r2 = r2_score(y_true, y_pred)
    for line in (
        header,
        "--------------------------------------",
        'RMSE is {}'.format(split_rmse),
        'R2 score is {}'.format(split_r2),
    ):
        print(line)
    return split_rmse, split_r2


# model evaluation for training set
pred = lin_model.predict(X_train)
rmse, r2 = _report_performance("The model performance for training set", Y_train, pred)
print("\n")

# model evaluation for testing set
y_test_predict = lin_model.predict(X_test)
rmse, r2 = _report_performance("The model performance for testing set", Y_test, y_test_predict)