In [1]:
# coding implementation of multiple linear regression

# problem: house price prediction based on size of house, number of bedrooms and distance to city center

In [2]:
# import necessary modules and packages

import numpy as np
from sklearn.linear_model import LinearRegression

In [3]:
# Training data, one record per house:
#   (size, bedrooms, distance to city, price)
house_records = [
    (1500, 3, 5, 200000),
    (2000, 4, 3, 300000),
    (1200, 2, 7, 150000),
    (1800, 3, 4, 250000),
    (2500, 4, 2, 400000),
    (1600, 3, 6, 220000),
    (2100, 4, 4, 320000),
    (1300, 2, 8, 170000),
    (1900, 3, 3, 280000),
    (2200, 4, 5, 350000),
]

# X: predictors (independent variables) -- size, bedrooms and distance to city
X = np.array([record[:3] for record in house_records])

# y: predicted output (dependent variable) -- price of each house
y = np.array([record[3] for record in house_records])


In [4]:
# Build the linear regression estimator and fit it on X and y.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line keeps the fitted model as the cell output.
model = LinearRegression().fit(X, y)
model

In [5]:
# Read the fitted parameters of the multiple linear regression equation:
#   y = intercept + coefficient_1 * x_1 + coefficient_2 * x_2 + ... + coefficient_k * x_k
# coef_ holds one coefficient per column of X, in column order.
intercept, coefficients = model.intercept_, model.coef_

In [6]:
# Put the fitted values into a readable equation.
# Each coefficient is paired with its feature name (same order as the columns
# of X) and its sign is emitted as the operator, so a negative coefficient
# prints as "- 1036.42(bedrooms)" instead of "+ -1036.42(bedrooms)".
feature_names = ["size", "bedrooms", "distance to city center"]
terms = "".join(
    f" {'-' if coefficient < 0 else '+'} {abs(coefficient):.2f}({name})"
    for coefficient, name in zip(coefficients, feature_names)
)
equation = f"Price = {intercept:.2f}{terms}"

print("finalized multiple linear regression: ")
print(equation)

finalized multiple linear regression: 
Price = -136025.80 + 212.46(size) + -1036.42(bedrooms) + 3998.48(distance to city center)


In [7]:
# Predict the price of a new house with the model fitted on the dataset above.
# The single row uses the same column order as X: size, bedrooms, distance.
new_house = np.array([[2000, 3, 4]])
predicted_price = model.predict(new_house)

# predict() returns one value per input row; show the only one as currency.
print(f"Predicted Price for the New House: ${predicted_price[0]:,.2f}")



Predicted Price for the New House: $301,775.42


In [8]:
# OLS (ordinary least squares) implementation

In [9]:
# import necessary modules and setup input and output variables.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm  # statsmodels library is used to measure OLS

from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'statsmodels'

In [None]:
# Simulate a simple regression dataset with a known ground truth:
#   y = 3 * X + 2 + epsilon   (slope 3 and intercept 2 are considered as given)

np.random.seed(42)  # fixed seed so every run draws the same sample
X = np.random.rand(30, 1) * 10  # 30 predictor values, uniform on [0, 10)
# The error term has to be zero-mean, otherwise its mean is absorbed into the
# fitted intercept. randn draws standard-normal noise (scaled by 2 here);
# rand would draw uniform noise on [0, 1), which after scaling has mean 1 and
# would shift the recoverable intercept from 2 to about 3.
epsilon = np.random.randn(30, 1) * 2
y = 3 * X + 2 + epsilon

In [None]:
# Gather predictor and response into one DataFrame, one row per observation.
# X and y are (n, 1) column arrays, so each is collapsed to 1-D for its column.
data = pd.DataFrame({"X": X.ravel(), "y": y.ravel()})

In [None]:
# Split the dataset: 80% of the rows for training, 20% held out for testing.
# The predictor is selected with a list so it stays a 2-D DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ["X"]],
    data.loc[:, "y"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Add a constant column to the training predictors so the model can estimate
# an intercept. The keyword values are add_constant's defaults, spelled out:
# the constant goes first, and an already-present constant is left alone.
X_train = sm.add_constant(X_train, prepend=True, has_constant="skip")

In [None]:
# Estimate the parameters with ordinary least squares.
# endog is the response, exog the predictors (including the constant column).
ols = sm.OLS(endog=y_train, exog=X_train)
model = ols.fit()  # note: `model` is the fitted results object
model

In [None]:
# Print the summary of the regression
ols_summary = model.summary()
print(ols_summary)

In [None]:
# Evaluate predictions on the test set. The test predictors get the same
# constant column as the training predictors before they are passed to predict().

X_test = sm.add_constant(X_test)
y_pred = model.predict(X_test)


In [None]:
# Visualize predictions against actual values on the test set

fig, ax = plt.subplots()
ax.scatter(X_test['X'], y_test, label="Actual")
ax.scatter(X_test['X'], y_pred, label="Predicted", marker='x')
ax.set_xlabel("X")
ax.set_ylabel("y")
ax.set_title("Actual vs Predicted Values")
ax.legend()
plt.show()

In [None]:
# Practice: draw a scatter diagram for X and Y using matplotlib
import matplotlib.pyplot as plt2
import numpy as np
import pandas as pd

# Named *_values rather than `input` / `output`, so the built-in input()
# function is not shadowed for the rest of the kernel session.
input_values = [21, 32, 11, 40, 20, 56, 44]
output_values = [1000, 670, 990, 1126, 1566, 874, 1091]

# Second hand-made series, drawn with a different marker for comparison.
pred_input = [19, 30, 13, 24, 36, 31, 20]
pred_output = [798, 1300, 1178, 595, 930, 1250, 1040]

plt2.scatter(input_values, output_values, label="Actual")
plt2.scatter(pred_input, pred_output, label="Predicted", marker="*")
plt2.xlabel("Input")
plt2.ylabel("Output")
plt2.title("Input vs Output")
plt2.legend()
plt2.show()


In [None]:
# Feature scaling and normalization
# Suppose we have two input variables whose value ranges are very different.

Number_of_Bedrooms = list(range(1, 6))  # 1, 2, 3, 4, 5
Square_Footage = [800, 1200, 1500, 2000, 2500]

# The second variable spans a much wider range than the first one. Its raw
# magnitude can dominate scale-sensitive methods and makes the fitted
# coefficients hard to compare, so both are brought onto a common scale.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler rescales every column independently to [0, 1], so both features
# are stacked into one (n_samples, 2) matrix and scaled with a single fit.
# Calling fit_transform twice on one scaler would overwrite the bedroom fit,
# leaving `scaler` unable to transform or invert bedroom values later.
unscaled_features = np.column_stack([Number_of_Bedrooms, Square_Footage])
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(unscaled_features)

# Keep each scaled feature as its own (n_samples, 1) column.
Number_of_Bedrooms_scaled = scaled_features[:, [0]]
Square_Footage_scaled = scaled_features[:, [1]]
Number_of_Bedrooms_scaled

In [None]:
# Model evaluation and model validation
# mean_squared_error(MSE) and R-square(coefficient of determination)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt3
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Simulate the evaluation dataset with a known ground truth:
#   y = 4 + 3 * X + epsilon   (intercept 4, slope 3)
np.random.seed(42)  # fixed seed so every run draws the same sample
X = np.random.rand(50, 1) * 10  # 50 predictor values, uniform on [0, 10)
# Zero-mean Gaussian noise (scaled by 2). Uniform rand noise on [0, 1) would,
# after scaling, have mean 1 and shift the recoverable intercept from 4 to
# about 5.
epsilon = np.random.randn(50, 1) * 2
y = 4 + 3 * X + epsilon



In [None]:
# Split the data: 70% for training, 30% held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Train the model with the training data.
# LinearRegression estimates the intercept itself (fit_intercept=True is the
# default, shown explicitly), so no constant column is added to the inputs.

model = LinearRegression(fit_intercept=True)
model.fit(X_train, y_train);  # trailing ';' suppresses the estimator repr

In [None]:
# Predict the held-out test inputs with the model trained in the previous step

y_pred = model.predict(X_test)
y_pred

In [None]:
# Evaluate the model on the test set:
#   MSE -- mean squared error between actual and predicted values
#   R2  -- coefficient of determination

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

for label, value in (
    ("mean squared error: ", mse),
    ("Coefficient of determination: ", r2),
):
    print(label, value)

In [None]:
# Visualize the test data: actual outputs versus the model's predictions

fig, ax = plt3.subplots()
ax.scatter(X_test, y_test, label="Actual", color="black")
ax.scatter(X_test, y_pred, label="Predicted", color="blue", linewidth=3)
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_title("Actual output vs Predicted output")
ax.legend()
plt3.show()

In [None]:
# Prints a fixed status message; this cell does not itself perform or verify a push.
print("notebook code has been pushed to github successfully")