### Normal Equation Approach for Linear Regression on the Real Estate Data Set

In [10]:
# import the libraries used to load, split and model the real_estate data set
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# load the real_estate data set from its CSV file into a DataFrame
df = pd.read_csv('real_estate.csv')

# column positions inside the CSV:
#   column 0      -> serial number of the row (deliberately left out of the features)
#   columns 1..6  -> independent variables
#   column 7      -> dependent variable
FEATURE_COLUMNS = slice(1, 7)
TARGET_COLUMN = 7

# pull the independent and dependent variables out as plain NumPy arrays
X = df.iloc[:, FEATURE_COLUMNS].values
Y = df.iloc[:, TARGET_COLUMN].values

In [11]:
# hold out 20% of the samples as the test set; the fixed random_state makes
# the shuffle (and therefore the split) repeatable across runs
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=78,
)

In [12]:
# Fit ordinary least squares on the training split.
# np.linalg.lstsq returns the theta that satisfies the normal equation
# (X^T X) theta = X^T Y, but it does so without explicitly inverting X^T X.
# Forming the inverse with np.linalg.inv is numerically fragile and raises
# LinAlgError when X^T X is singular (e.g. collinear feature columns).
# NOTE(review): X_train is used as-is, with no column of ones appended, so the
# model has no intercept term -- confirm this is intended before relying on
# the R2 score below.
theta = np.linalg.lstsq(X_train, Y_train, rcond=None)[0]

# predict the dependent variable for the held-out test samples
Y_predicted = X_test.dot(theta)

# squared residuals on the test set, computed once and shared by the
# mean squared error and the residual sum of squares
squared_errors = (Y_test - Y_predicted) ** 2

# mean squared error: average squared residual
mean_square_error = np.mean(squared_errors)
# residual sum of squares: total squared residual
residual_sum_of_squares = np.sum(squared_errors)
# total sum of squares: squared deviation of the test targets from their mean
total_sum_of_squares = np.sum((Y_test - np.mean(Y_test)) ** 2)
# R2 score: 1 - RSS / TSS
r2_score = 1 - (residual_sum_of_squares / total_sum_of_squares)

# report the evaluation metrics of the model
print('Mean Squared Error: ', mean_square_error)
print('Residual Sum of Squares: ', residual_sum_of_squares)
print('Total Sum of Squares: ', total_sum_of_squares)
print('R2 Score: ', r2_score)

Mean Squared Error:  45.40813976772222
Residual Sum of Squares:  3768.875600720944
Total Sum of Squares:  15774.027951807228
R2 Score:  0.7610708176608028
