<a href="https://colab.research.google.com/github/Satya-1729/Python_machine_learning/blob/main/evaluating_the_models_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
# Evaluating the different regression models

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import Dense
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Show NumPy arrays with two digits of precision when printed.
np.set_printoptions(precision=2)

# Quiet TensorFlow: log errors only and set AutoGraph verbosity to 0.
tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(0)

# Read the dataset from the hard-coded /content path (the Colab file area).
df = pd.read_csv('/content/House Price Prediction Dataset.csv')
df.head()  # result is discarded here: it is not the last expression of the cell

# Input feature: the column at position 1. Target: the last column.
x = df.iloc[:, 1]
print(x[:5], "\n")
y = df.iloc[:, -1]

print(x.shape, "\n", y.shape)

# A single column is 1-D; append a trailing axis so both become (n, 1) arrays,
# the 2-D layout the scikit-learn estimators below are given.
x = np.asarray(x)[:, np.newaxis]
y = np.asarray(y)[:, np.newaxis]

print(x.shape, "\n", y.shape, "\n\n")

# Step 1: keep 60% of the rows for training and set the other 40% aside.
x_train, x_rest, y_train, y_rest = train_test_split(
    x, y, test_size=0.4, random_state=42
)

# Step 2: cut the 40% that was set aside in half, giving a test set and a
# cross-validation set of equal size.
x_test, x_cv, y_test, y_cv = train_test_split(
    x_rest, y_rest, test_size=0.5, random_state=42
)

# The intermediate hold-out arrays are not needed after the second split.
del x_rest, y_rest

# Report the shapes of the three resulting partitions.
print(x_train.shape, "\n", y_train.shape, "\n")
print(x_test.shape, "\n", y_test.shape, "\n")
print(x_cv.shape, "\n", y_cv.shape)

# Fit the scaler on the training inputs only, then apply that same fitted
# transform to all three partitions so the test and cross-validation sets are
# standardised with the training set's mean and standard deviation.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)
x_cv_scaled = scaler.transform(x_cv)

# Preview the first few standardised training inputs.
print(x_train_scaled[:5], "\n")
# Baseline model: ordinary linear regression on the single scaled feature.
model = LinearRegression()
model.fit(x_train_scaled, y_train)

# Predict on the training set once and reuse the result for both checks below.
yhat = model.predict(x_train_scaled)

# Training cost via scikit-learn. The value is divided by 2, so the number
# printed is half of the mean squared error.
mse = mean_squared_error(y_train, yhat) / 2
print(f"Mean Squared Error:, {mse:.2f}")

# The same cost computed by hand as a cross-check:
#   cost = sum((yhat_i - y_i)^2) / (2 * m)
# Both arrays are flattened first so the subtraction is strictly element-wise
# and can never broadcast into an (m, m) matrix if their shapes differ.
cost = np.sum((np.ravel(yhat) - np.ravel(y_train)) ** 2)
mse_1 = cost / (2 * len(yhat))
print(f"Total Error: {mse_1 : .2f}")

# Same metric on the cross-validation set, which the model was not fitted on.
mse = mean_squared_error(y_cv, model.predict(x_cv_scaled)) / 2
print(f"Mean Squared Error:, {mse:.2f}")






0    1360
1    4272
2    3592
3     966
4    4926
Name: Area, dtype: int64 

(2000,) 
 (2000,)
(2000, 1) 
 (2000, 1) 


(1200, 1) 
 (1200, 1) 

(400, 1) 
 (400, 1) 

(400, 1) 
 (400, 1)
[[ 1.01]
 [-1.38]
 [ 1.46]
 [-0.75]
 [ 1.7 ]] 

Mean Squared Error:, 37726001465.90
Total Error:  37726001465.90
Mean Squared Error:, 39915237841.61


In [23]:
# Expand the scaled feature with polynomial terms up to degree 2. No bias
# column is generated (include_bias=False).
poly = PolynomialFeatures(degree=2, include_bias=False).fit(x_train_scaled)

# Apply the mapping fitted on the training inputs to every partition.
x_train_poly = poly.transform(x_train_scaled)
x_test_poly = poly.transform(x_test_scaled)
x_cv_poly = poly.transform(x_cv_scaled)

# Shapes of the expanded partitions.
print(x_train_poly.shape, "\n")
print(x_test_poly.shape, "\n")
print(x_cv_poly.shape, "\n")

# First five expanded training rows.
print(x_train_poly[0:5], "\n")



# Standardise the polynomial features. The scaler is fitted on the training
# partition and then reused unchanged for the test and cross-validation sets.
scaler_poly = StandardScaler().fit(x_train_poly)
x_train_poly_scaled = scaler_poly.transform(x_train_poly)
x_test_poly_scaled = scaler_poly.transform(x_test_poly)
x_cv_poly_scaled = scaler_poly.transform(x_cv_poly)

# Linear regression on the standardised degree-2 features.
poly_model = LinearRegression().fit(x_train_poly_scaled, y_train)

# Training cost (mean squared error divided by 2).
yhat_poly = poly_model.predict(x_train_poly_scaled)
mse = mean_squared_error(y_train, yhat_poly) / 2
print(f"Mean Squared Error for training dataset in poly :, {mse:.2f}")

# Cross-validation cost, checked before the test set is touched.
yhat_poly_cv = poly_model.predict(x_cv_poly_scaled)
mse = mean_squared_error(y_cv, yhat_poly_cv) / 2
print(f"Mean Squared Error for cross validation poly :, {mse:.2f}")


(1200, 2) 

(400, 2) 

(400, 2) 

[[ 1.01  1.01]
 [-1.38  1.91]
 [ 1.46  2.14]
 [-0.75  0.57]
 [ 1.7   2.87]] 

Mean Squared Error for training dataset in poly :, 37723299158.37
Mean Squared Error for cross validation poly :, 39939688677.69


In [37]:
# Fit one polynomial regression per degree (1 through 10) on the same data
# split and record the training and cross-validation cost of each.

# Per-degree results; position k in every list belongs to degree k + 1.
error_trains = []
error_cvs = []
poly_degree = []  # fitted PolynomialFeatures transformers
scalers = []      # fitted StandardScaler objects
models = []       # fitted LinearRegression models

for i in range(1, 11):
    # Feature expansion for this degree, fitted on the training inputs and
    # applied to both the training and cross-validation inputs.
    poly = PolynomialFeatures(degree=i, include_bias=False).fit(x_train_scaled)
    x_train_poly_mapped = poly.transform(x_train_scaled)
    x_cv_poly_mapped = poly.transform(x_cv_scaled)

    # Standardise the expanded features using training-set statistics only.
    scaler_poly = StandardScaler().fit(x_train_poly_mapped)
    x_train_poly_scaled = scaler_poly.transform(x_train_poly_mapped)
    x_cv_poly_scaled = scaler_poly.transform(x_cv_poly_mapped)

    # Fit the regression for this degree.
    model = LinearRegression().fit(x_train_poly_scaled, y_train)

    # Cost (mean squared error divided by 2) on both partitions.
    yhat_poly = model.predict(x_train_poly_scaled)
    error_train = mean_squared_error(y_train, yhat_poly) / 2
    yhat_poly_cv = model.predict(x_cv_poly_scaled)
    error_cv = mean_squared_error(y_cv, yhat_poly_cv) / 2

    # The result lists start empty, so each fitted object and error is
    # appended; they are looked up by index again when a degree is selected.
    poly_degree.append(poly)
    scalers.append(scaler_poly)
    models.append(model)
    error_trains.append(error_train)
    error_cvs.append(error_cv)

print(poly_degree, "\n")
print(scalers, "\n")
print(models, "\n")
print(error_trains, "\n")
print(error_cvs, "\n")



[PolynomialFeatures(degree=1, include_bias=False), PolynomialFeatures(include_bias=False), PolynomialFeatures(degree=3, include_bias=False), PolynomialFeatures(degree=4, include_bias=False), PolynomialFeatures(degree=5, include_bias=False), PolynomialFeatures(degree=6, include_bias=False), PolynomialFeatures(degree=7, include_bias=False), PolynomialFeatures(degree=8, include_bias=False), PolynomialFeatures(degree=9, include_bias=False), PolynomialFeatures(degree=10, include_bias=False)] 

[StandardScaler(), StandardScaler(), StandardScaler(), StandardScaler(), StandardScaler(), StandardScaler(), StandardScaler(), StandardScaler(), StandardScaler(), StandardScaler()] 

[LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()] 

[37726001465.9021, 37723299158.37444, 37658969815.62473, 37621769439.26818, 37617583477.02773, 37574024046.15376, 375666

In [47]:
# Show NumPy arrays with two digits of precision when printed.
np.set_printoptions(precision=2)

# Choosing the best model.

# Degree with the lowest training error. np.argmin returns a 0-based index,
# hence the +1. Printed for comparison only; it is not used for selection.
degree_use = np.argmin(error_trains) + 1
print(degree_use)

# Degree with the lowest cross-validation error: this is the one selected.
degree_use_cv = np.argmin(error_cvs) + 1
print(degree_use_cv)

# Index of the selected degree in the per-degree lists built during the sweep.
best_idx = degree_use_cv - 1

# Push the test inputs through the selected degree's fitted pipeline: first the
# polynomial mapping, then the scaler fitted on the training data.
x_test_poly_mapped = poly_degree[best_idx].transform(x_test_scaled)
x_test_poly_scaled = scalers[best_idx].transform(x_test_poly_mapped)

# Predictions of the selected model on the test set.
yhat = models[best_idx].predict(x_test_poly_scaled)

# Test cost (mean squared error divided by 2). Arguments are passed in
# scikit-learn's (y_true, y_pred) order, like every other call in the notebook.
error_test_f = mean_squared_error(y_test, yhat) / 2
print(f"{error_test_f:.2f}")






10
6
38258274364.81
