# Micro Project

In [1]:

# Predict the price of a house with high confidence (95% confidence).
# Adjust for price inflation by multiplying the values by 30.
# The house should have 4 rooms.
# The area where the house is located should have a school with a PT ratio of 10.
# The house should be near the Charles River.
# Inputs must be given from the keyboard (KB).
# Negative and zero numbers of rooms are not allowed.
# The same applies to the PT ratio.
# Charles River: choice (yes/no); set yes = 1 and no = 0.



# Date of discussion: tentatively 3rd September 2020.
# Please don't send me code.
# Please don't ask me doubts before 3rd September 2020.

## Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

## Importing Dataset

In [3]:
# Load the Boston housing data and assemble a single frame holding the
# feature columns plus the target under the name 'PRICE'.
# NOTE(review): load_boston is not available in newer scikit-learn releases;
# confirm the pinned version before re-running this notebook.
boston_dataset = load_boston()
data = pd.DataFrame(
    boston_dataset.data,
    columns=boston_dataset.feature_names,
).assign(PRICE=boston_dataset.target)
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [4]:
# Report how many rows the assembled frame contains.
print('\nNo.Of Entries in DataFrame :', len(data))


No.Of Entries in DataFrame : 506


## Data Normalisation & Splitting

In [5]:
# Target: natural log of PRICE, so the models below are fitted in log-price space.
prices_log = np.log(data['PRICE'])

# Predictors: every column except the target.
features = data.drop(columns=['PRICE'])

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    prices_log,
    test_size=0.20,
    random_state=42,
)

## Linear Regression

In [7]:
# Fit an ordinary least-squares model on the training split (target is log price).
regr = LinearRegression().fit(X_train, y_train)

# One row per feature, holding its fitted coefficient.
Coef = pd.DataFrame({'Coef': regr.coef_}, index=X_train.columns)
print(Coef)

train_r2 = regr.score(X_train, y_train)  # R^2 on the training data
print('\nIntercepct', regr.intercept_)
print('\nTrain_Score(X_train, y_train) :', train_r2)

             Coef
CRIM    -0.009679
ZN       0.000757
INDUS    0.003057
CHAS     0.096207
NOX     -0.727261
RM       0.113095
AGE     -0.000139
DIS     -0.048944
RAD      0.011139
TAX     -0.000505
PTRATIO -0.036989
B        0.000579
LSTAT   -0.028321

Intercepct 3.840920309917581

Train_Score(X_train, y_train) : 0.795714923175866


## Prediction

In [8]:
# Predict log prices for the held-out rows and score them with R^2.
y_hat = regr.predict(X_test)
test_r2 = r2_score(y_test, y_hat)

print('\nTest_Score(y_test,y_hat) :', test_r2)


Test_Score(y_test,y_hat) : 0.7462724975382733


## Statsmodels API

In [9]:
# statsmodels' OLS does not add an intercept by itself, so prepend a constant column.
X_incl_const_1 = sm.add_constant(X_train)

# Model_1: all features plus the intercept, fitted on the training split.
model_1 = sm.OLS(endog=y_train, exog=X_incl_const_1)
results_1 = model_1.fit()

summary_1 = results_1.summary()
print('\nSummary of Model_1 >>>>\n', summary_1)


Summary of Model_1 >>>>
                             OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.796
Model:                            OLS   Adj. R-squared:                  0.789
Method:                 Least Squares   F-statistic:                     116.9
Date:                Wed, 04 Nov 2020   Prob (F-statistic):          1.35e-125
Time:                        09:50:12   Log-Likelihood:                 106.78
No. Observations:                 404   AIC:                            -185.6
Df Residuals:                     390   BIC:                            -129.5
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.8409     

In [10]:
# Variance Inflation Factor for every column of Model_1's design matrix
# (position 0 is the constant column added by sm.add_constant).
exog_1 = X_incl_const_1.values
vif_1 = [
    variance_inflation_factor(exog=exog_1, exog_idx=col_pos)
    for col_pos in range(exog_1.shape[1])
]

print('\nVIF_1\n', vif_1)


VIF_1
 [580.7472632659341, 1.7131869906128505, 2.465630718663123, 3.8778553502602815, 1.0966737120634569, 4.469150159170631, 1.9478087495837588, 2.9899478376482787, 4.16857837354429, 7.658315779148442, 8.943301431814218, 1.851448407067042, 1.3251213980906684, 2.818045379538575]


### Creating Model_2 by Dropping Features from Model_1 Using P_values>0.05

In [11]:
# Model_2: refit after removing every feature whose (rounded) p-value in
# Model_1 exceeds 0.05.
P_values = round(results_1.pvalues, 3)

# Select the features to drop by *label*. Integer keys on a label-indexed
# Series rely on a deprecated positional fallback, so iterate over
# (label, value) pairs instead. The intercept ('const') is never dropped,
# whatever its p-value, so Model_2 always keeps an intercept term.
insignificant_features = [
    name for name, p_value in P_values.items()
    if name != 'const' and p_value > 0.05
]

# Build the reduced design matrix in one step, without in-place mutation.
X_incl_const_2 = sm.add_constant(X_train).drop(columns=insignificant_features)

model_2 = sm.OLS(y_train, X_incl_const_2)
results_2 = model_2.fit()

print('\nSummary of Model_2 >>>>\n', results_2.summary())


Summary of Model_2 >>>>
                             OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.794
Model:                            OLS   Adj. R-squared:                  0.789
Method:                 Least Squares   F-statistic:                     151.9
Date:                Wed, 04 Nov 2020   Prob (F-statistic):          2.66e-128
Time:                        09:50:12   Log-Likelihood:                 105.49
No. Observations:                 404   AIC:                            -189.0
Df Residuals:                     393   BIC:                            -145.0
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.8528     

In [12]:
# Variance Inflation Factor for every column of Model_2's design matrix,
# in the same order as X_incl_const_2.columns.
exog_2 = X_incl_const_2.values
vif_2 = [
    variance_inflation_factor(exog=exog_2, exog_idx=col_pos)
    for col_pos in range(exog_2.shape[1])
]

print('\nVIF_2\n',vif_2)


VIF_2
 [576.4258577750066, 1.6995063324698798, 1.0833550411756299, 3.7973870197322683, 1.8066696516255718, 2.619209863656011, 7.090392330230745, 7.078572054426567, 1.5665696655442751, 1.3186011284107795, 2.551020818165935]


### Creating Model_3 by Dropping Features from Model_2 Using VIF>10

In [13]:
# Remove the intercept column and every feature whose VIF exceeds 10.
#
# vif_2[k] belongs to the k-th column of X_incl_const_2 as it was when vif_2
# was computed. The names to drop are therefore resolved up front, while the
# columns and vif_2 are still aligned; dropping columns one at a time inside a
# positional loop would shift later columns and pair them with the wrong VIF.
#
# 'const' is removed explicitly rather than relying on its VIF being large:
# the following cells re-add it with sm.add_constant and use
# X_incl_const_2.columns to select columns from X_test, which has no 'const'.
#
# The length guard makes the cell safe to re-run: once columns have been
# removed, vif_2 no longer lines up with the frame and nothing more is dropped.
if len(vif_2) == X_incl_const_2.shape[1]:
    cols_to_drop = [
        col for col, vif in zip(X_incl_const_2.columns, vif_2)
        if col == 'const' or vif > 10
    ]
    X_incl_const_2 = X_incl_const_2.drop(columns=cols_to_drop)

In [14]:
# Model_3: refit on the columns left in X_incl_const_2, with a constant column
# prepended for the intercept.
X_incl_const_3 = sm.add_constant(X_incl_const_2)

model_3 = sm.OLS(endog=y_train, exog=X_incl_const_3)
results_3 = model_3.fit()

summary_3 = results_3.summary()
print('\nSummary of Model_3 >>>>\n', summary_3)


Summary of Model_3 >>>>
                             OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.794
Model:                            OLS   Adj. R-squared:                  0.789
Method:                 Least Squares   F-statistic:                     151.9
Date:                Wed, 04 Nov 2020   Prob (F-statistic):          2.66e-128
Time:                        09:50:12   Log-Likelihood:                 105.49
No. Observations:                 404   AIC:                            -189.0
Df Residuals:                     393   BIC:                            -145.0
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.8528     

In [15]:
# Variance Inflation Factor for every column of Model_3's design matrix,
# in the same order as X_incl_const_3.columns.
exog_3 = X_incl_const_3.values
vif_3 = [
    variance_inflation_factor(exog=exog_3, exog_idx=col_pos)
    for col_pos in range(exog_3.shape[1])
]

print('\nVIF_3\n',vif_3)


VIF_3
 [576.4258577750066, 1.6995063324698798, 1.0833550411756299, 3.7973870197322683, 1.8066696516255718, 2.619209863656011, 7.090392330230745, 7.078572054426567, 1.5665696655442751, 1.3186011284107795, 2.551020818165935]


## Prediction Using Optimized Model "Model_3"

In [16]:
# Build Model_3's test design matrix under its own name instead of rebinding
# X_test: the earlier LinearRegression cell calls regr.predict(X_test) and
# needs the full feature set, so overwriting X_test here would make that cell
# fail when re-run.
X_test_3 = sm.add_constant(X_test[X_incl_const_2.columns])

# Predicted log prices for the held-out rows, scored against the true log prices.
Pred_y_test = results_3.predict(X_test_3)

print('\nOptimised_Test_Score(y_test, Pred_y_test) :', r2_score(y_test, Pred_y_test))


Optimised_Test_Score(y_test, Pred_y_test) : 0.7430764959352337


## Getting Required Data
### Please enter your requirements at the prompts of the cell below

In [17]:
def _ask_positive_number(prompt, ndigits):
    """Prompt until the user types a finite number that is > 0 after rounding.

    Parameters
    ----------
    prompt : str
        Text shown to the user.
    ndigits : int
        Number of decimals the answer is rounded to.

    Returns
    -------
    float
        The rounded, strictly positive value.
    """
    while True:
        raw = input(prompt)
        try:
            value = round(float(raw), ndigits)
        except ValueError:
            print('\nplease enter a number')
            continue
        # Checked after rounding so that e.g. 0.04 -> 0.0 is rejected too;
        # isfinite rules out 'inf' and 'nan', which float() accepts.
        if np.isfinite(value) and value > 0:
            return value
        print('\nvalue must be greater than zero')


def _ask_yes_no(prompt):
    """Prompt until the user answers 'yes' or 'no' (case and padding ignored).

    Returns
    -------
    str
        Either 'yes' or 'no', lower-cased.
    """
    while True:
        answer = input(prompt).strip().lower()
        if answer in ('yes', 'no'):
            return answer
        print('\nplease answer \'yes\' or \'no\'')


# The project brief forbids zero or negative rooms and PT ratio, and CHAS must
# be a yes/no choice, so every answer is validated and re-asked until usable.
PT_ratio = _ask_positive_number('\nenter required PT_RATIO :', 1)
Rooms    = _ask_positive_number('\nenter required no.of Rooms :', 3)
Chas     = _ask_yes_no('\nDo Tract should bound Charle\'s River ? [yes/no] : ')

# Encode the choice the way the CHAS column does: yes = 1, no = 0.
# chas is always a scalar, so it can be placed directly in the feature vector.
chas = 1.0 if Chas == 'yes' else 0.0


enter required PT_RATIO :10

enter required no.of Rooms :4

Do Tract should bound Charle's River ? [yes/no] : yes


### Using the Mean Values for Other Features

In [18]:
# Build one feature row for Model_3, in the column order of its design matrix:
# the three user-supplied values replace PTRATIO, RM and CHAS, and every other
# column (including the constant) takes its mean over X_incl_const_3.
data_3 = X_incl_const_3
user_values = {'PTRATIO': PT_ratio, 'RM': Rooms, 'CHAS': chas}

X_new = [
    user_values[col] if col in user_values else data_3[col].mean()
    for col in data_3.columns
]

### Predicting Your Output

In [19]:
# Model_3 predicts the log of PRICE; take the single prediction for X_new.
log_price_predictions = results_3.predict(X_new)
Predicted_Log_Price = log_price_predictions[0]

# Undo the log and scale by 30000.
# NOTE(review): 30000 presumably combines PRICE being in thousands of dollars
# with the x30 inflation factor from the project brief - confirm.
point_price = round((30000*np.e**Predicted_Log_Price),3)
print(f'\nPoint Predicted Price for Your Requirements : ${point_price}')


Point Predicted Price for Your Requirements : $738021.91


### Prediction With High Confidence Interval "95%"

In [20]:
# Half-width of the interval in log-price units: two residual standard errors
# (the RMSE of Model_3, i.e. sqrt of its residual mean squared error).
# NOTE(review): +/- 2 RMSE is only an approximate 95% band; it does not account
# for uncertainty in the fitted coefficients - confirm this is acceptable.
log_margin = 2*np.sqrt(results_3.mse_resid)

# Shift the log prediction down/up by the margin, then convert back to dollars
# with the same 30000 scaling used for the point prediction.
lower_bound = 30000*np.e**(Predicted_Log_Price - log_margin)
upper_bound = 30000*np.e**(Predicted_Log_Price + log_margin)

print(f'\nFor Your Requirements Price of a House Ranges From : ${round(lower_bound,3)} To : ${round(upper_bound,3)}')


For Your Requirements Price of a House Ranges From : $505759.611 To : $1076947.088
