In [39]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [40]:
df = pd.read_csv(r"C:\Users\sahil\OneDrive\Naresh IT Class\Data Files\housing_price_dataset.csv")

In [41]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [42]:
df.shape

(50000, 6)

In [43]:
df.columns

Index(['SquareFeet', 'Bedrooms', 'Bathrooms', 'Neighborhood', 'YearBuilt',
       'Price'],
      dtype='object')

In [44]:
df.isnull().sum()

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64

In [45]:
df.dtypes

SquareFeet        int64
Bedrooms          int64
Bathrooms         int64
Neighborhood     object
YearBuilt         int64
Price           float64
dtype: object

In [46]:
house_age = []
for i in df['YearBuilt']:
    house_age.append(2024-i)


In [47]:
df['House_age'] = house_age


In [48]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price,House_age
0,2126,4,1,Rural,1969,215355.283618,55
1,2459,3,2,Rural,1980,195014.221626,44
2,1860,2,1,Suburb,1970,306891.012076,54
3,2294,2,1,Urban,1996,206786.787153,28
4,2130,5,2,Suburb,2001,272436.239065,23


In [49]:
df.tail()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price,House_age
49995,1282,5,3,Rural,1975,100080.865895,49
49996,2854,2,2,Suburb,1988,374507.656727,36
49997,2979,5,3,Suburb,1962,384110.55559,62
49998,2596,5,2,Rural,1984,380512.685957,40
49999,1572,5,3,Rural,2011,221618.583218,13


In [50]:

df['Price1'] = df['Price']

In [51]:
df.drop('Price',inplace=True, axis=1)

In [52]:
df.rename(columns={'Price1':'Price'},inplace=True)

In [53]:
df.drop('YearBuilt', axis=1, inplace=True)

In [54]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,House_age,Price
0,2126,4,1,Rural,55,215355.283618
1,2459,3,2,Rural,44,195014.221626
2,1860,2,1,Suburb,54,306891.012076
3,2294,2,1,Urban,28,206786.787153
4,2130,5,2,Suburb,23,272436.239065


In [55]:
df['Neighborhood'].value_counts()

Neighborhood
Suburb    16721
Rural     16676
Urban     16603
Name: count, dtype: int64

#### Applying One Hot Encoding Using Pandas

In [56]:
df = pd.get_dummies(df,columns=['Neighborhood']).astype(int)

In [57]:
df.head()


Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,House_age,Price,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban
0,2126,4,1,55,215355,1,0,0
1,2459,3,2,44,195014,1,0,0
2,1860,2,1,54,306891,0,1,0
3,2294,2,1,28,206786,0,0,1
4,2130,5,2,23,272436,0,1,0


In [58]:
output_variable = df.pop('Price')
df['Price'] = output_variable

In [59]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,House_age,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban,Price
0,2126,4,1,55,1,0,0,215355
1,2459,3,2,44,1,0,0,195014
2,1860,2,1,54,0,1,0,306891
3,2294,2,1,28,0,0,1,206786
4,2130,5,2,23,0,1,0,272436


In [60]:
# sns.pairplot(data=df)

### Apply Train-Test-Split

In [61]:
x = df.drop('Price',axis=1)
y = df['Price']

In [62]:
from sklearn.model_selection import train_test_split

In [63]:
x_train, x_test, y_tarin, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Model Developement

In [64]:
def train_Linear_model(x_train,y_tarin,x_test):
    from sklearn.linear_model import LinearRegression
    lr = LinearRegression()
    lr.fit(x_train,y_tarin)
    y_prediction = lr.predict(x_test)
    return y_prediction

In [65]:
y_prediction = train_Linear_model(x_train,y_tarin,x_test)

In [66]:
y_prediction

array([218328.98824436, 135024.53577193, 255260.32886374, ...,
       310674.73762015, 200903.61356623, 244037.34852069])

### Model Evaluation

In [67]:
def model_evaluation(y_test, y_prediction):
    from sklearn.metrics import mean_squared_error, r2_score
    R2 = r2_score(y_test,y_prediction)
    MSE =mean_squared_error(y_test, y_prediction)
    RMSE = np.sqrt(MSE)
    print('R2-Score : ',R2)
    print('MSE Value : ',MSE)
    print('RMSE Value : ',RMSE)

In [68]:
model_evaluation(y_test, y_prediction)

R2-Score :  0.5755628291469783
MSE Value :  2436249404.0238967
RMSE Value :  49358.37724261097


In [69]:
test_df = pd.DataFrame()
test_df['y_actual'] = y_test
test_df['y_prediction'] = y_prediction
test_df

Unnamed: 0,y_actual,y_prediction
33553,170835,218328.988244
9427,126913,135024.535772
199,246611,255260.328864
12447,244250,257830.906930
39489,271127,277383.170424
...,...,...
28567,199265,226300.202797
25079,241869,198768.104718
18707,352184,310674.737620
15200,244830,200903.613566


### Feature Scaling 
##### using Column Transformer class of sklearn.compose package 
**we apply Column Transformer only on SquareFeet,	Bedrooms,	Bathrooms,	House_age and Price(Output/Target) columns**

In [70]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,House_age,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban,Price
0,2126,4,1,55,1,0,0,215355
1,2459,3,2,44,1,0,0,195014
2,1860,2,1,54,0,1,0,306891
3,2294,2,1,28,0,0,1,206786
4,2130,5,2,23,0,1,0,272436


##### Import the package

In [71]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

**Create object of ColumnTransformer class with StandardScale class**

In [72]:
ct = ColumnTransformer([('scale', StandardScaler(), ['SquareFeet','Bedrooms','Bathrooms','House_age','Price'])], remainder='passthrough')

**Apply ColunmTransformer to the DataFrame**

In [73]:
df = pd.DataFrame(ct.fit_transform(df), columns=['SquareFeet','Bedrooms','Bathrooms','House_age', 'Price', 'Neighborhood_Rural', 'Neighborhood_Suburb', 'Neighborhood_Urban'])

In [74]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,House_age,Price,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban
0,0.207861,0.449067,-1.220113,0.791751,-0.124398,1.0,0.0,0.0
1,0.78648,-0.446738,0.005614,0.260842,-0.391547,1.0,0.0,0.0
2,-0.25434,-1.342543,-1.220113,0.743486,1.077791,0.0,1.0,0.0
3,0.499777,-1.342543,-1.220113,-0.51139,-0.236939,0.0,0.0,1.0
4,0.214811,1.344872,0.005614,-0.752713,0.625276,0.0,1.0,0.0


In [75]:
df['Neighborhood_Rural'] = df['Neighborhood_Rural'].astype(int)
df['Neighborhood_Suburb'] = df['Neighborhood_Suburb'].astype(int)
df['Neighborhood_Urban'] = df['Neighborhood_Urban'].astype(int)

In [76]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,House_age,Price,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban
0,0.207861,0.449067,-1.220113,0.791751,-0.124398,1,0,0
1,0.78648,-0.446738,0.005614,0.260842,-0.391547,1,0,0
2,-0.25434,-1.342543,-1.220113,0.743486,1.077791,0,1,0
3,0.499777,-1.342543,-1.220113,-0.51139,-0.236939,0,0,1
4,0.214811,1.344872,0.005614,-0.752713,0.625276,0,1,0


#### Put the Output/Target Column at last 

In [77]:
output_variable = df.pop('Price')
df['Price'] = output_variable

In [78]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,House_age,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban,Price
0,0.207861,0.449067,-1.220113,0.791751,1,0,0,-0.124398
1,0.78648,-0.446738,0.005614,0.260842,1,0,0,-0.391547
2,-0.25434,-1.342543,-1.220113,0.743486,0,1,0,1.077791
3,0.499777,-1.342543,-1.220113,-0.51139,0,0,1,-0.236939
4,0.214811,1.344872,0.005614,-0.752713,0,1,0,0.625276


### Now again apply Linear Regression

In [79]:
x = df.drop('Price', axis=1)
y = df['Price']

**Train-Test Split**

In [80]:
x_train, x_test, y_tarin, y_test = train_test_split(x, y, test_size=0.2, random_state=1234)

**Model Developement**
- model developement code is save in function

In [81]:
y_prediction = train_Linear_model(x_train,y_tarin,x_test)

In [82]:
y_prediction

array([ 0.59375   ,  0.01269531, -0.50439453, ..., -0.59716797,
       -0.27001953, -0.47753906])

**Model Evalution**
- code already save in function we just call it

In [83]:
model_evaluation(y_test, y_prediction)

R2-Score :  0.5693056021386818
MSE Value :  0.430674249156448
RMSE Value :  0.6562577612161612
