In [None]:
# Core data / plotting stack used throughout the notebook.
import pandas as pd
import numpy as np
# `plt` must alias the pyplot interface; the bare `matplotlib` package does not
# expose plotting functions such as plt.plot / plt.subplots.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer



# Data Acquisition

In [None]:
# Read the housing CSV from its upload location and preview the first rows.
housing_csv_path = "/content/1553768847-housing.csv"
df = pd.read_csv(housing_csv_path)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [None]:
# List every column label, one per line.
for column_label in df.columns.tolist():
    print(column_label)

longitude
latitude
housing_median_age
total_rooms
total_bedrooms
population
households
median_income
ocean_proximity
median_house_value


In [None]:
# Print the dtype and non-null count of every column (reveals columns with missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [None]:
# Count the missing entries in each column.
missing_per_column = df.isnull().sum()
missing_per_column

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [None]:
# Show the distinct categories present in the only text column.
proximity_levels = df.loc[:, "ocean_proximity"].unique()
proximity_levels

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

# Separating Categorical and Numerical Columns

In [None]:
# Split the column labels by dtype so each group can be pre-processed separately.
# select_dtypes keys on the dtype family instead of comparing each dtype against
# `object`, so text columns stored with pandas' dedicated string dtype are still
# treated as categorical rather than ending up in the numeric list (where the
# mean imputer would fail on them). Any other non-numeric column (e.g. bool or
# datetime) is likewise reported as categorical.
Numerical_column = df.select_dtypes(include="number").columns.tolist()
Categorical_column = df.select_dtypes(exclude="number").columns.tolist()

print("Numerical columns are : ",  Numerical_column)
print("Categorical columns are : " , Categorical_column)




Numerical columns are :  ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']
Categorical columns are :  ['ocean_proximity']


# Handling Missing Data Using SimpleImputer

In [None]:
# Fill the gaps in the numeric columns with each column's mean.
# NOTE(review): the imputer is fitted on every row (the train/test split happens
# later) and on all numeric columns, which includes the target
# median_house_value -- confirm this is acceptable for the evaluation.
num_data = SimpleImputer(strategy="mean")
df[Numerical_column] = num_data.fit_transform(df[Numerical_column])

In [None]:
# Re-count missing entries to confirm the imputation left no gaps.
remaining_missing = df.isnull().sum()
remaining_missing

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64

# Handling Categorical Data Using One-Hot Encoding

In [None]:
# One-hot encode ocean_proximity; drop_first=True omits the first category's
# indicator column, leaving it as the implicit baseline.
# This overwrites df, so the cell cannot be re-run without reloading the data
# (the ocean_proximity column no longer exists afterwards).
df = pd.get_dummies(
    data=df,
    columns=["ocean_proximity"],
    drop_first=True,
)

df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,True,False,False,False
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,True,False,False,False
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,True,False,False,False
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,True,False,False,False


# Scaling Using Min-Max Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of df with a min-max scaler; the result is a NumPy array,
# so the column labels are lost at this point.
# NOTE(review): the scaler is fitted on all rows (the train/test split happens
# later) and on every column, including the target median_house_value, so the
# model ends up trained on -- and predicting -- the scaled target.
scaler = MinMaxScaler()
scaler.fit(df)
Scaled_df = scaler.transform(df)
Scaled_df

array([[0.21115538, 0.5674814 , 0.78431373, ..., 0.        , 1.        ,
        0.        ],
       [0.21215139, 0.565356  , 0.39215686, ..., 0.        , 1.        ,
        0.        ],
       [0.21015936, 0.5642933 , 1.        , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.31175299, 0.73219979, 0.31372549, ..., 0.        , 0.        ,
        0.        ],
       [0.30179283, 0.73219979, 0.33333333, ..., 0.        , 0.        ,
        0.        ],
       [0.30976096, 0.72582359, 0.29411765, ..., 0.        , 0.        ,
        0.        ]])

## Converting the array back to a DataFrame

In [None]:
# Wrap the scaled array in a DataFrame again, reusing the pre-scaling column labels.
scaled_column_labels = df.columns
df = pd.DataFrame(data=Scaled_df, columns=scaled_column_labels)
df.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0.211155,0.567481,0.784314,0.022331,0.019863,0.008941,0.020556,0.539668,0.902266,0.0,0.0,1.0,0.0
1,0.212151,0.565356,0.392157,0.180503,0.171477,0.06721,0.186976,0.538027,0.708247,0.0,0.0,1.0,0.0


In [None]:
# Give the two dummy columns whose labels contain a space space-free names.
dummy_column_aliases = {
    "ocean_proximity_NEAR BAY": "ocean_proximity_NEAR_bay",
    "ocean_proximity_NEAR OCEAN": "ocean_proximity_Ocean",
}
df = df.rename(columns=dummy_column_aliases)
df.head(1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR_bay,ocean_proximity_Ocean
0,0.211155,0.567481,0.784314,0.022331,0.019863,0.008941,0.020556,0.539668,0.902266,0.0,0.0,1.0,0.0


# Separating X and Y

In [None]:
# The target is median_house_value; every remaining column is a feature.
Y = df["median_house_value"]
X = df.drop(columns=["median_house_value"])
X


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR_bay,ocean_proximity_Ocean
0,0.211155,0.567481,0.784314,0.022331,0.019863,0.008941,0.020556,0.539668,0.0,0.0,1.0,0.0
1,0.212151,0.565356,0.392157,0.180503,0.171477,0.067210,0.186976,0.538027,0.0,0.0,1.0,0.0
2,0.210159,0.564293,1.000000,0.037260,0.029330,0.013818,0.028943,0.466028,0.0,0.0,1.0,0.0
3,0.209163,0.564293,1.000000,0.032352,0.036313,0.015555,0.035849,0.354699,0.0,0.0,1.0,0.0
4,0.209163,0.564293,1.000000,0.041330,0.043296,0.015752,0.042427,0.230776,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,0.324701,0.737513,0.470588,0.042296,0.057883,0.023599,0.054103,0.073130,1.0,0.0,0.0,0.0
20636,0.312749,0.738576,0.333333,0.017676,0.023122,0.009894,0.018582,0.141853,1.0,0.0,0.0,0.0
20637,0.311753,0.732200,0.313725,0.057277,0.075109,0.028140,0.071041,0.082764,1.0,0.0,0.0,0.0
20638,0.301793,0.732200,0.333333,0.047256,0.063315,0.020684,0.057227,0.094295,1.0,0.0,0.0,0.0


In [None]:
# Disabled experiment: degree-2 polynomial expansion of the feature matrix X.
# from sklearn.preprocessing import PolynomialFeatures
# polynomial_features = PolynomialFeatures(degree=2)
# X_poly = polynomial_features.fit_transform(X)

# Using Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; random_state=0 keeps the split reproducible.
# train_test_split returns (train, test) pairs in that order for each input.
# (Earlier commented-out variants with a 30% hold-out -- one of them on the
# polynomial features X_poly -- were removed.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

# Model Building Using Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training features, passed as a bare NumPy
# array (so the estimator does not record column names).
LR = LinearRegression()
LR.fit(X_train.to_numpy(), y_train)
model = LR  # fit() returns the estimator itself, so both names refer to one object
model




In [None]:
# Sanity-check the model on the first row of X (values already min-max scaled).
# Values are listed in the same order as X's columns. The one-hot flags can only
# be 0.0 or 1.0, so ocean_proximity_NEAR_bay is 1.0 for this row (it had been
# mistyped as 0.1, which is not a valid encoded value).
sample_row = [[
    0.211155,  # longitude
    0.567481,  # latitude
    0.784314,  # housing_median_age
    0.022331,  # total_rooms
    0.019863,  # total_bedrooms
    0.008941,  # population
    0.020556,  # households
    0.539668,  # median_income
    0.0,       # ocean_proximity_INLAND
    0.0,       # ocean_proximity_ISLAND
    1.0,       # ocean_proximity_NEAR_bay
    0.0,       # ocean_proximity_Ocean
]]
model.predict(sample_row)


array([0.82339576])

In [None]:
# Predict on the hold-out features. The model was fitted on a plain NumPy array
# (no column names), so pass an array here as well; handing predict() the
# DataFrame makes scikit-learn warn that X has feature names the estimator was
# fitted without.
Prediction = LR.predict(X_test.values)
Prediction




array([0.41620335, 0.56272205, 0.34053218, ..., 0.43069924, 0.57763036,
       0.56128505])

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error on the hold-out set. Y comes from the min-max-scaled frame,
# so this value is in scaled-target units, not in the original price units.
mse = mean_squared_error(y_true=y_test, y_pred=Prediction)
mse

0.020319543475426156

In [None]:
# Coefficient of determination (R^2) of the predictions on the hold-out set.
r2 = r2_score(y_true=y_test, y_pred=Prediction)
r2

0.6384081651714097

In [None]:
import pickle


In [None]:
# Persist the fitted regressor to disk.
# NOTE(review): only the regressor is saved. It was trained on min-max-scaled
# features and a min-max-scaled target, so anything loading this file needs the
# same scaling to build inputs and to interpret predictions -- consider
# persisting `scaler` as well.
fileName = 'HousePricePrediction.pkl'
# The context manager guarantees the handle is flushed and closed even if
# pickling raises; the previous bare open() call never closed the file.
with open(fileName, "wb") as model_file:
    pickle.dump(LR, model_file)

In [None]:
# List the feature labels in the order the model expects them.
for feature_label in X.columns.tolist():
    print(feature_label)

longitude
latitude
housing_median_age
total_rooms
total_bedrooms
population
households
median_income
ocean_proximity_INLAND
ocean_proximity_ISLAND
ocean_proximity_NEAR_bay
ocean_proximity_Ocean


In [None]:
# Print the dtype and non-null count of every feature column in X.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 20640 non-null  float64
 1   latitude                  20640 non-null  float64
 2   housing_median_age        20640 non-null  float64
 3   total_rooms               20640 non-null  float64
 4   total_bedrooms            20640 non-null  float64
 5   population                20640 non-null  float64
 6   households                20640 non-null  float64
 7   median_income             20640 non-null  float64
 8   ocean_proximity_INLAND    20640 non-null  float64
 9   ocean_proximity_ISLAND    20640 non-null  float64
 10  ocean_proximity_NEAR_bay  20640 non-null  float64
 11  ocean_proximity_Ocean     20640 non-null  float64
dtypes: float64(12)
memory usage: 1.9 MB
