House Price Predictions

In [1]:
# General Libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Data set(as we have used data set from sklearn)

from sklearn.datasets import fetch_california_housing


In [2]:
# Load the California housing dataset that ships with scikit-learn

data = fetch_california_housing()

In [3]:
# Show the dataset's own description (attributes, target, provenance)
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [7]:
# Independent variables (features): one DataFrame column per dataset attribute
df = pd.DataFrame(data.data, columns = data.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [10]:
# Dependent variable: append the regression target as the 'Target' column

df['Target'] = data.target
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


1. Imported our data.
2. Then loaded it into a DataFrame.
3. Extracted our independent data (the features).
4. Then added the dependent data (the target).

EDA

In [None]:
# Exploratory data analysis

!pip install sweetviz

In [13]:
import sweetviz as sv

# Build the exploratory report object from the full data frame
report = sv.analyze(df)

# Write the report as an HTML file in the current working directory
report.show_html("./report.html")

                                             |          | [  0%]   00:00 -> (? left)

Report ./report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


Data Preprocessing

In [14]:
# Feature engineering: set up a geocoder used below to turn coordinates into address fields

from geopy.geocoders import Nominatim

# Shared Nominatim client; user_agent is the identifier passed with its requests
geolocator = Nominatim(user_agent='geopiExercises')

In [16]:
# Sanity check: reverse-geocode one "lat,lon" pair and inspect the returned address dict
geolocator.reverse("37.88,-122.23").raw['address']

{'leisure': 'Ecological Study Area',
 'road': 'Summit House Trail',
 'city': 'Oakland',
 'county': 'Alameda County',
 'state': 'California',
 'ISO3166-2-lvl4': 'US-CA',
 'postcode': '94563',
 'country': 'United States',
 'country_code': 'us'}

In [None]:
def location(cord):
  """Reverse-geocode one coordinate pair and record its county and road.

  Args:
    cord: indexable pair whose element 0 is the latitude and element 1 the
      longitude.

  Side effects:
    Appends exactly one entry to both ``loc_update['county']`` and
    ``loc_update['road']`` (module-level dict), using ``None`` when the
    geocoder has no value for that field, so both lists stay aligned with
    the order in which coordinates are passed in.
  """
  latitude = str(cord[0])
  longitude = str(cord[1])

  # Query with the actual coordinate values, formatted as "lat,lon"
  result = geolocator.reverse(latitude + "," + longitude)

  # reverse() may yield no result at all; treat that like an empty address.
  # raw is the dictionary returned by the geocoder.
  address = result.raw.get('address', {}) if result is not None else {}

  # Missing fields are stored as None so they can be detected and filled in later
  loc_update['county'].append(address.get('county'))
  loc_update['road'].append(address.get('road'))


In [None]:
import pickle

# Collected reverse-geocoding results: one entry per processed row, in row order
loc_update = {"county":[],
              "road":[]}

# Select the coordinate columns by name so the loop does not depend on column positions
coordinates = df[["Latitude", "Longitude"]].values

for i, cord in enumerate(coordinates):

  location(cord)
  # Checkpoint after every lookup so an interrupted run keeps what was fetched so far;
  # the with-block closes the file each time instead of leaking a handle per row.
  with open('loc_update.pickle', 'wb') as checkpoint:
    pickle.dump(loc_update, checkpoint)

  # Progress indicator
  if i%100 == 0:
    print(i)

In [None]:
# Reload the geocoding results from the checkpoint file

import pickle

# NOTE: pickle.load can execute arbitrary code; only load a file this notebook wrote itself.
# Read from the same relative path the checkpoint is written to, so the cell
# does not depend on the working directory being /content.
with open("loc_update.pickle", "rb") as checkpoint:
  loc_update = pickle.load(checkpoint)


In [None]:
# Tabular view of the geocoded county/road lists, for inspection
loc = pd.DataFrame(loc_update)

In [None]:
# Summary of the geocoded columns (shows how many values are non-null)
loc.info()

In [None]:
# Add new features to Data Frame

for column, values in loc_update.items():
  df[column] = values

# Shuffle the rows; the fixed seed keeps the row order reproducible across runs
df = df.sample(axis=0, frac=1, random_state=42)

df.head(10)


In [None]:
# Remove the raw coordinate columns from the feature table
df = df.drop(columns=["Latitude", "Longitude"])
df.head()

In [None]:
# Summary of the current columns (shows how many road/county values are non-null)
df.info()

Using a Classification Algorithm to Fill the Missing Categorical Values

Pred road

In [None]:
# Prepare the data for a classifier that fills in the missing road values

feature_cols = ['MedInc', 'AveRooms', 'AveBedrms']

# Work in ascending index-label order so rows, labels and predictions stay aligned
df_by_label = df.sort_index()
road_is_missing = df_by_label['road'].isna()

# Index labels of the rows that have no road value
missing_idx = df_by_label.index[road_is_missing.to_numpy()].tolist()

# Independent parameters (rows with a known road)

missing_road_X_train = df_by_label.loc[~road_is_missing, feature_cols].to_numpy()

# Dependent parameter (the known road names)

missing_road_Y_train = np.array(df_by_label.loc[~road_is_missing, 'road'].tolist())

# Rows to predict, in the same order as missing_idx
missing_road_X_test = df_by_label.loc[road_is_missing, feature_cols].to_numpy()

In [None]:
from sklearn.linear_model import SGDClassifier

# model initialization
# SGDClassifier is a linear classifier trained with stochastic gradient descent;
# fixing the seed makes the imputed values reproducible.

model_1 = SGDClassifier(random_state=42)

# model training

model_1.fit(missing_road_X_train,missing_road_Y_train)

# predict() rejects an empty input, so skip it when no road value is missing
if len(missing_road_X_test) > 0:
  missing_road_Y_pred = model_1.predict(missing_road_X_test)
else:
  missing_road_Y_pred = np.array([])

In [None]:
# Write the predicted road names back into the data frame.
# One .loc assignment replaces the chained form df['road'][i] = ..., which can
# end up writing to a temporary copy instead of df.

if len(missing_idx) > 0:
  df.loc[missing_idx, 'road'] = missing_road_Y_pred

# for label encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Replace each road name with an integer code
df['road'] = le.fit_transform(df['road'])

Pred county

In [None]:
# Prepare the data for a classifier that fills in the missing county values

feature_cols = ['MedInc', 'AveRooms', 'AveBedrms']

# Work in ascending index-label order so rows, labels and predictions stay aligned
df_by_label = df.sort_index()
county_is_missing = df_by_label['county'].isna()

# Index labels of the rows that have no county value
missing_idx = df_by_label.index[county_is_missing.to_numpy()].tolist()

# Independent parameters (rows with a known county)

missing_county_X_train = df_by_label.loc[~county_is_missing, feature_cols].to_numpy()

# Dependent parameter (the known county names)

missing_county_Y_train = np.array(df_by_label.loc[~county_is_missing, 'county'].tolist())

# Rows to predict, in the same order as missing_idx
missing_county_X_test = df_by_label.loc[county_is_missing, feature_cols].to_numpy()

In [None]:
from sklearn.linear_model import SGDClassifier

# model initialization
# SGDClassifier is a linear classifier trained with stochastic gradient descent;
# fixing the seed makes the imputed values reproducible.

model_1 = SGDClassifier(random_state=42)

# model training

model_1.fit(missing_county_X_train,missing_county_Y_train)

# predict() rejects an empty input, so skip it when no county value is missing
if len(missing_county_X_test) > 0:
  missing_county_Y_pred = model_1.predict(missing_county_X_test)
else:
  missing_county_Y_pred = np.array([])

In [None]:
# Write the predicted county names back into the data frame.
# One .loc assignment replaces the chained form df['county'][i] = ..., which can
# end up writing to a temporary copy instead of df.

if len(missing_idx) > 0:
  df.loc[missing_idx, 'county'] = missing_county_Y_pred

# for label encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Replace each county name with an integer code
df['county'] = le.fit_transform(df['county'])

Understanding which model to use

In [None]:
# Dependent Values
# Select the target by name: a positional index silently picks a different
# column as soon as the column layout changes.

Y = df['Target'].values

# Independent values: every remaining column

df = df.drop(columns=['Target'])

X = df.values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# The target array is named Y (upper case).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so repeated runs train the same forest and report the same score
model = RandomForestRegressor(random_state=42)

model.fit(X_train, Y_train)

In [None]:
# model prediction on the held-out test rows

Y_pred = model.predict(X_test)

In [None]:
# model evaluation: R^2 (coefficient of determination) on the test set, scaled by 100.
# This is a regression score, not a classification accuracy.

from sklearn.metrics import r2_score

r2_score(Y_test, Y_pred)*100

Predict on our own data

In [None]:
# One hand-written sample with 8 values.
# NOTE(review): presumably ordered like the training columns, with the last two
# being the encoded county and road codes — confirm.
inp = np.array([3.3269 ,15.0 ,5.265107 ,1.122807 ,1245.0 ,2.426901 ,15 ,120])

In [None]:
# Reshape the flat sample into a single row: shape (1, number_of_values)
ip = inp.reshape((1,-1))

In [None]:
# Predicted target for the hand-written sample (the dataset description gives the target in units of $100,000)
model.predict(ip)