In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and echo the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Note: This is my very first data analytics and data science project, so I know I am still rusty.

# Predicting House Prices
This is a simple analysis and prediction of house prices. We will find out which factors most affect the price of a house.
We will check by:
- Location
- Size (area of space occupied)
- Year built
- All the parameters

# **Imports**

In [None]:
# getting the necessary imports
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# transformers and predictor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder
# performance metrics
from sklearn.metrics import mean_absolute_error
# model selection for train and test data
from sklearn.model_selection import train_test_split


# Reading and Exploring the Data
I am still working on my storytelling.

In [None]:
# Load the housing CSV into a DataFrame, then summarise every column
# (dtype and non-null count) to get a first feel for the data.
csv_path = '/kaggle/input/housing-price-dataset/Housing.csv'
data = pd.read_csv(csv_path)
data.info()

In [None]:
# Preview the first five rows to see what each column looks like.
data.head()

In [None]:
# Count the missing (NaN) entries in every column.
data.isnull().sum()

So we have a clean dataset with no NaN values.
Let's check the shape, i.e. the number of rows and columns.

In [None]:
# (rows, columns) of the freshly loaded frame
data.shape

We have 21,613 rows and 21 columns

**Let's view:**
- Price relative to the number of rooms
- Price relative to size
- Prices of houses based on year
- cluster of the houses and their price

In [None]:
# Distinct bedroom counts that occur in the dataset.
data.bedrooms.unique()

In [None]:
# Bar chart of price against the number of bedrooms.
# Holding on to the returned Axes lets the label be set on it directly
# rather than through pyplot's implicit "current axes".
ax = sns.barplot(x=data['bedrooms'], y=data['price'])
ax.set_ylabel('Price in Millions')

In [None]:
# Living area vs. price, with each point coloured by the year the house was built.
plt.figure(figsize=[15,5])
# Pass the frame by keyword: older seaborn releases do not accept `data`
# positionally, while `data=` is accepted by every version.
sns.scatterplot(data=data, x='sqft_living', y='price', hue='yr_built')


In [None]:
# Average sale price per calendar month.

# Parse the raw date strings so the column can drive a time-based resample.
data['date'] = pd.to_datetime(data['date'])

# Month-end bins. Only `price` is averaged because it is the only column plotted;
# this also keeps the aggregation independent of the dtypes of the other columns.
# NOTE(review): recent pandas releases prefer the "ME" alias over "M" - confirm
# against the pandas version in the target environment before switching.
resampled_df = data.resample("M", on="date")["price"].mean().reset_index()

# One marker per month, joined by a red line.
plt.figure(figsize=[12,5])
sns.lineplot(data = resampled_df, x = 'date', y = 'price', color = 'red', marker = 'o')
plt.xlabel("Date")
plt.ylabel("Price")
plt.title("Average Price Per Month")

**Analysis and question answering**
- 

In [None]:
# Mean sale price for each bedroom count.
data.groupby('bedrooms').price.mean()

# Data Processing
This will involve
- data cleaning/wrangling
- Checking and removing
    - Outliers
    - Correlations / removing unnecessary features
    - columns with low and high cardinality
- Separate the train from test data, and the feature and target data

In [None]:
# Keep only houses priced below 1.5 million ...
mask_price = data['price'] < 1500000
# ... and trim outliers by keeping prices between the 0.1 and 0.9 quantiles
# (i.e. the 10th and 90th percentiles), bounds included.
low, high = data['price'].quantile([0.1, 0.9])
mask_card = data['price'].between(low, high)
# Take an explicit copy: a boolean-filtered frame is still tied to the frame it
# was sliced from, so later in-place edits could trigger SettingWithCopyWarning.
# NOTE: this cell overwrites `data`; re-running it without reloading the CSV
# trims the already-trimmed frame again.
data = data[mask_price & mask_card].copy()
data.shape

In [None]:
# Visualise pairwise correlations to spot redundant (strongly correlated) features.
# Nothing is dropped here; the plot only informs which columns to drop afterwards.
plt.figure(figsize = (15,8))
# Restrict the matrix to numeric columns: `date` has been converted to datetime
# by this point, and how DataFrame.corr() treats non-numeric columns depends on
# the pandas version. Selecting numeric dtypes makes the result version-independent.
sns.heatmap(data.select_dtypes(include='number').corr())

It can be seen that sqft_lot, sqft_lot15, sqft_living15, sqft_living and sqft_above are all correlated with one another. For this analysis, I am only going to keep sqft_living.

In [None]:
# Before dropping unnecessary columns, check for columns with low and high
# cardinality by counting the distinct values in each one.
data.nunique()

It can be seen that `id` has very high cardinality (every value is unique), as do `sqft_lot` and `sqft_lot15`,
while `floors`, `waterfront`, `view`, `condition` and `grade` have low cardinality.
We will drop those columns.

In [None]:
# Remove the identifier and the columns flagged in the cardinality check.
cardinality_cols = ['id', 'waterfront', 'view', 'condition', 'grade', 'sqft_lot', 'sqft_lot15', 'floors']
# Reassign instead of using inplace=True, and ignore columns that are already
# gone, so the cell can be re-run without raising KeyError.
data = data.drop(columns=cardinality_cols, errors='ignore')
# A bare `data.shape` in the middle of a cell is never displayed, so print it.
print(data.shape)
data.info()

In [None]:
# Drop the correlated size columns and the other columns not used for modelling.
unused_cols = ['sqft_above', 'sqft_basement', 'sqft_living15', 'zipcode', 'yr_renovated', 'bathrooms', 'date']
# Reassign instead of using inplace=True, and ignore columns that are already
# gone, so the cell can be re-run without raising KeyError.
data = data.drop(columns=unused_cols, errors='ignore')

In [None]:
# Confirm which columns remain after the two drop steps above.
data.info()

**Train and Test, Feature and Target Data**
-

In [None]:
# Pull out the target and one feature frame per scenario.
# The same target is predicted in every scenario; only the feature columns differ.

target = 'price'
# NOTE: despite its name, y_train holds the target for every row; the actual
# train/test split happens in a later cell.
y_train = data[target]

year = ['yr_built']
yearFeature = data[year]

sqft = ['sqft_living']
sqftFeature = data[sqft]

bedroom = ['bedrooms']
bedroomFeature = data[bedroom]

location = ['lat', 'long']
locationFeature = data[location]

# Named `allColumns` rather than `all` so the builtin all() is not shadowed
# for the rest of the notebook.
allColumns = ['bedrooms', 'sqft_living', 'yr_built', 'lat', 'long']
allFeature = data[allColumns]
sqftFeature.shape

In [None]:
# Shuffle and split every feature frame together with the target in ONE call.
# train_test_split applies the same row partition to all arrays it receives, so
# the five scenarios are guaranteed to share identical train/test rows instead
# of relying on five copy-pasted calls keeping their arguments in sync.
# The output order is (train, test) for each input array, in input order.
(
    allFeatTrain, allFeatTest,
    yearFeatTrain, yearFeatTest,
    sqftFeatTrain, sqftFeatTest,
    bedroomFeatTrain, bedroomFeatTest,
    locationFeatTrain, locationFeatTest,
    allTargetTrain, allTargetTest,
) = train_test_split(
    allFeature, yearFeature, sqftFeature, bedroomFeature, locationFeature, y_train,
    test_size = 0.3, random_state = 42, shuffle = True,
)

# The target split is the same for every scenario; keep the per-scenario names
# that the modelling cells refer to.
yearTargetTrain = sqftTargetTrain = bedroomTargetTrain = locationTargetTrain = allTargetTrain
yearTargetTest = sqftTargetTest = bedroomTargetTest = locationTargetTest = allTargetTest

yearTargetTest.shape

# Build Model

**The Baseline model**
-

In [None]:
# Naive baseline: always predict the mean price observed in the TRAINING split.
# (Every scenario shares the same target split, so allTargetTrain/allTargetTest
# are used as the reference.)
y_mean = allTargetTrain.mean()
# Repeat that constant once per test row ...
y_base = [y_mean] * len(allTargetTest)
# ... and score it on the test split with MAE, the same data and metric used for
# the fitted models, so the numbers are directly comparable.
# Built-in round() works whether the metric returns a NumPy scalar or a plain float.
y_baseline_metric = round(mean_absolute_error(allTargetTest, y_base), 2)
print(f'The mean absolute error is {y_baseline_metric}')

**Define Model, Fit and Predict**
-

In [None]:
# One shared pipeline that is refit for each feature subset further down.
# The imputer is only a safety net in case a missing value slipped through;
# a OneHotEncoder(use_cat_names=True) step was considered but is left disabled.
imputer = SimpleImputer()
# predictor
regressor = Ridge()
modelPipeline = make_pipeline(imputer, regressor)

In [None]:
# Sanity check: training feature matrix and training target must have the same row count.
print(allFeatTrain.shape, yearTargetTrain.shape, sep=', ')

In [None]:
# Fit and score the same pipeline on every feature set.
# The lists below are index-aligned: position i in each list belongs to the
# feature set named featureNames[i]. With many more feature sets it would be
# better to build these splits in a loop instead of listing them by hand.
featureNames = ['year', 'sqft', 'bedroom', 'location', 'all']
features = [yearFeatTrain, sqftFeatTrain, bedroomFeatTrain, locationFeatTrain, allFeatTrain]
theTargets = [yearTargetTrain, sqftTargetTrain, bedroomTargetTrain, locationTargetTrain, allTargetTrain]
testfeatures = [yearFeatTest, sqftFeatTest, bedroomFeatTest, locationFeatTest, allFeatTest]
testTarget = [yearTargetTest, sqftTargetTest, bedroomTargetTest, locationTargetTest, allTargetTest]
# maps feature-set name -> test MAE
result = {}

# loop through the feature sets to fit the model and predict
for name, feature, featureTest, price, targetTest in zip(featureNames, features, testfeatures, theTargets, testTarget):
    # refit the shared pipeline on this feature set's training split
    modelPipeline.fit(feature, price)

    # predict using the test data
    price_pred = modelPipeline.predict(featureTest)

    # get the MAE
    modelMAE = mean_absolute_error(targetTest, price_pred)

    # Store under the feature-set name; assigning a fresh dict here would
    # discard every earlier score and keep only the last one.
    result[name] = modelMAE

# print one line per feature set
for name, score in result.items():
    print(f'{name} MAE: {score}')
    

In [None]:
# Refit on the full feature set and predict the held-out prices in one chain
# (Pipeline.fit returns the pipeline itself, so predict can follow directly).
all_pred = modelPipeline.fit(allFeatTrain, allTargetTrain).predict(allFeatTest)

In [None]:
# Test MAE of the all-features model, rounded to two decimals.
# Built-in round() works whether the metric returns a NumPy scalar or a plain
# float; calling .round() on the result fails for a plain float.
MAE = round(mean_absolute_error(allTargetTest, all_pred), 2)

In [None]:
# Display the rounded test MAE computed for the all-features model.
MAE

In [None]:
# Year-built only: refit the shared pipeline, predict the test split, and report MAE.
modelPipeline.fit(yearFeatTrain, yearTargetTrain)
year_pred = modelPipeline.predict(yearFeatTest)
# Built-in round() works whether the metric returns a NumPy scalar or a plain float.
yearMAE = round(mean_absolute_error(yearTargetTest, year_pred), 2)
yearMAE

In [None]:
# Living area only: refit the shared pipeline, predict the test split, and report MAE.
modelPipeline.fit(sqftFeatTrain, sqftTargetTrain)
sqft_pred = modelPipeline.predict(sqftFeatTest)
# Built-in round() works whether the metric returns a NumPy scalar or a plain float.
sqftMAE = round(mean_absolute_error(sqftTargetTest, sqft_pred), 2)
sqftMAE

In [None]:
# Bedroom count only: refit the shared pipeline, predict the test split, and report MAE.
modelPipeline.fit(bedroomFeatTrain, bedroomTargetTrain)
bed_pred = modelPipeline.predict(bedroomFeatTest)
# Built-in round() works whether the metric returns a NumPy scalar or a plain float.
bedMAE = round(mean_absolute_error(bedroomTargetTest, bed_pred), 2)
bedMAE

In [None]:
# Latitude/longitude only: refit the shared pipeline, predict the test split, and report MAE.
modelPipeline.fit(locationFeatTrain, locationTargetTrain)
location_pred = modelPipeline.predict(locationFeatTest)
# Built-in round() works whether the metric returns a NumPy scalar or a plain float.
locationMAE = round(mean_absolute_error(locationTargetTest, location_pred), 2)
locationMAE