In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Use the reversed "Reds" palette as the default colour cycle for all plots.
sns.set_palette(palette="Reds_r")

# FIRST KAGGLE NOTEBOOK : MULTIPLE LINEAR REGRESSION MODEL

Linear regression studies the relationship between variables and uses that relationship to estimate or predict a dependent variable.

The simple equation of linear regression is:
                                            Y = MX + B


Here Y is the dependent variable, the value we are going to predict, and X is the variable that affects Y, usually called the independent variable. M is the slope: it tells us how much Y changes for each one-unit change in X. B is the intercept, the value of Y when X is 0.

For example, if we want to predict salary from the number of years of work experience a person has, the regression equation could look like this:
                                            SALARY = 3X + 100.
 
Here X is the number of years of experience: each additional year of experience raises the predicted salary by 3 units, starting from a base salary of 100 at zero experience.

Furthermore, in this dataset, we're going to predict the price of houses based on some features related to them. Let's start from understanding the dataset.
# **🟥 DATA PREPARATION 🟥**

In [None]:
# Load the house-sales CSV from the Kaggle input directory and preview it.
csv_path = '/kaggle/input/housesalesprediction/kc_house_data.csv'
data = pd.read_csv(csv_path)
print('Top five of dataset:')
data.head()

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
print('Data information:')
data.info()

From the information above, we now know that the dataset consists of the following features:
1. ID : unique id
2. date : date when the house was sold
3. price : the price of the house
4. bedrooms : number of bedrooms
5. bathrooms : number of bathrooms
6. sqft_living : square footage of living area
7. sqft_lot : square footage of total land space
8. floors : numbers of floors
9. waterfront : explain whether the house faces the waterfront or not
10. view : levels of view the house has in 0 - 4
11. condition : the condition of the house
12. grade 
13. sqft_above : square footage of a house that is above the ground (main floor + upper)
14. sqft_basement : square footage of a house below the ground
15. yr_built : the year when the house was built
16. yr_renovated : the year when the house was renovated
17. zipcode : zipcode area of the house
18. lat : latitude
19. long : longitude
20. sqft_living15
21. sqft_lot15

There are 21613 entries (rows) in the dataset.

Statistics distinguishes several types of values. Knowing which types the dataset contains helps us choose the right analysis and the right kind of visualization.





The first type of value is **Categorical**. Categorical values are measured on either a nominal scale or an ordinal scale.


* **Nominal scale:** distinguishes objects from one another without any notion of higher or lower. Ex: Unique ID, Zipcode, the yr_* columns
* **Ordinal scale:** categorizes objects by values that can be ranked as higher or lower than others. Ex: bedrooms, bathrooms, floors, waterfront, view, condition, grade.

The second type is **Numerical**, measured on either an interval scale or a ratio scale.
* **Interval** : values are ordered, and the distance between one observation and another is meaningful. Ex: sqft_living, sqft_lot, etc
* **Ratio** : almost the same as interval, but with a true zero, so ratio values never fall below zero, while interval values can be negative, such as -10, -11, etc.

Before exploring the data, we want to check whether there are any NaN (missing) values.
# **🟥 DATA CLEANSING 🟥**

In [None]:
# Count the missing values in each column.
data.isnull().sum()

From the output above, we can see that the dataset has no missing values.

* # **New Feature**

The dataset has a column that tells when each house was built. From it we can generate a new feature, "Age", that may be useful in the model.

In [None]:
# Preview the build-year column; .head() avoids displaying the whole Series
# and matches how the derived 'Age' column is previewed.
data['yr_built'].head()

In [None]:
# Derive 'Age' as the distance, in years, from the most recent build year
# present in the dataset, then preview the new column.
newest_build_year = data['yr_built'].max()
data['Age'] = newest_build_year - data['yr_built']
data['Age'].head()

# **🟥 DATA EXPLORATION 🟥**

In data exploration, we characterize the dataset with data visualization and statistical methods in order to understand it better. The most basic exploration is calling .describe() to see the descriptive statistics for each feature.

In [None]:
# Display descriptive statistics for the columns of the frame.
data.describe()

* # **Visualization for categorical data**

In [None]:

def ob_numplot(column):
    """Draw a count plot and a price box plot for each given column.

    Each column gets one row of two axes: on the left, how many houses fall
    in each category; on the right, the distribution of ``price`` per
    category. The columns are read from the notebook-level ``data`` frame.

    Parameters
    ----------
    column : iterable of str
        Names of the categorical columns of ``data`` to plot.
    """
    columns = list(column)
    if not columns:
        # plt.subplots() cannot build a grid with zero rows.
        return

    n_rows = len(columns)
    # Size the grid from the input instead of hard-coding 5 rows; two inches
    # per row reproduces the original 15x10 figure for five columns.
    # squeeze=False keeps `axs` two-dimensional even for a single column.
    fig, axs = plt.subplots(n_rows, 2, figsize=(15, 2 * n_rows), squeeze=False)
    for i, clm in enumerate(columns):
        sns.countplot(x=data[clm], ax=axs[i][0])
        sns.boxplot(x=data[clm], y=data['price'], ax=axs[i][1])
    # The former plt.setp(axs) call is dropped: with no properties given it
    # changes nothing and only prints the list of settable Axes properties.
    plt.tight_layout()
    plt.show()

kol = ['bedrooms', 'floors', 'waterfront', 'view', 'grade']
ob_numplot(kol)

* From the figures, we can see that most houses have 3 to 4 bedrooms. The price rises with the number of bedrooms but starts to decrease after around 8.
* 1 and 2 floors are more common than 1.5 or more than 2. The price mostly rises with the number of floors but decreases after 2 to 2.5.
* Houses with a waterfront are rarely sold, and those that have one are mostly priced higher.
* As the view level and the grade increase, the price increases as well.

In [None]:
# Price distribution for each bathroom count.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=data, x='bathrooms', y='price', ax=ax)
ax.set_xlabel('Bathrooms')
ax.set_ylabel('Price')
fig.tight_layout()
plt.show()

# Number of houses for each bathroom count.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=data, x='bathrooms', ax=ax)
ax.set_xlabel('Bathrooms')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()


* Most houses have around 1 to 2.5 bathrooms, and the price rises as the number of bathrooms increases.

In [None]:
# Price distribution per zipcode; tick labels are rotated so they stay legible.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=data, x='zipcode', y='price', ax=ax)
ax.set_xlabel('Zipcode')
plt.xticks(rotation = 90)
ax.set_ylabel('Price')
fig.tight_layout()
plt.show()

# Price against build year and against the derived age, one figure each.
for x_col, x_label in (('yr_built', 'Year'), ('Age', 'Age')):
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(data=data, x=x_col, y='price', ax=ax)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Price')
    fig.tight_layout()
    plt.show()


* Houses in some areas have relatively higher prices than houses in other areas.
* Houses built around 1940 - 1980 seem to be cheaper.


* # **Visualization for Numerical/Interval Data**

In [None]:
# Scatter each numerical feature against price, one figure per feature.
# 'price' itself is left out of the list: plotting price against price only
# draws the diagonal and carries no information.
column = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'lat', 'long']
# The loop variable has its own name so that `column` stays a list. Reusing
# `column` as the loop variable left it bound to a string, and re-running the
# cell then iterated over that string's characters.
for col in column:
    plt.figure(figsize=(10, 5))
    sns.scatterplot(x=data[col], y=data['price'])
    plt.title(f'{col} vs price')
    plt.tight_layout()
    plt.show()




* sqft_living and sqft_above show a clearer linear relationship with price than the other features.

* # **Correlation Matrix**

Correlation matrix is used to visualize the correlation between numerical variables.

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(15,10))
# Restrict to numeric columns explicitly: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (older versions silently dropped
# them). The frame presumably has at least one such column (`date`, which is
# read from the CSV without date parsing) -- confirm with data.info().
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), annot=True)
plt.title('Plot Correlation')
plt.tight_layout()
plt.show()

#  **🟥 MULTIPLE LINEAR REGRESSION MODEL BUILDING 🟥**

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



In [None]:
# Predictors: every column except the target and the columns excluded from
# this model. Target: the sale price.
excluded_columns = ['price', 'yr_built', 'yr_renovated', 'id', 'zipcode', 'date']
X = data.drop(columns=excluded_columns)
Y = data['price']


In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every metric computed from it, reproducible across
# kernel restarts; without it the scores change on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Fit an ordinary least-squares model on the training rows and predict the
# held-out rows.
reg = LinearRegression()
reg.fit(X_train, Y_train)
y_pred = reg.predict(X_test)



In [None]:
# Report the fitted parameters and the hold-out error metrics of the model.
mse = mean_squared_error(Y_test, y_pred)
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

print('Coefficient:', reg.coef_)
print('Intercept:', reg.intercept_)
print('R^2: %.2f ' % r2)
print('Mean Absolute error:', mae)
print('Mean Squared error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Scatter the held-out actual prices (Y_test) against the predictions (y_pred).
fig, ax = plt.subplots()
ax.scatter(Y_test, y_pred)
ax.set(
    xlabel='Price Actual',
    ylabel='Predicted Value',
    title='True Value vs Predicted Value',
)
plt.show()

Regression Model with features that have correlation 0.5 and higher.

In [None]:
# Predictors for the reduced model.
feature = ['bathrooms', 'sqft_living', 'grade', 'sqft_above', 'sqft_living15']
X1 = data[feature]
# Select the target as a Series, the same way `Y` is selected for the first
# model. With a one-column DataFrame the fitted coefficients, intercept and
# predictions all come out two-dimensional, which makes the two models'
# printed results inconsistent.
Y1 = data['price']

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and the metrics computed from it, reproducible across kernel restarts.
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size=0.2, random_state=42)
# NOTE: this refits the existing `reg` estimator, so its coefficients and
# intercept now describe the reduced model.
reg.fit(X1_train, Y1_train)
y1_pred = reg.predict(X1_test)

In [None]:
# Report the fitted parameters and the hold-out error metrics of the
# reduced-feature model.
mse1 = mean_squared_error(Y1_test, y1_pred)
mae1 = mean_absolute_error(Y1_test, y1_pred)
r2_1 = r2_score(Y1_test, y1_pred)

print('Coefficient:', reg.coef_)
print('Intercept:', reg.intercept_)
print('R^2: %.2f ' % r2_1)
print('Mean Absolute error:', mae1)
print('Mean Squared error:', mse1)
print('Root Mean Squared Error:', np.sqrt(mse1))

In [None]:
# Scatter the held-out actual prices (Y1_test) against the reduced model's
# predictions (y1_pred).
fig, ax = plt.subplots()
ax.scatter(Y1_test, y1_pred)
ax.set(
    xlabel='Price Actual',
    ylabel='Predicted Value',
    title='True Value vs Predicted Value',
)
plt.show()

*Any advice will be very valuable