# House Prices - Advanced Regression Techniques ❤️‍🔥
Dataset Source: https://www.kaggle.com/c/house-prices-advanced-regression-techniques
<br><br>
Stanley Nathanael Wijaya - 2702217125

## Task Description

You are a data scientist tasked with developing a predictive model for house prices, using the Kaggle dataset linked below:
<br><br>
(https://www.kaggle.com/c/house-prices-advanced-regression-techniques)
<br><br>
Your objective is to build a linear regression model that accurately predicts the sale price of houses based on various features.
Split the dataset into a training set and a testing set (70:30) to evaluate your model's performance.
Develop a linear regression model using your selected features.
Evaluate your model using appropriate metrics such as Mean Squared Error (MSE), Root Mean Squared Error (RMSE), or R-squared.
Interpret the results and provide insights into the model's performance.

## Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Read Data

In [None]:
# Load the three competition files from the local Dataset/ directory:
# the training data, the test data and the sample submission.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Dataset/train.csv',
        'Dataset/test.csv',
        'Dataset/sample_submission.csv',
    )
)


In [None]:
# Notebook display: show the training DataFrame as this cell's output.
train

In [None]:
# Print the per-column summary (non-null counts and dtypes) of the training frame.
train.info()

In [None]:
# Remove the PoolQC, Fence and Alley columns from both frames, in place.
# NOTE(review): presumably dropped because they are mostly missing - confirm
# against the train.info() output.
train.drop(columns=['PoolQC', 'Fence', 'Alley'], inplace=True)
test.drop(columns=['PoolQC', 'Fence', 'Alley'], inplace=True)

## Data Preprocessing

In [None]:
# Impute missing values using statistics computed on the training set only,
# so the test set is filled with the same values the model is trained on.

# Numeric columns: fill with the training median. numeric_only=True is needed
# because the frame still holds string (object) columns, for which a median is
# undefined and which make DataFrame.median() raise on pandas >= 2.0.
numeric_medians = train.median(numeric_only=True)
train.fillna(numeric_medians, inplace=True)
# Keys of numeric_medians that are not columns of `test` are simply ignored.
test.fillna(numeric_medians, inplace=True)

# Categorical columns: fill with the most frequent training value. The result
# is assigned back rather than calling fillna(inplace=True) on train[column],
# because that acts on an intermediate object and is not guaranteed to update
# the parent frame (deprecated chained assignment; a no-op under Copy-on-Write).
for column in train.select_dtypes(include=['object']).columns:
    mode_val = train[column].mode()[0]
    train[column] = train[column].fillna(mode_val)
    test[column] = test[column].fillna(mode_val)

In [None]:
# Number of training columns that still contain at least one missing value
# (should be 0 if the imputation above covered every column).
train.isna().any().sum()

In [None]:
# Drop the Id column from both frames so it is not used as a model feature,
# then display the resulting training frame.
train.drop(columns='Id', inplace=True)
test.drop(columns='Id', inplace=True)
train

In [None]:
# Names of the object-dtype (string) columns, i.e. the ones to be encoded.
# NOTE: `x` is rebound to the feature matrix in a later cell.
x = train.select_dtypes(include='object').columns
x

In [None]:
# Encode every categorical column as integer codes, keeping the fitted encoder
# of each column in `label_encoders`.
label_encoders = {}
for column in x:
    le = LabelEncoder()
    # Fit on the categories of both frames: fitting on train alone makes
    # le.transform(test[column]) raise ValueError for any category that occurs
    # only in the test set. When test holds no such category, the learned
    # classes (and so the codes) are identical to fitting on train alone.
    le.fit(pd.concat([train[column], test[column]], ignore_index=True))
    train[column] = le.transform(train[column])
    test[column] = le.transform(test[column])
    label_encoders[column] = le

In [None]:
# Display the (rows, columns) shape of the training and test frames.
train.shape,test.shape

In [None]:
# Separate the predictors from the SalePrice target and convert both to
# independent NumPy arrays, then display their shapes.
x = train.drop(columns='SalePrice').to_numpy(copy=True)
y = train['SalePrice'].to_numpy(copy=True)
(x.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation, matching the 70:30 split the task
# description asks for; the default would hold out only 25%.
# random_state fixes the shuffle so the split is reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.3, random_state=1
)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the features into polynomial terms up to degree 2. The expansion is
# fitted on the training split and then applied to both splits.
poly_reg = PolynomialFeatures(degree=2)
poly_reg.fit(x_train)
x_poly_train = poly_reg.transform(x_train)
x_poly_val = poly_reg.transform(x_val)

## Creating Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression on the polynomial training features and predict the
# held-out validation split.
model = LinearRegression()
model.fit(x_poly_train, y_train)
y_pred = model.predict(x_poly_val)

# Score the validation predictions. Without this step the model is fitted but
# never evaluated with the metrics the task asks for (MSE, RMSE, R-squared).
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)
print(f'Validation MSE : {mse:,.2f}')
print(f'Validation RMSE: {rmse:,.2f}')
print(f'Validation R^2 : {r2:.4f}')

In [None]:
from mlxtend.evaluate import bias_variance_decomp

# Run mlxtend's bias-variance decomposition of the MSE loss for `model`,
# training on the polynomial training split and scoring on the validation
# split, then display the three resulting averages.
avg_error, avg_bias, avg_variance = bias_variance_decomp(
    model,
    x_poly_train,
    y_train,
    x_poly_val,
    y_val,
    loss='mse',
    random_seed=1,
)
(avg_error, avg_bias, avg_variance)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Refit the degree-2 expansion on the full training data for the final model.
poly_reg = PolynomialFeatures(degree=2)
x_poly_train = poly_reg.fit_transform(x)
# Pass the test set as a plain array, like `x`: the transformer was fitted on
# an ndarray, so handing it a DataFrame triggers scikit-learn's "X has feature
# names, but PolynomialFeatures was fitted without feature names" warning.
# This relies on `test` having its columns in the same order as `x`.
x_test = poly_reg.transform(test.to_numpy())

In [None]:
from sklearn.linear_model import LinearRegression

# Train the final model on the full training data and predict the test set.
model = LinearRegression().fit(x_poly_train, y)
y_pred = model.predict(x_test)

In [None]:
# Build the submission table: Ids taken from the sample submission file,
# paired with the flattened test-set predictions.
sub = pd.DataFrame({
    'Id': sample_submission['Id'],
    'SalePrice': y_pred.ravel(),
})

In [None]:
# Write the submission table to submission.csv, omitting the DataFrame index.
sub.to_csv('submission.csv', index=False)