# Housing Price Prediction

### Objective
As a Data Analyst working at a Real Estate Investment Trust, I was tasked with predicting housing prices using data science techniques. The goal is to determine the market price of a house given a set of features such as square footage, number of bedrooms, bathrooms, floors, and other relevant attributes.

## 1. Import Libraries and Setup

In [None]:
def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Installed below as a replacement for ``warnings.warn`` so that warnings
    raised through that function are discarded instead of being displayed.
    """
    pass
import warnings
# Rebind the module attribute so every later call to ``warnings.warn(...)``
# reaches the no-op above. This is done before the library imports further
# down so that code looking up ``warnings.warn`` afterwards gets the stub.
# NOTE(review): this hides *all* such warnings, including deprecation
# notices from pandas/scikit-learn -- confirm that is intended.
warnings.warn = warn
import piplite
# Top-level ``await``: only valid in a kernel that allows it (this notebook
# appears to target a Pyodide/JupyterLite kernel -- verify). Installs
# seaborn before it is imported below.
await piplite.install('seaborn')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression
%matplotlib inline
from pyodide.http import pyfetch

## 2. Download and Load Data

In [None]:
async def download(url, filename):
    """Fetch *url* with ``pyfetch`` and write the response body to *filename*.

    Args:
        url: Address of the resource to download.
        filename: Local path the response bytes are written to (binary mode,
            overwriting any existing file).

    Raises:
        OSError: If the response status is anything other than 200. Failing
            here keeps a bad download from being mistaken for a missing or
            stale local file when the data is read later.
    """
    response = await pyfetch(url)
    if response.status != 200:
        raise OSError(
            f"Download of {url} failed with HTTP status {response.status}"
        )
    with open(filename, "wb") as f:
        f.write(await response.bytes())

filepath='https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/FinalModule_Coursera/data/kc_house_data_NaN.csv'
await download(filepath, "housing.csv")
file_name="housing.csv"
df = pd.read_csv(file_name)
df.head()

## 3. Data Cleaning and Exploration

In [None]:
# Remove the 'id' and 'Unnamed: 0' columns (presumably a record identifier
# and a leftover CSV index -- neither is used by the analysis below).
df.drop(['id', 'Unnamed: 0'], axis=1, inplace=True)

# Report how many values are missing before imputation.
print("number of NaN values for the column bedrooms:", df['bedrooms'].isnull().sum())
print("number of NaN values for the column bathrooms:", df['bathrooms'].isnull().sum())

# Fill missing values with the column mean. The result is assigned back to
# the column: calling an inplace method on ``df[col]`` operates on a
# temporary selection and does not update ``df`` under pandas Copy-on-Write.
df['bedrooms'] = df['bedrooms'].fillna(df['bedrooms'].mean())
df['bathrooms'] = df['bathrooms'].fillna(df['bathrooms'].mean())

# Report again; both counts should now be zero.
print("number of NaN values for the column bedrooms:", df['bedrooms'].isnull().sum())
print("number of NaN values for the column bathrooms:", df['bathrooms'].isnull().sum())

## 4. Exploratory Data Analysis

In [None]:
# Box plot of price grouped by the waterfront flag; labels are set on the
# Axes object that seaborn drew on.
ax = sns.boxplot(data=df, x='waterfront', y='price')
ax.set_title('Price Distribution for House With/ Without Waterfront View')
ax.set_xlabel('Waterfront View (0 = No, 1 = Yes)')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Scatter plot with a fitted regression line: price against sqft_above.
ax = sns.regplot(data=df, x='sqft_above', y='price')
ax.set(
    title='Regression Plot: sqft_above vs price',
    xlabel='Square Footage above Ground Level',
    ylabel='Price',
)
plt.show()

## 5. Correlation and Simple Linear Regression

In [None]:
# Restrict to numeric columns so the correlation matrix can be computed.
df_numeric = df.select_dtypes(include='number')

# Correlation of every numeric column with price, shown in ascending order
# as the cell output.
price_correlations = df_numeric.corr()['price']
price_correlations.sort_values()

In [None]:
# Single-feature model: predict price from sqft_living alone.
X, Y = df[['sqft_living']], df['price']

# ``fit`` returns the estimator itself, so construction and fitting chain.
ln = LinearRegression().fit(X, Y)

# R^2 on the same data the model was fitted on (cell output).
ln.score(X, Y)

## 6. Multiple Linear Regression

In [None]:
# Predictor columns for the multi-feature model.
features = [
    "floors",
    "waterfront",
    "lat",
    "bedrooms",
    "sqft_basement",
    "view",
    "bathrooms",
    "sqft_living15",
    "sqft_above",
    "grade",
    "sqft_living",
]
X, Y = df[features], df['price']

# Fit on all rows and report R^2 on that same data (cell output).
ln = LinearRegression().fit(X, Y)
ln.score(X, Y)

## 7. Pipeline with Polynomial Features

In [None]:
# Pipeline steps, applied in order: standardise the features, expand them
# into polynomial terms (no bias column), then fit a linear model.
Input = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(include_bias=False)),
    ('model', LinearRegression()),
]

# ``fit`` returns the pipeline, so it can be built and fitted in one line.
pipeline = Pipeline(Input).fit(X, Y)

# R^2 on the data the pipeline was fitted on (cell output).
pipeline.score(X, Y)

## 8. Ridge Regression with Polynomial Features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Hold out 15% of the rows for testing; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.15, random_state=1
)

# Ridge regression on the raw features, scored on the held-out rows.
ridge_model = Ridge(alpha=0.1).fit(x_train, y_train)
linear_r2 = ridge_model.score(x_test, y_test)
print("R^2 on test data (Linear):", linear_r2)

In [None]:
# Learn the degree-2 expansion from the training rows, then apply that same
# fitted transformer to both the training and the test rows.
poly = PolynomialFeatures(degree=2, include_bias=False).fit(x_train)
x_train_poly = poly.transform(x_train)
x_test_poly = poly.transform(x_test)

# Refit the existing ridge estimator on the expanded features and predict
# the held-out rows.
y_pred = ridge_model.fit(x_train_poly, y_train).predict(x_test_poly)

# R^2 of those predictions against the true test prices.
r2 = r2_score(y_test, y_pred)
print('R^2 score on Test Data (Polynomial + Ridge):', r2)