<a href="https://colab.research.google.com/github/Saptak10/House_Prices_Prediction/blob/main/HousePricesPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installing** the packages

In [None]:
pip install pandas numpy matplotlib seaborn scikit-learn jupyter

# Data Exploration and Cleaning

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Display the first few rows of the dataset
train.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Exploratory Data Analysis (EDA)

In [7]:
# Check for missing values
missing_data = train.isnull().sum().sort_values(ascending=False)
missing_data = missing_data[missing_data > 0]
print(missing_data)

# Plot correlations using a heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(train.corr(), cmap='coolwarm', annot=True)


PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64


ValueError: could not convert string to float: 'RL'

<Figure size 1200x800 with 0 Axes>

# Data Cleaning

In [8]:
# Fill missing values for numerical columns
train.fillna(train.mean(), inplace=True)

# Drop columns that have too many missing values or are irrelevant
train = train.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)


TypeError: unsupported operand type(s) for +: 'int' and 'str'

# Feature Engineering

In [9]:
# Convert categorical variables into dummy/indicator variables
train = pd.get_dummies(train)

# Separate features and target variable
X = train.drop('SalePrice', axis=1)
y = train['SalePrice']


# Model Selection and Training

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")


ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

# Model Evaluation

In [11]:
# Plot actual vs predicted values
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual vs Predicted House Prices")
plt.show()


NameError: name 'y_pred' is not defined

# Save the Model

In [12]:
import joblib

# Save the model to a file
joblib.dump(model, 'house_price_model.pkl')


['house_price_model.pkl']