![Screenshot-63.png](https://i.postimg.cc/rm3hj7jZ/Screenshot-63.png)

# 1. Importing Libraries

In [None]:
"""
Python 3.10 EDA_Housing_Prices_Competition
File name EDA.py

Version: 0.1
Author: MLCV
Date: 2025-06-08
"""

import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('darkgrid')
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# 2. Loading the data

In [None]:
# Read the training and test CSVs, in that order, from the Kaggle input folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/home-data-for-ml-course/train.csv',
        '../input/home-data-for-ml-course/test.csv',
    )
)

In [None]:
# Preview the first rows of the freshly loaded training data.
df_train.head()

# 3. Let's explore

In [None]:
# Report the dimensions of both datasets. df_test is loaded from test.csv and
# is only used for the final predictions, so it is labelled "test" rather
# than "validation".
r, c = df_train.shape
print(f'The training data has {r} rows and {c} columns')
r, c = df_test.shape
print(f'The test data has {r} rows and {c} columns')
#df_train.info()

In [None]:
plt.figure(figsize=(24, 8))
# Count the missing values in every column, most incomplete columns first.
cols_with_null = df_train.isnull().sum().sort_values(ascending=False)
# One bar per column; the bar height is that column's number of missing values.
sns.barplot(x=cols_with_null.index, y=cols_with_null)
plt.xticks(rotation=90)  # rotate the column names so they do not overlap
# Actually call show(): the bare attribute reference `plt.show` is a no-op.
plt.show()

In [None]:
# cols_with_null is sorted in descending order, so this shows the ten columns
# with the most missing values.
cols_with_null.head(10)

In [None]:
#cols_to_drop=(cols_with_null.head(6).index).tolist()
#df_train.drop(cols_to_drop,axis=1,inplace=True)
#df_test.drop(cols_to_drop,axis=1,inplace=True)
#df_train.shape

In [None]:
# Number of missing values in the SalePrice column.
df_train['SalePrice'].isnull().sum()

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Summary statistics of the training data.
df_train.describe()

## A. Shortlisting some features

In [None]:
# Hand-picked columns (including SalePrice) to summarise on their own.
important_features = [
    'YearBuilt',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'GrLivArea',
    '1stFlrSF',
    '2ndFlrSF',
    'BedroomAbvGr',
    'OpenPorchSF',
    'PoolArea',
    'SalePrice',
]
df_train[important_features].describe()

In [None]:
# plt.figure(figsize=(15,12))
# sns.heatmap(df_train.corr())
# plt.show()

**Columns considered unimportant (listed below; not yet removed from the data)**

In [None]:
# Columns judged unimportant.
# NOTE(review): this list is only defined here; nothing in the visible code
# drops these columns.
un_imp = [
    'MSSubClass',
    'OverallCond',
    'BsmtFinSF2',
    'LowQualFinSF',
    'BsmtHalfBath',
    '3SsnPorch',
    'YrSold',
    'MoSold',
    'MiscVal',
    'PoolArea',
]

# 4. Outlier Detection

In [None]:
# Three side-by-side panels for spotting outliers:
#   [0] SalePrice against GrLivArea, [1] SalePrice against LotArea
#       (marker size = BedroomAbvGr, colour = OverallQual),
#   [2] a box plot of SalePrice.
fig, ax = plt.subplots(1, 3, figsize=(28, 7))
sns.scatterplot(x=df_train.GrLivArea, y=df_train.SalePrice, size=df_train.BedroomAbvGr, hue=df_train.OverallQual, ax=ax[0])
ax[0].set_title("Ground Living Area")
sns.scatterplot(x=df_train.LotArea, y=df_train.SalePrice, size=df_train.BedroomAbvGr, hue=df_train.OverallQual, ax=ax[1])
ax[1].set_title("LOT AREA")
# Target the third panel explicitly instead of relying on pyplot's notion of
# the "current" axes, and title it like the other two panels.
sns.boxplot(x=df_train.SalePrice, ax=ax[2])
ax[2].set_title("Sale Price");


 * Houses with a ground living area above 4000 are outliers.
 * Houses with a lot area above 100000 are outliers.
 * Sale prices above the 99.5th percentile are mostly outliers and would distort the model, so those rows are dropped below.

In [None]:
sns.catplot(data=df_train, y='SalePrice', x='OverallQual', kind="boxen"); # boxen plot of SalePrice for each OverallQual level

In [None]:
# The 99.5th percentile of SalePrice.
df_train['SalePrice'].quantile(0.995)

In [None]:
# Remove, in place, every sale priced above the 99.5th percentile.
price_cutoff = df_train['SalePrice'].quantile(0.995)
too_expensive = df_train['SalePrice'] > price_cutoff
rows_2_drop = df_train.loc[too_expensive].index
df_train.drop(index=rows_2_drop, inplace=True)

In [None]:
# Current (rows, columns) of the training data.
df_train.shape

In [None]:
# Remove, in place, houses whose GrLivArea exceeds 4000, then show the
# resulting shape.
oversized_living = df_train['GrLivArea'] > 4000
rows_2_drop = df_train.loc[oversized_living].index
df_train.drop(index=rows_2_drop, inplace=True)
df_train.shape

In [None]:
# Inspect the rows whose LotArea exceeds 100000.
df_train[df_train['LotArea']>100000]

In [None]:
# Remove, in place, houses whose LotArea exceeds 100000, then show the
# resulting shape.
oversized_lot = df_train['LotArea'] > 100000
rows_2_drop = df_train.loc[oversized_lot].index
df_train.drop(index=rows_2_drop, inplace=True)
df_train.shape

# 5. Making data ready for modeling

In [None]:
# Features: every training column except the identifier and the target.
X_train = df_train.drop(columns=['Id', 'SalePrice'])
# Target: the sale price.
y_train = df_train['SalePrice']
# The test frame only has its identifier column removed.
X_test = df_test.drop(columns=['Id'])

In [None]:
# Categorical columns: every column stored with the "object" dtype
# (no cardinality filter is applied).
categorical_cols = [
    col_name
    for col_name, col_dtype in X_train.dtypes.items()
    if col_dtype == "object"
]

# Numerical columns: every column stored as int64 or float64.
numerical_cols = [
    col_name
    for col_name, col_dtype in X_train.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

In [None]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols), ('cat', categorical_transformer, categorical_cols)])

# 6. Modeling

In [None]:
# NOTE(review): RandomForestClassifier is not used anywhere in the visible code.
from sklearn.ensemble import RandomForestClassifier

# Gradient-boosting settings; random_state is fixed so runs are repeatable.
gbr_settings = {
    'n_estimators': 1100,
    'loss': 'squared_error',
    'subsample': 0.35,
    'learning_rate': 0.05,
    'random_state': 1,
}
model_GBR = GradientBoostingRegressor(**gbr_settings)

# Chain preprocessing and the regressor so both are fitted on the training data.
pipeline_steps = [('preprocessor', preprocessor), ('model', model_GBR)]
GBR_Pipeline = Pipeline(steps=pipeline_steps)
GBR_Pipeline.fit(X_train, y_train)

# Predict sale prices for the test features.
preds_GBR = GBR_Pipeline.predict(X_test)

In [None]:
# Pair every test-set Id with its predicted sale price.
submission_columns = {'Id': df_test['Id'], 'SalePrice': preds_GBR}
submission = pd.DataFrame(submission_columns)



In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the predictions to submission.csv without the DataFrame index.
submission.to_csv('submission.csv',index=False)

I would be glad to receive feedback; please leave your opinion in the comments. Thank you and have a nice day.