In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import warnings
# Silence every warning for the rest of the session: no category or message
# filter is given, so deprecation and convergence warnings are hidden too.
warnings.filterwarnings("ignore")

# **Loading The Datasets** 

In [4]:
# Read the competition's training set, test set and sample submission, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
        '../input/house-prices-advanced-regression-techniques/sample_submission.csv',
    )
)

In [5]:
# Preview the first rows of the training data.
train.head()

# **Brief Description of the dataset columns**
1. SalePrice (target)- the property's sale price in dollars. This is the target variable that you're trying to predict.
1. MSSubClass: The building class
1. MSZoning: The general zoning classification
1. LotFrontage: Linear feet of street connected to property
1. LotArea: Lot size in square feet
1. Street: Type of road access
1. Alley: Type of alley access
1. LotShape: General shape of property
1. LandContour: Flatness of the property
1. Utilities: Type of utilities available
1. LotConfig: Lot configuration
1. LandSlope: Slope of property
1. Neighborhood: Physical locations within Ames city limits
1. Condition1: Proximity to main road or railroad
1. Condition2: Proximity to main road or railroad (if a second is present)
1. BldgType: Type of dwelling
1. HouseStyle: Style of dwelling
1. OverallQual: Overall material and finish quality
1. OverallCond: Overall condition rating
1. YearBuilt: Original construction date
1. YearRemodAdd: Remodel date
1. RoofStyle: Type of roof
1. RoofMatl: Roof material
1. Exterior1st: Exterior covering on house
1. Exterior2nd: Exterior covering on house (if more than one material)
1. MasVnrType: Masonry veneer type
1. MasVnrArea: Masonry veneer area in square feet
1. ExterQual: Exterior material quality
1. ExterCond: Present condition of the material on the exterior
1. Foundation: Type of foundation
1. BsmtQual: Height of the basement
1. BsmtCond: General condition of the basement
1. BsmtExposure: Walkout or garden level basement walls
1. BsmtFinType1: Quality of basement finished area
1. BsmtFinSF1: Type 1 finished square feet
1. BsmtFinType2: Quality of second finished area (if present)
1. BsmtFinSF2: Type 2 finished square feet
1. BsmtUnfSF: Unfinished square feet of basement area
1. TotalBsmtSF: Total square feet of basement area
1. Heating: Type of heating
1. HeatingQC: Heating quality and condition
1. CentralAir: Central air conditioning
1. Electrical: Electrical system
1. 1stFlrSF: First Floor square feet
1. 2ndFlrSF: Second floor square feet
1. LowQualFinSF: Low quality finished square feet (all floors)
1. GrLivArea: Above grade (ground) living area square feet
1. BsmtFullBath: Basement full bathrooms
1. BsmtHalfBath: Basement half bathrooms
1. FullBath: Full bathrooms above grade
1. HalfBath: Half baths above grade
1. BedroomAbvGr: Number of bedrooms above basement level
1. KitchenAbvGr: Number of kitchens
1. KitchenQual: Kitchen quality
1. TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
1. Functional: Home functionality rating
1. Fireplaces: Number of fireplaces
1. FireplaceQu: Fireplace quality
1. GarageType: Garage location
1. GarageYrBlt: Year garage was built
1. GarageFinish: Interior finish of the garage
1. GarageCars: Size of garage in car capacity
1. GarageArea: Size of garage in square feet
1. GarageQual: Garage quality
1. GarageCond: Garage condition
1. PavedDrive: Paved driveway
1. WoodDeckSF: Wood deck area in square feet
1. OpenPorchSF: Open porch area in square feet
1. EnclosedPorch: Enclosed porch area in square feet
1. 3SsnPorch: Three season porch area in square feet
1. ScreenPorch: Screen porch area in square feet
1. PoolArea: Pool area in square feet
1. PoolQC: Pool quality
1. Fence: Fence quality
1. MiscFeature: Miscellaneous feature not covered in other categories
1. MiscVal: Dollar value of miscellaneous feature
1. MoSold: Month Sold
1. YrSold: Year Sold
1. SaleType: Type of sale
1. SaleCondition: Condition of sale

In [6]:
# Report the dimensions of both frames. The training frame's column count
# includes the SalePrice target, hence the "- 1" for its feature count.
n_train_rows, n_train_cols = train.shape
n_test_rows, n_test_cols = test.shape
print('Number of rows in Training dataset: ', n_train_rows)
print('Number of rows in Testing dataset: ', n_test_rows)
print(f'Number of columns({n_train_cols - 1} features + 1 Target) in Train: ', n_train_cols)
print(f'Number of columns({n_test_cols} features) in Test: ', n_test_cols)

# **Exploratory Data Analysis(EDA)** 

In [7]:
# List every column label of the training frame.
print('Columns: ', train.columns)

In [9]:
# Summary statistics, transposed so that each described column becomes a row.
train.describe().T

In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# SalePrice plotted against LotArea for the training rows.
sns.scatterplot(x='LotArea',y='SalePrice',data=train)

In [16]:
# One histogram per non-object column, each drawn on its own 12x6 figure.
non_object_cols = [col for col in train.columns if train[col].dtype != 'object']
for col in non_object_cols:
    plt.figure(figsize=(12, 6))
    sns.histplot(train[col])
    

In [15]:
# Scatter every non-object column against SalePrice, one 12x6 figure per column.
for col, col_dtype in train.dtypes.items():
    if col_dtype == 'object':
        continue
    plt.figure(figsize=(12, 6))
    sns.scatterplot(x=col, y='SalePrice', data=train)
    

*Correlation between SalePrice and the other features*

In [18]:
# Pairwise correlations between the numeric columns only. pandas >= 2.0 no
# longer drops non-numeric columns silently, so calling corr() on the whole
# frame raises on the object (string) columns; select numeric columns first.
corr_matrix = train.select_dtypes(include='number').corr()

In [19]:
# SalePrice's correlation with each column, largest value first.
corr_matrix['SalePrice'].sort_values(ascending=False)

In [20]:
from pandas.plotting import scatter_matrix

# SalePrice plus five features, drawn as a grid of pairwise scatter plots.
features_matrix = [
    'SalePrice',
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'GarageArea',
    'TotalBsmtSF',
]

scatter_matrix(frame=train[features_matrix], figsize=(24, 12))

# **Data Cleaning**

In [21]:
# Separate the target from the predictors; `train` itself is left unmodified
# because drop() returns a new frame.
train_labels = train['SalePrice']
train_features = train.drop(columns='SalePrice')

In [22]:
# Shapes of the feature frame and of the label series.
train_features.shape,train_labels.shape

In [25]:
# Split the feature names by dtype through the public select_dtypes API rather
# than the private DataFrame._get_numeric_data helper, which may change
# without notice between pandas releases.
numeric = train_features.select_dtypes(include=['number', 'bool']).columns
# Everything that is not numeric/boolean is treated as categorical.
categoric = [name for name in train_features.columns if name not in numeric]

In [26]:
# Count missing values per feature once, then report and collect only the
# columns that actually contain missing values. Iterating (label, count) pairs
# avoids both recomputing the counts on every iteration and positional `[i]`
# lookups on a label-indexed Series.
null_counts = train_features.isnull().sum()
null_columns = []
for col_name, n_missing in null_counts.items():
    if n_missing != 0:
        print(col_name, '-', n_missing)
        null_columns.append(col_name)

In [27]:
# Partition the columns that contain missing values into numeric ones and the rest,
# preserving their order in null_columns.
null_num_columns, null_cat_columns = [], []
for null_col in null_columns:
    if null_col in numeric:
        null_num_columns.append(null_col)
    else:
        null_cat_columns.append(null_col)

In [28]:
# Columns with missing values that are not in `numeric`.
null_cat_columns

In [30]:
# Two column subsets of the features: the `numeric` columns and the `categoric` columns.
train_num, train_cat = train_features[numeric], train_features[categoric]

# **Pipeline for Numeric and Categorical Values**

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric preprocessing: fill missing values with each column's median, then
# standardize the result.
median_imputer = SimpleImputer(strategy='median')
standardizer = StandardScaler()
num_pipeline = Pipeline(steps=[
    ('imputer', median_imputer),
    ('std_scalar', standardizer),
])

In [32]:
from sklearn.compose import ColumnTransformer

# Route the numeric columns through num_pipeline and one-hot encode the others.
# handle_unknown='ignore' means categories not seen during fit do not raise at
# transform time.
numeric_cols = train_num.columns.tolist()
categorical_cols = train_cat.columns.tolist()
one_hot = OneHotEncoder(handle_unknown='ignore')

final_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_cols),
    ('cat', one_hot, categorical_cols),
])

# Fit the preprocessing on the training features and transform them in one step.
train_final = final_pipeline.fit_transform(train_features)

# **Train Models**

# Linear regression

In [33]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the preprocessed training features.
lr_model = LinearRegression()
lr_model.fit(train_final, train_labels)

In [34]:
def pred_rand(model, n_rows=6):
    """Print a model's predictions next to the true prices for the first training rows.

    Despite the name, the sample is not random: it is always the leading
    ``n_rows`` rows of ``train_features`` / ``train_labels``.

    Parameters
    ----------
    model : object
        A fitted estimator exposing ``predict``; it receives the rows after
        they have been passed through ``final_pipeline.transform``.
    n_rows : int, default 6
        Number of leading rows to show. The default matches the previous
        ``.loc[:5]`` slice, which is label-inclusive and therefore selects six
        rows on a default 0..n-1 index.

    Returns
    -------
    None
        The predictions and the true labels are printed, not returned.
    """
    # Positional slicing keeps the row count explicit regardless of index labels.
    t_data = train_features.iloc[:n_rows]
    t_labels = train_labels.iloc[:n_rows]

    t_data_tr = final_pipeline.transform(t_data)
    # int() truncates each prediction toward zero for a compact printout.
    pred = [int(value) for value in model.predict(t_data_tr)]
    print('Predicted: ', pred)
    print('Real Labels: ', list(t_labels))
    
# Spot-check the linear regression on the first training rows.
pred_rand(lr_model)

In [35]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear regression, measured on the training matrix itself
# (predictions on train_final compared against train_labels).
train_predictions = lr_model.predict(train_final)
lr_mse = mean_squared_error(y_true=train_labels, y_pred=train_predictions)
lr_rmse = np.sqrt(lr_mse)
print(lr_rmse)

# Decision tree

In [36]:
from sklearn.tree import DecisionTreeRegressor

# Tree construction involves randomness (features are permuted at each split),
# so pin the seed to make the fitted tree and every score derived from it
# reproducible across kernel restarts.
dtree_model = DecisionTreeRegressor(random_state=42)
dtree_model.fit(train_final, train_labels)

In [38]:
# Spot-check the decision tree on the first training rows.
pred_rand(dtree_model)

# RMSE of the decision tree, measured on the training matrix itself.
train_predictions = dtree_model.predict(train_final)
dtree_mse = mean_squared_error(y_true=train_labels, y_pred=train_predictions)
dtree_rmse = np.sqrt(dtree_mse)

In [39]:
# Decision tree RMSE on the training data.
print(dtree_rmse)

# Cross Validation

In [40]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree. The scorer returns negated
# MSE per fold, so flip the sign before taking the square root to get RMSE.
neg_mse_per_fold = cross_val_score(
    estimator=dtree_model,
    X=train_final,
    y=train_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
dtree_scores = np.sqrt(-neg_mse_per_fold)

print('Score: ', dtree_scores)
print('Mean: ', dtree_scores.mean())
print('Standard Deviation: ', dtree_scores.std())

# Random forest

In [41]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic (bootstrap samples and feature sub-sampling),
# so pin the seed to make the fitted model and its scores reproducible across
# kernel restarts.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(train_final, train_labels)

In [42]:
# Spot-check the random forest on the first training rows.
pred_rand(rf_model)

# RMSE of the random forest, measured on the training matrix itself.
train_predictions = rf_model.predict(train_final)
rf_mse = mean_squared_error(y_true=train_labels, y_pred=train_predictions)
rf_rmse = np.sqrt(rf_mse)

In [43]:
# Random forest RMSE on the training data.
print(rf_rmse)

The random forest algorithm works better, so it is used for the final model.

In [51]:
# Final model: 200 trees, each split considering at most 50 features.
# NOTE(review): the cells that chose these hyperparameters are not present in
# this notebook; confirm where the values came from.
# The seed is pinned so the submission can be regenerated exactly.
rf_model_final = RandomForestRegressor(max_features=50, n_estimators=200, random_state=42)
rf_model_final.fit(train_final, train_labels)

In [52]:
# RMSE of the final random forest, measured on the training matrix itself.
# Note that rf_mse and rf_rmse are overwritten here with the final model's values.
train_predictions = rf_model_final.predict(train_final)
rf_mse = mean_squared_error(y_true=train_labels, y_pred=train_predictions)
rf_rmse = np.sqrt(rf_mse)
print(rf_rmse)

In [53]:
# Preview the first rows of the test data.
test.head()

In [55]:
# Apply the preprocessing to the test set with transform() only (no refit),
# then predict sale prices with the final random forest.
test_final = final_pipeline.transform(test)
pred_prices = rf_model_final.predict(test_final)

In [56]:
# Two-column submission: each test Id alongside its predicted sale price,
# written to the working directory without the DataFrame index.
my_submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': pred_prices})
my_submission.to_csv(path_or_buf='submission.csv', index=False)