In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#As per the competition description, submissions are judged with RMSE (root mean squared error) computed on the logarithm of the prices, instead of RMSE on the raw prices.
#For example: suppose one home is actually worth 20 crore and another 20 lakh, and the predictions are 19 crore for the first and 21 lakh for the second. Both predictions are off by 5%, but in absolute terms the first error is 100 times larger, so the costlier house would dominate a plain RMSE.
#To avoid this, we take log(predicted price) and log(actual price), which brings both errors to the same relative scale, and use these two values in the RMSE calculation.

In [None]:
# Load the competition training set and preview its first rows.
train_df = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
)
train_df.head()

# EDA

In [None]:
# Summary of every column: name, non-null count and dtype.
train_df.info()

In [None]:
# Number of missing values per feature (pandas truncates the display of long results).
train_df.isnull().sum()

Since there are so many features, by default pd restricts the features to be displayed

In [None]:
# Count the missing values once, then keep only the features that have at least one.
missing_counts = train_df.isnull().sum()
missing_counts[missing_counts > 0]

In [None]:
# (rows, columns) of the training data.
train_df.shape

In [None]:
# Split the columns by dtype: int64 columns count as numerical, everything else as categorical.
# NOTE(review): with this test float64 columns also end up in `categorical` — confirm that is intended.
numerical = [feature for feature in train_df.columns if train_df[feature].dtype == 'int64']
categorical = [feature for feature in train_df.columns if feature not in numerical]
print("Categorical : ",categorical)
print("Numerical : ",numerical)

In [None]:
# How many features fell into each group (numerical first, then categorical).
for feature_group in (numerical, categorical):
    print(len(feature_group))

In [None]:
# Visualise the target feature (SalePrice) to see whether it looks normal or skewed
# and whether extreme values are present.
# The histogram groups the prices into bins along x; y is the number of houses in each bin.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.hist(train_df['SalePrice'])
# Title and axis labels so the figure can be read on its own.
ax.set_title('Distribution of SalePrice')
ax.set_xlabel('SalePrice')
ax.set_ylabel('Number of houses')
plt.show()

1. It is not a bell curve.
2. The peak lies between 100000 and 200000, whereas for a symmetric distribution we would expect it near the middle of the price range (around 300000 to 400000).
3. It does not look symmetric.
4. The peak is towards the left side and the tail stretches to the right, so the distribution is right-skewed.

### SalePrice is right skewed, and needs to be normalized.

In [None]:
# Look for outliers with the IQR (interquartile range) rule.
# IQR suits skewed data, while the Z-score assumes gaussian (normal) data.

# IQR = Q3 - Q1
# lower bound = Q1 - 1.5 * IQR
# upper bound = Q3 + 1.5 * IQR

# A boxplot draws exactly these bounds as whiskers.
import seaborn as sns
# Pass the series as `x=` explicitly: how a positional argument is interpreted
# (and therefore the orientation of the box) differs between seaborn versions.
sns.boxplot(x=train_df['SalePrice'])

### Prices somewhere from above of 300000 are outliers.

In [None]:
# Apply the 1.5 * IQR rule to SalePrice and count the rows that fall outside the bounds.
sale_price = train_df['SalePrice']
Q1, Q3 = sale_price.quantile(0.25), sale_price.quantile(0.75)
IQR = Q3 - Q1
lowerBound, upperBound = Q1 - 1.5*IQR, Q3 + 1.5*IQR

is_outlier = (sale_price > upperBound) | (sale_price < lowerBound)
outliers = train_df[is_outlier]
len(outliers)

61 records are flagged as outliers here. Since these are genuinely expensive houses and not data errors, we cannot remove them; instead we will apply a log transformation to SalePrice, and likewise to the skewed input features.

# Feature Engineering

In [None]:
# Print the number of missing values of each numerical feature, one count per line.
for missing_count in train_df[numerical].isnull().sum():
    print(missing_count)

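None of the features in the `numerical` list (the int64 columns) have any missing values. Note that columns with any other dtype, including float columns, were grouped with the categorical features above, so their missing values are handled in the next step.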

In [None]:
# Sort the categorical features that contain missing values into two groups:
#   - more than 60% missing -> the column will be dropped
#   - otherwise             -> the missing entries will be replaced with the mode
columns_to_be_dropped = []
columns_missing_to_be_replaced_with_mode = []
missing_share = train_df[categorical].isnull().mean()
for feature, share in missing_share.items():
    if share == 0:
        # nothing missing in this feature
        continue
    destination = columns_to_be_dropped if share > 0.6 else columns_missing_to_be_replaced_with_mode
    destination.append(feature)
print(columns_to_be_dropped)
print(columns_missing_to_be_replaced_with_mode)

In [None]:
# Remove the columns that are mostly missing.
train_df = train_df.drop(columns=columns_to_be_dropped)

In [None]:
# Replace the remaining missing values with each column's mode (most frequent value).
# The result is assigned back to the column: calling fillna(inplace=True) on
# train_df[feature] is chained assignment, which pandas warns about and which no
# longer updates train_df once Copy-on-Write is enabled.
for feature in columns_missing_to_be_replaced_with_mode:
    most_frequent = train_df[feature].mode()[0]
    train_df[feature] = train_df[feature].fillna(most_frequent)

In [None]:
# Re-check: features that still contain missing values (expected to be empty).
remaining_missing = train_df.isnull().sum()
remaining_missing[remaining_missing > 0]

The features in the `numerical` list did not have any missing values.
Features in the `categorical` list with more than 60% missing values have been dropped, and the rest have been filled with their mode, i.e. the most frequent value.

In [None]:
#Encoding
#1. Label Encoding - where each unique value in the feature itself is converted to 0,1,2...(mostly used for preserving the order)
#2. One Hot Encoding - creates new column for each value in the feature, and values are populated with 0 and 1.

In [None]:
# Categorical features that survived the drop above.
# A list comprehension keeps the original column order; a set difference returns the
# names in an arbitrary order, which makes the later column order differ between runs.
filtered_categorical_features = [
    feature for feature in categorical if feature not in columns_to_be_dropped
]
filtered_categorical_features

In [None]:
# Preview the first rows of the remaining categorical features.
train_df[filtered_categorical_features].head()

If the values of these features requires their order to be preserved, we will use LabelEncoder, else for rest One Hot Encoding.
Manually we need to pick them.

In [None]:
# Print the distinct values of every remaining categorical feature.
for feature, values in train_df[filtered_categorical_features].items():
    print(feature, values.unique())

In [None]:
# Make sure MasVnrArea is stored as float, then display it.
train_df = train_df.astype({'MasVnrArea': float})
train_df['MasVnrArea']

In [None]:
# Store LotFrontage and GarageYrBlt as integers, then display them.
int_columns = ['LotFrontage','GarageYrBlt']
train_df[int_columns] = train_df[int_columns].astype(int)
train_df[int_columns]

In [None]:
#Ordinal mapping for 

#KitchenQual ['Gd' 'TA' 'Ex' 'Fa']
#HeatingQC ['Ex' 'Gd' 'TA' 'Fa' 'Po']
#GarageQual ['TA' 'Fa' 'Gd' 'Ex' 'Po']
#GarageCond ['TA' 'Fa' 'Gd' 'Po' 'Ex']
#BsmtQual ['Gd' 'TA' 'Ex' 'Fa']
#FireplaceQu ['Gd' 'TA' 'Fa' 'Ex' 'Po']
#ExterQual ['Gd' 'TA' 'Ex' 'Fa']
#BsmtCond ['TA' 'Gd' 'Fa' 'Po']
#ExterCond ['TA' 'Gd' 'Fa' 'Po' 'Ex']

#LotShape ['Reg' 'IR1' 'IR2' 'IR3']
#BsmtExposure ['No' 'Gd' 'Mn' 'Av']
#PavedDrive ['Y' 'N' 'P']
#BsmtFinType1 ['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' 'LwQ']
#GarageFinish ['RFn' 'Unf' 'Fin']
#LandSlope ['Gtl' 'Mod' 'Sev']
#BsmtFinType2 ['Unf' 'BLQ' 'ALQ' 'Rec' 'LwQ' 'GLQ']
#Functional ['Typ' 'Min1' 'Maj1' 'Min2' 'Mod' 'Maj2' 'Sev']


In [None]:
# Shared Po < Fa < TA < Gd < Ex scale, applied to every quality/condition feature.
# Values that are not keys of quality_map become NaN.
quality_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
quality_features = [
    'KitchenQual', 'HeatingQC', 'GarageQual', 'GarageCond', 'BsmtQual',
    'FireplaceQu', 'BsmtCond', 'ExterCond', 'ExterQual',
]
train_df[quality_features] = train_df[quality_features].apply(
    lambda column: column.map(quality_map)
)
train_df[quality_features]

In [None]:
# Integer code for every level of the remaining ordered categorical features.
# NOTE(review): 'ALQ' is coded below 'Rec' and 'BLQ' in the BsmtFinType maps — confirm
# this ordering against the data description.
LotShape_map = {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3}
BsmtExposure_map = {'No': 0, 'Mn': 1, 'Av': 2, 'Gd': 3}
PavedDrive_map = {'N': 0, 'P': 1, 'Y': 2}
BsmtFinType1_map = {'Unf': 0, 'LwQ': 1, 'ALQ': 2, 'Rec': 3, 'BLQ': 4, 'GLQ': 5}
BsmtFinType2_map = dict(BsmtFinType1_map)
GarageFinish_map = {'Unf': 0, 'RFn': 1, 'Fin': 2}
LandSlope_map = {'Sev': 0, 'Mod': 1, 'Gtl': 2}
Functional_map = {'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7}

# Feature name -> its map; the key order also defines the list of ordinal features.
all_maps = {
    'LotShape': LotShape_map,
    'BsmtExposure': BsmtExposure_map,
    'PavedDrive': PavedDrive_map,
    'BsmtFinType1': BsmtFinType1_map,
    'BsmtFinType2': BsmtFinType2_map,
    'GarageFinish': GarageFinish_map,
    'LandSlope': LandSlope_map,
    'Functional': Functional_map,
}
additional_ordinal_features = list(all_maps)

train_df[additional_ordinal_features] = train_df[additional_ordinal_features].apply(
    lambda column: column.map(all_maps[column.name])
)
train_df[additional_ordinal_features]

In [None]:
categorical_left = list(set(filtered_categorical_features) - set(['GarageYrBlt','LotFrontage','MasVnrArea']) - set(additional_ordinal_features) - set(quality_features))
categorical_left

In [None]:
# Print the distinct values of every feature that is still to be encoded.
for feature, values in train_df[categorical_left].items():
    print(feature, values.unique())

In [None]:
# 'Id' is only a row identifier, so it is removed from the training features.
train_df = train_df.drop(columns=['Id'])

In [None]:
# Find features in which a single value accounts for at least 95% of the rows.
# value_counts(normalize=True) returns shares sorted from most to least frequent,
# so the first entry is the share of the dominant value.
low_variance_features = [
    feature
    for feature in train_df.columns
    if train_df[feature].value_counts(normalize=True).iloc[0] >= 0.95
]
low_variance_features

In [None]:
# Remove the near-constant features and report the new shape.
train_df = train_df.drop(columns=low_variance_features)
train_df.shape

In [None]:
# Forget the categorical features that were just dropped as low-variance.
# An ordered filter keeps categorical_left (and the resulting dummy columns) in a
# reproducible order; a set difference would shuffle it between runs.
categorical_left = [
    feature for feature in categorical_left if feature not in low_variance_features
]

In [None]:
# One-hot encode the remaining categorical features and append the indicator columns.
categorical_frame = train_df[categorical_left]
dummies = pd.get_dummies(categorical_frame, drop_first=False)
train_df = pd.concat((train_df, dummies), axis="columns")
# Snapshot of the column layout at this point; the test pipeline later uses it to
# add any columns that the test data is missing.
train_df2 = train_df.copy()
train_df.shape

In [None]:
# The original categorical columns are now redundant with their indicator columns.
train_df = train_df.drop(columns=categorical_left)
train_df.shape

Encoding (ordinal mapping and one-hot encoding) ends here.

In [None]:
# Turn the year columns into ages relative to the year of sale.
# MoSold is kept as it is, since the month/season of sale may relate to the price.
# All three ages are clipped at 0, the same as in the test pipeline, so that a record
# with a year later than YrSold cannot produce a negative age (which would also
# break the log1p transform applied later).
age_sources = {
    'HouseAge': 'YearBuilt',
    'GarageAge': 'GarageYrBlt',
    'RemodAdd': 'YearRemodAdd',
}
for age_feature, year_column in age_sources.items():
    train_df[age_feature] = (train_df['YrSold'] - train_df[year_column]).clip(lower=0)
train_df = train_df.drop(columns=['YrSold', 'YearBuilt', 'GarageYrBlt', 'YearRemodAdd'])
train_df[['HouseAge','GarageAge','RemodAdd']]

In [None]:
# Frequency of each RemodAdd value (years between remodelling and sale).
train_df['RemodAdd'].value_counts()

In [None]:
# Pairwise correlation between all columns of train_df.
matrix = train_df.corr()
matrix

In [None]:
# Two features are treated as redundant when their absolute correlation is above 0.9;
# for every such pair only one feature is kept (the earlier column is marked for removal).
# - Labels come from the correlation matrix itself, so they always match its positions.
# - The target is skipped on both sides, so SalePrice can never be marked for removal.
# - Each feature is recorded at most once.
correlation_threshold = 0.9
target_column = 'SalePrice'

labels = list(matrix.columns)
strength = matrix.abs().to_numpy()

similar_feats = []
for i, first in enumerate(labels):
    if first == target_column:
        continue
    for j in range(i + 1, len(labels)):
        if labels[j] == target_column:
            continue
        # NaN correlations compare as False and are therefore ignored.
        if strength[i, j] > correlation_threshold:
            similar_feats.append(first)
            break  # one match is enough to mark `first`
similar_feats

In [None]:
# Remove the features that were marked as highly correlated with another feature.
train_df = train_df.drop(columns=similar_feats)

We don't have any pair of features with an absolute correlation above 0.9.

In [None]:
# Normalization (to reduce the skewness of the data).
# log(0) is undefined; to avoid it we use np.log1p(feature), which is log(1 + x).

# Check the skewness first and apply the log transformation only where it exceeds 0.5.
# Only the numerical features need to be checked.
# The candidates are derived from the columns that are actually still present in
# train_df, instead of a hand-maintained list of dropped columns that raises a
# KeyError as soon as it gets out of sync with the earlier steps.
age_features = ['HouseAge', 'GarageAge', 'RemodAdd']
numerical_present = [feature for feature in numerical if feature in train_df.columns]
numerical_present += [
    feature
    for feature in age_features
    if feature in train_df.columns and feature not in numerical_present
]

skewness = train_df[numerical_present].skew()
features_requiring_normalization = skewness[skewness > 0.5].index
features_requiring_normalization

In [None]:
# Visualise the distribution of every feature selected for normalization,
# one histogram per feature on a near-square grid.
import math
import matplotlib.pyplot as plt

n = len(features_requiring_normalization)

if n == 0:
    # Nothing to draw; a 0 x 0 grid cannot be created.
    print("No features require normalization.")
else:
    rows = math.ceil(math.sqrt(n))
    cols = math.ceil(n / rows)

    # squeeze=False always returns a 2-D array of axes, so flatten() also works
    # when the grid is 1 x 1.
    fig, axes = plt.subplots(rows, cols, figsize=(20, 20), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, features_requiring_normalization):
        ax.hist(train_df[col])
        ax.set_title(col)

    # Hide the unused axes at the end of the grid.
    for ax in axes[n:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
# Apply log(1 + x) to every selected feature.
# NOTE: running this cell a second time transforms the columns again.
skewed_columns = list(features_requiring_normalization)
train_df[skewed_columns] = train_df[skewed_columns].apply(np.log1p)
train_df[skewed_columns]

In [None]:
# Everything that was log-transformed, except the target, is also scaled.
# An ordered filter keeps the columns in a reproducible order; a set difference
# would shuffle them between runs.
features_requiring_scaling = [
    feature for feature in features_requiring_normalization if feature != 'SalePrice'
]
features_requiring_scaling

In [None]:
# Bring the selected features to a common scale so that no large-valued feature
# dominates the others. StandardScaler is used because the data has already been
# normalized; otherwise MinMaxScaler would have been used.
# The target and the encoded categorical columns are left unscaled.
# NOTE(review): the scaler is fitted on all training rows before the train/test
# split below, so the hold-out rows influence the scaling statistics.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
scaled_values = sc.fit_transform(train_df[features_requiring_scaling])
train_df[features_requiring_scaling] = scaled_values
train_df[features_requiring_scaling]

### Model Building

In [None]:
# Separate the target column from the feature matrix.
target_column = 'SalePrice'
Y = train_df[target_column]
X = train_df.drop(columns=[target_column])
print(X.shape, Y.shape)

In [None]:
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=32
)
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

In [None]:
# LinearRegression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(x_train, y_train)

y_predicted = model.predict(x_train)
y_test_predicted = model.predict(x_test)

# RMSE = sqrt(MSE). The `squared=False` argument of mean_squared_error is deprecated
# and removed in recent scikit-learn releases, so the root is taken explicitly.
print("Train RMSE: ", np.sqrt(mean_squared_error(y_train, y_predicted)))
print("Test RMSE: ", np.sqrt(mean_squared_error(y_test, y_test_predicted)))


RMSE lower than 0.15 is actually considered good

In [None]:
# Ridge regression: try several regularisation strengths on the hold-out split.
from sklearn.linear_model import Ridge

for a in [0.005,0.05,0.1, 0.3, 1, 3, 10, 30, 100]:
    ridge_model = Ridge(alpha=a)
    ridge_model.fit(x_train, y_train)

    y2_predicted = ridge_model.predict(x_train)
    y2_test_predicted = ridge_model.predict(x_test)

    # Label the scores with their alpha so the output can be read.
    print("alpha: ", a)
    # RMSE = sqrt(MSE); `squared=False` is no longer accepted by recent scikit-learn.
    print("Train RMSE: ", np.sqrt(mean_squared_error(y_train, y2_predicted)))
    print("Test RMSE: ", np.sqrt(mean_squared_error(y_test, y2_test_predicted)))


In [None]:
# Lasso regression: try several regularisation strengths on the hold-out split.
from sklearn.linear_model import Lasso

for a in [0.001,0.1, 0.3, 1, 3, 10, 30, 100]:
    lasso_model = Lasso(alpha=a)
    lasso_model.fit(x_train, y_train)

    y3_predicted = lasso_model.predict(x_train)
    y3_test_predicted = lasso_model.predict(x_test)

    # Label the scores with their alpha so the output can be read.
    print("alpha: ", a)
    # RMSE = sqrt(MSE); `squared=False` is no longer accepted by recent scikit-learn.
    print("Train RMSE: ", np.sqrt(mean_squared_error(y_train, y3_predicted)))
    print("Test RMSE: ", np.sqrt(mean_squared_error(y_test, y3_test_predicted)))


In [None]:
# XGBoost model
from xgboost import XGBRegressor

xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    reg_lambda=1.0
)
xgb_model.fit(x_train, y_train)

y4_predicted = xgb_model.predict(x_train)
y4_test_predicted = xgb_model.predict(x_test)

# RMSE = sqrt(MSE); `squared=False` is no longer accepted by recent scikit-learn.
print("Train RMSE: ", np.sqrt(mean_squared_error(y_train, y4_predicted)))
print("Test RMSE: ", np.sqrt(mean_squared_error(y_test, y4_test_predicted)))


In [None]:
# LightGBM model
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=-1,
    # NOTE(review): row subsampling may have no effect unless subsample_freq is
    # also set — confirm against the LightGBM documentation.
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
lgbm.fit(x_train, y_train)

y5_predicted = lgbm.predict(x_train)
y5_test_predicted = lgbm.predict(x_test)

# RMSE = sqrt(MSE); `squared=False` is no longer accepted by recent scikit-learn.
print("Train RMSE: ", np.sqrt(mean_squared_error(y_train, y5_predicted)))
print("Test RMSE: ", np.sqrt(mean_squared_error(y_test, y5_test_predicted)))


In [None]:
# Gradient boosting model
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)
gbr.fit(x_train, y_train)

y6_predicted = gbr.predict(x_train)
y6_test_predicted = gbr.predict(x_test)

# RMSE = sqrt(MSE); `squared=False` is no longer accepted by recent scikit-learn.
print("Train RMSE: ", np.sqrt(mean_squared_error(y_train, y6_predicted)))
print("Test RMSE: ", np.sqrt(mean_squared_error(y_test, y6_test_predicted)))


In [None]:
# Support vector regression model
from sklearn.svm import SVR

svr = SVR(C=20, epsilon=0.01, kernel='rbf')
svr.fit(x_train, y_train)

y7_predicted = svr.predict(x_train)
y7_test_predicted = svr.predict(x_test)

# RMSE = sqrt(MSE); `squared=False` is no longer accepted by recent scikit-learn.
print("Train RMSE: ", np.sqrt(mean_squared_error(y_train, y7_predicted)))
print("Test RMSE: ", np.sqrt(mean_squared_error(y_test, y7_test_predicted)))


In [None]:
# K-nearest-neighbours regressor
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(x_train, y_train)

y8_predicted = knn.predict(x_train)
y8_test_predicted = knn.predict(x_test)

# RMSE = sqrt(MSE); `squared=False` is no longer accepted by recent scikit-learn.
print("Train RMSE: ", np.sqrt(mean_squared_error(y_train, y8_predicted)))
print("Test RMSE: ", np.sqrt(mean_squared_error(y_test, y8_test_predicted)))


Ridge has the lowest RMSE on the test data, so picking it.

In [None]:
# Ridge regression trained on the entire X, Y (i.e. before the split).
# NOTE(review): x_train and x_test are both subsets of X, so every score printed
# here is measured on rows the model has already seen. These numbers cannot be
# used to compare alphas fairly; the hold-out results of the earlier Ridge cell can.
from sklearn.linear_model import Ridge

for a in [0.001,0.05,0.1, 0.3, 1, 3, 10, 30, 100]:
    ridge_model = Ridge(alpha=a)
    ridge_model.fit(X, Y)

    y10_predicted = ridge_model.predict(x_train)
    y10_test_predicted = ridge_model.predict(x_test)

    # Label the scores with their alpha so the output can be read.
    print("alpha: ", a)
    # RMSE = sqrt(MSE); `squared=False` is no longer accepted by recent scikit-learn.
    print("Train RMSE: ", np.sqrt(mean_squared_error(y_train, y10_predicted)))
    print("Test RMSE: ", np.sqrt(mean_squared_error(y_test, y10_test_predicted)))


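alpha = 0.001 has the lowest RMSE here, so it is picked for the final model. Note that in the cell above the model is fitted on all of X, which contains both x_train and x_test, so both scores are measured on rows the model was trained on and will tend to favour the weakest regularisation.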

In [None]:
# Final model: Ridge with the selected alpha, fitted on all training rows.
ridge_model = Ridge(alpha=0.001)
ridge_model.fit(X, Y)

y10_predicted = ridge_model.predict(x_train)
y10_test_predicted = ridge_model.predict(x_test)

# Both scores are in-sample here, because x_train and x_test are subsets of X.
# RMSE = sqrt(MSE); `squared=False` is no longer accepted by recent scikit-learn.
print("Train RMSE: ", np.sqrt(mean_squared_error(y_train, y10_predicted)))
print("Test RMSE: ", np.sqrt(mean_squared_error(y_test, y10_test_predicted)))


In [None]:
# The same preprocessing steps are now repeated on test.csv.
test_df = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
)
test_df.shape

In [None]:
# Split the test columns by dtype: int64 and float64 count as numerical here.
# NOTE: this overwrites the `numerical` / `categorical` lists built from the training data.
numeric_dtypes = ('int64', 'float64')
numerical = [feature for feature in test_df.columns if test_df[feature].dtype in numeric_dtypes]
categorical = [feature for feature in test_df.columns if feature not in numerical]

print("Numerical: ", numerical)
print("Categorical: ", categorical)

In [None]:
# Fill the missing values of the numerical test features with the column mean.
# The target is excluded if present; the ordered filter keeps the list order stable.
numerical = [feature for feature in numerical if feature != 'SalePrice']
for feature in numerical:
    if test_df[feature].isnull().any():
        # Assign back instead of fillna(inplace=True) on test_df[feature]: the chained
        # in-place form no longer updates test_df under pandas Copy-on-Write.
        test_df[feature] = test_df[feature].fillna(test_df[feature].mean())

In [None]:

# Drop the same mostly-missing columns that were dropped from the training data.
test_df = test_df.drop(columns=columns_to_be_dropped)

# Replace missing values with the mode for the columns that were mode-filled in training.
# Assign back instead of fillna(inplace=True) on test_df[feature]: the chained
# in-place form no longer updates test_df under pandas Copy-on-Write.
for feature in columns_missing_to_be_replaced_with_mode:
    test_df[feature] = test_df[feature].fillna(test_df[feature].mode()[0])

In [None]:
# Features of the test data that still contain missing values after the steps above.
# isnull().any() gives a plain boolean per column. The previous condition indexed a
# numpy scalar with a boolean and relied on the truth value of a possibly empty
# array, which recent NumPy versions reject with an error.
additional_missing_cat = [
    feature for feature in test_df.columns if test_df[feature].isnull().any()
]

In [None]:
# Display the features that still have missing values in the test data.
additional_missing_cat

In [None]:
# Fill the remaining gaps in the test data with each column's mode.
# Assign back instead of fillna(inplace=True) on test_df[feature]: the chained
# in-place form no longer updates test_df under pandas Copy-on-Write.
for feature in additional_missing_cat:
    test_df[feature] = test_df[feature].fillna(test_df[feature].mode()[0])

In [None]:
# Re-check: test features that still contain missing values (expected to be empty).
remaining_test_missing = test_df.isnull().sum()
remaining_test_missing[remaining_test_missing > 0]

In [None]:
# Same dtype casts as for the training data.
test_df = test_df.astype({'MasVnrArea': float, 'LotFrontage': int, 'GarageYrBlt': int})

In [None]:
# Encode the quality/condition features of the test data with the quality_map and
# quality_features defined in the training section. Reusing them (instead of
# re-declaring copies here) guarantees train and test are encoded identically.
for feature in quality_features:
    test_df[feature] = test_df[feature].map(quality_map)
test_df[quality_features]

In [None]:
# Encode the ordinal features of the test data with the all_maps dictionary and the
# additional_ordinal_features list defined in the training section. Reusing them
# (instead of re-declaring copies here) guarantees train and test are encoded
# identically and that the two definitions cannot drift apart.
for feature in additional_ordinal_features:
    test_df[feature] = test_df[feature].map(all_maps[feature])
test_df[additional_ordinal_features]

In [None]:
# Remove the same low-variance columns that were removed from the training data.
test_df = test_df.drop(columns=low_variance_features)
test_df.shape

In [None]:
# One-hot encode the same categorical features as in training and append the result.
test_categorical_frame = test_df[categorical_left]
dummies = pd.get_dummies(test_categorical_frame, drop_first=False)
test_df = pd.concat((test_df, dummies), axis="columns")
test_df.shape

In [None]:
# train.csv and test.csv contain different category values, so pd.get_dummies can
# produce different indicator columns for each, which is expected.
# Every column that the training snapshot has and the test data lacks is added to the
# test data filled with 0, so the trained model can be applied to it.
# (This also adds a placeholder SalePrice column, which is dropped again before prediction.)
# The columns are collected in training order and appended in one concat: a set
# difference has arbitrary order, and inserting columns one at a time fragments the frame.
align_cols = [column for column in train_df2.columns if column not in test_df.columns]
if align_cols:
    zero_block = pd.DataFrame(0, index=test_df.index, columns=align_cols)
    test_df = pd.concat([test_df, zero_block], axis=1)

In [None]:
# Shape of the test data after the missing training columns were added.
test_df.shape

In [None]:
# The original categorical columns are now redundant with their indicator columns.
test_df = test_df.drop(columns=categorical_left)
test_df.shape

In [None]:
# Turn the year columns into ages relative to the year of sale, as for the training data.
# MoSold is kept as it is, since the month/season of sale may relate to the price.
sold_year = test_df['YrSold']
test_df['HouseAge'] = sold_year - test_df['YearBuilt']
test_df['GarageAge'] = sold_year - test_df['GarageYrBlt']
# negative values are clipped to 0
test_df['RemodAdd'] = (sold_year - test_df['YearRemodAdd']).clip(lower=0)
test_df = test_df.drop(columns=['YrSold', 'YearBuilt', 'GarageYrBlt', 'YearRemodAdd'])

test_df[['HouseAge','GarageAge','RemodAdd']]

In [None]:
# Negative house and garage ages are clipped to 0 as well.
for age_feature in ("HouseAge", "GarageAge"):
    test_df[age_feature] = test_df[age_feature].clip(lower=0)

In [None]:
# Remove the same highly correlated features that were removed from the training data.
test_df = test_df.drop(columns=similar_feats)

In [None]:
# Apply the same log(1 + x) transform to the test features. The target is excluded
# because the test data has no real SalePrice.
# An ordered filter keeps the columns in a reproducible order; a set difference
# would shuffle them between runs.
features_requiring_normalization = [
    feature for feature in features_requiring_normalization if feature != 'SalePrice'
]
test_df[features_requiring_normalization] = np.log1p(test_df[features_requiring_normalization])
test_df[features_requiring_normalization]

In [None]:
# Scale the test features with the scaler that was fitted on the training data
# (transform only, no re-fitting). The target and the encoded categorical columns
# are left unscaled.
scaled_test_values = sc.transform(test_df[features_requiring_scaling])
test_df[features_requiring_scaling] = scaled_test_values
test_df[features_requiring_scaling]

In [None]:
# Keep the Id column for the submission and remove the non-feature columns.
Id = test_df['Id']
# SalePrice is only present in test_df as a placeholder added by the column
# alignment step; errors='ignore' keeps this cell working if it is absent.
feats = test_df.drop(columns=['Id', 'SalePrice'], errors='ignore')
print(feats.shape, Id.shape)

In [None]:
# The column order may also differ, e.g. train is A,B,C,D while test is D,A,C,B,
# so the test features are aligned to the training columns.
# join="left" keeps exactly the columns of X, in X's order.
# fill_value=0 makes a training column that is absent from the test data an
# all-zero column instead of NaN, which the model could not predict on.
X2, feats2 = X.align(feats, join="left", axis=1, fill_value=0)
assert list(feats2.columns) == list(X.columns), "test features are not aligned with the training features"

In [None]:
# Columns of the aligned test feature matrix.
feats2.columns

In [None]:
# Predict with the final Ridge model on the aligned test features.
sub_predict = ridge_model.predict(feats2)

In [None]:
# Raw model output, before np.expm1 is applied in the next cell.
sub_predict

In [None]:
# Build the submission table: one row per test Id with its predicted price.
# np.expm1 is the inverse of np.log1p, so it converts the predictions back to prices.
predicted_prices = np.expm1(sub_predict)
submission = pd.DataFrame({"Id": Id, "SalePrice": predicted_prices})

submission.head()

In [None]:
# Write the submission file to the current working directory, without the index column.
submission.to_csv("submission.csv", index=False)