In [None]:
!pip install kagglehub


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from scipy.special import boxcox1p
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso, Ridge
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score

# Read the training and test tables from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Remove training rows that pair a very large living area with a low price.
outlier_mask = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train = train.drop(index=train.index[outlier_mask])

# The models are fitted on log(1 + SalePrice); predictions are mapped back
# with expm1 at the end of the script.
train["SalePrice"] = np.log1p(train["SalePrice"])
y_train = train["SalePrice"].to_numpy()

# Keep the test identifiers for the submission file.
test_id = test['Id']

# Row counts are remembered so the stacked frame can be split again later.
ntrain = len(train)
ntest = len(test)

# Stack train on top of test so both get identical preprocessing, then drop
# the target and the identifier so only predictor columns remain.
all_data = (
    pd.concat([train, test], ignore_index=True)
    .drop(columns=['SalePrice', 'Id'])
)

# Categorical columns whose missing entries are replaced by the literal
# string "None" (presumably NaN means the feature is absent -- confirm
# against the data description).
none_filled_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
]
for name in none_filled_cols:
    all_data[name] = all_data[name].fillna("None")

# Numeric garage/basement columns whose missing entries are replaced by 0.
zero_filled_cols = [
    'GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
]
for name in zero_filled_cols:
    all_data[name] = all_data[name].fillna(0)

# LotFrontage: fill each gap with the median of the same Neighborhood.
frontage_by_neighborhood = all_data.groupby("Neighborhood")['LotFrontage']
all_data['LotFrontage'] = frontage_by_neighborhood.transform(
    lambda values: values.fillna(values.median()))

# Columns filled with their most frequent value.
for name in ('MSZoning', 'Electrical', 'KitchenQual',
             'Exterior1st', 'Exterior2nd', 'SaleType'):
    most_common = all_data[name].mode()[0]
    all_data[name] = all_data[name].fillna(most_common)

# Columns filled with a fixed default category.
for name, default in (('Functional', "Typ"), ('Utilities', "AllPub")):
    all_data[name] = all_data[name].fillna(default)

# Turn numerically coded fields into strings so they are treated as
# categories rather than quantities.
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)
for code_col in ('OverallCond', 'YrSold', 'MoSold'):
    all_data[code_col] = all_data[code_col].astype(str)

# Columns replaced by integer codes; LabelEncoder assigns the codes from the
# sorted unique values of each column.
cols = (
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street',
    'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold',
)
for column in cols:
    raw_values = list(all_data[column].values)
    encoder = LabelEncoder()
    all_data[column] = encoder.fit_transform(raw_values)

# Total floor area: basement plus first and second floors.
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# Select numeric columns explicitly (rather than "anything that is not
# object") so string or other non-numeric dtypes never reach skew().
numeric_feats = all_data.select_dtypes(include=[np.number]).columns

# Sample skewness of every numeric column, ignoring missing values.
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew': skewed_feats})

# Keep only the strongly skewed columns. The mask must be a boolean Series:
# indexing a DataFrame with a boolean DataFrame only blanks the non-matching
# cells to NaN and keeps every row, which would make the loop below transform
# all numeric columns instead of just the skewed ones.
skewness = skewness[skewness['Skew'].abs() > 0.75]
skewed_features = skewness.index

# Box-Cox transform of (1 + x) with a fixed lambda on the selected columns.
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)

# One-hot encode the remaining categorical columns, then cut the stacked
# frame back into its training and test parts by row position.
all_data = pd.get_dummies(all_data)
train_new = all_data.iloc[:ntrain]
test_new = all_data.iloc[ntrain:]

def _robust_linear(estimator):
    """Return a pipeline: median imputation, robust scaling, then ``estimator``."""
    return make_pipeline(SimpleImputer(strategy='median'), RobustScaler(), estimator)


# Two regularised linear models sharing the same preprocessing steps.
lasso = _robust_linear(Lasso(alpha=0.0005, random_state=1))
ridge = _robust_linear(Ridge(alpha=0.6))

# Gradient-boosted trees; fitted directly on the feature frame without the
# imputation/scaling steps used for the linear models.
xgb_settings = dict(
    learning_rate=0.05,
    n_estimators=3000,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=-1,
    random_state=1,
)
xgb = XGBRegressor(**xgb_settings)

# Fit the three models one after another on the full training matrix,
# announcing each one before it starts.
training_plan = (
    ("Training Lasso...", lasso),
    ("Training Ridge...", ridge),
    ("Training XGBoost...", xgb),
)
fitted_models = []
for message, estimator in training_plan:
    print(message)
    fitted_models.append(estimator.fit(train_new, y_train))
lasso_model, ridge_model, xgb_model = fitted_models

# Predict the log-scale target for the test rows with each fitted model.
print("Generating Predictions...")
pred_lasso, pred_ridge, pred_xgb = [
    model.predict(test_new) for model in fitted_models
]

# Weighted blend of the three log-scale predictions.
weight_lasso = 0.35
weight_ridge = 0.35
weight_xgb = 0.30
final_pred_log = (weight_lasso * pred_lasso) + (weight_ridge * pred_ridge) + (weight_xgb * pred_xgb)

# Undo the log1p applied to the target to get prices on the original scale.
final_pred_real = np.expm1(final_pred_log)

# Build the submission table (test Id + predicted price) and write it out
# without the index column.
sub = pd.DataFrame({'Id': test_id, 'SalePrice': final_pred_real})
sub.to_csv('submission_top_tier.csv', index=False)

print("DONE! Download 'submission_top_tier.csv' and submit.")

Training Lasso...
Training Ridge...
Training XGBoost...
Generating Predictions...
DONE! Download 'submission_top_tier.csv' and submit.
