In [490]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score,mean_squared_error

In [491]:
train_df = pd.read_csv("train (2).csv")
test_df = pd.read_csv("test (2).csv")

In [492]:
label = train_df["SalePrice"]
train_df.drop("SalePrice",axis=1,inplace=True)
train_df.drop("Id",inplace=True,axis=1)
test_df.drop("Id",inplace=True,axis=1)

In [493]:
# check the numbers of categorical features in train_df

cat_cols = train_df.select_dtypes(include='object').columns.tolist()
print(f"Number of categorical features: {len(cat_cols)}")

# prompt: to chech the numbers of numeric features in train_df

numeric_cols = train_df.select_dtypes(include=['float64', 'int64']).columns.tolist()
print(f"Number of numeric features: {len(numeric_cols)}")

Number of categorical features: 43
Number of numeric features: 36


In [494]:
combined_df = pd.concat([train_df, test_df], axis=0)

In [495]:
def cat_num(data):
    # check the numbers of categorical features in train_df

    cat_cols = data.select_dtypes(include='object').columns.tolist()
    print(f"Number of categorical features: {len(cat_cols)}")
    cat_col_indices = list(data.columns.get_loc(col) for col in cat_cols)

    # prompt: to chech the numbers of numeric features in train_df

    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    print(f"Number of numeric features: {len(numeric_cols)}")
    numeric_cols_indices = list(data.columns.get_loc(col) for col in numeric_cols)
    
  
    return cat_col_indices, numeric_cols_indices

In [496]:
cat_col_indices, numeric_cols_indices = cat_num(combined_df)

Number of categorical features: 43
Number of numeric features: 36


In [497]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [498]:
cat_col_indices_values_df = pd.DataFrame(combined_df.iloc[:, cat_col_indices].values, columns=combined_df.iloc[:, cat_col_indices].columns)
cat_col_indices_values_df.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [499]:
numeric_cols_indices_values_df = pd.DataFrame(combined_df.iloc[:, numeric_cols_indices].values, columns=combined_df.iloc[:, numeric_cols_indices].columns)
numeric_cols_indices_values_df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,0.0,...,548.0,0.0,61.0,0.0,0.0,0.0,0.0,0.0,2.0,2008.0
1,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,0.0,...,460.0,298.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
2,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,0.0,...,608.0,0.0,42.0,0.0,0.0,0.0,0.0,0.0,9.0,2008.0
3,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,0.0,...,642.0,0.0,35.0,272.0,0.0,0.0,0.0,0.0,2.0,2006.0
4,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,0.0,...,836.0,192.0,84.0,0.0,0.0,0.0,0.0,0.0,12.0,2008.0


In [500]:
kill = []
for i in cat_col_indices_values_df.columns:
    kill.append(list(cat_col_indices_values_df[i].unique()))

print(kill)

[['RL', 'RM', 'C (all)', 'FV', 'RH', nan], ['Pave', 'Grvl'], [nan, 'Grvl', 'Pave'], ['Reg', 'IR1', 'IR2', 'IR3'], ['Lvl', 'Bnk', 'Low', 'HLS'], ['AllPub', 'NoSeWa', nan], ['Inside', 'FR2', 'Corner', 'CulDSac', 'FR3'], ['Gtl', 'Mod', 'Sev'], ['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst', 'NWAmes', 'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes', 'SawyerW', 'IDOTRR', 'MeadowV', 'Edwards', 'Timber', 'Gilbert', 'StoneBr', 'ClearCr', 'NPkVill', 'Blmngtn', 'BrDale', 'SWISU', 'Blueste'], ['Norm', 'Feedr', 'PosN', 'Artery', 'RRAe', 'RRNn', 'RRAn', 'PosA', 'RRNe'], ['Norm', 'Artery', 'RRNn', 'Feedr', 'PosN', 'PosA', 'RRAn', 'RRAe'], ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'], ['2Story', '1Story', '1.5Fin', '1.5Unf', 'SFoyer', 'SLvl', '2.5Unf', '2.5Fin'], ['Gable', 'Hip', 'Gambrel', 'Mansard', 'Flat', 'Shed'], ['CompShg', 'WdShngl', 'Metal', 'WdShake', 'Membran', 'Tar&Grv', 'Roll', 'ClyTile'], ['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing', 'CemntBd

In [501]:
counter = 0
for i in kill:
    one = len(i)
    # print(one)
    counter += one
    
print(counter)

274


In [502]:
feature_map = {}
idx = 0

# Iterate over each list of categories
for category_list in kill:
# Iterate over each category in the list
    for category in category_list:
        # Assign unique index to the category and update the index
        feature_map[category] = idx
        idx += 1

print(feature_map)


{'RL': 0, 'RM': 1, 'C (all)': 2, 'FV': 3, 'RH': 4, nan: 267, 'Pave': 10, 'Grvl': 9, 'Reg': 11, 'IR1': 12, 'IR2': 13, 'IR3': 14, 'Lvl': 15, 'Bnk': 16, 'Low': 17, 'HLS': 18, 'AllPub': 19, 'NoSeWa': 20, 'Inside': 22, 'FR2': 23, 'Corner': 24, 'CulDSac': 25, 'FR3': 26, 'Gtl': 27, 'Mod': 208, 'Sev': 210, 'CollgCr': 30, 'Veenker': 31, 'Crawfor': 32, 'NoRidge': 33, 'Mitchel': 34, 'Somerst': 35, 'NWAmes': 36, 'OldTown': 37, 'BrkSide': 38, 'Sawyer': 39, 'NridgHt': 40, 'NAmes': 41, 'SawyerW': 42, 'IDOTRR': 43, 'MeadowV': 44, 'Edwards': 45, 'Timber': 46, 'Gilbert': 47, 'StoneBr': 48, 'ClearCr': 49, 'NPkVill': 50, 'Blmngtn': 51, 'BrDale': 52, 'SWISU': 53, 'Blueste': 54, 'Norm': 64, 'Feedr': 67, 'PosN': 68, 'Artery': 65, 'RRAe': 71, 'RRNn': 66, 'RRAn': 70, 'PosA': 69, 'RRNe': 63, '1Fam': 72, '2fmCon': 73, 'Duplex': 74, 'TwnhsE': 75, 'Twnhs': 76, '2Story': 77, '1Story': 78, '1.5Fin': 79, '1.5Unf': 80, 'SFoyer': 81, 'SLvl': 82, '2.5Unf': 83, '2.5Fin': 84, 'Gable': 85, 'Hip': 86, 'Gambrel': 87, 'Mansar

In [503]:
# Map the values in train_df to numerical indices using feature_map
combined_df = combined_df.applymap(lambda x: feature_map.get(x, x))

print(combined_df.head())

   MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \
0          60         0         65.0     8450      10    267        11   
1          20         0         80.0     9600      10    267        11   
2          60         0         68.0    11250      10    267        12   
3          70         0         60.0     9550      10    267        12   
4          60         0         84.0    14260      10    267        12   

   LandContour  Utilities  LotConfig  ...  ScreenPorch  PoolArea  PoolQC  \
0           15         19         22  ...            0         0     267   
1           15         19         23  ...            0         0     267   
2           15         19         22  ...            0         0     267   
3           15         19         24  ...            0         0     267   
4           15         19         23  ...            0         0     267   

   Fence  MiscFeature  MiscVal  MoSold  YrSold  SaleType  SaleCondition  
0    267          267   

In [533]:
# import torch.nn.functional as F

class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attention = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, src, src_mask=None):
        src2, _ = self.self_attention(src, src, src, attn_mask=src_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.ffn(src)
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, n_heads, d_ff, dropout=0.1):
        super(Encoder, self).__init__()
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(num_layers)
        ])

    def forward(self, src, src_mask=None):
        for layer in self.layers:
            src = layer(src, src_mask)
        return src

In [539]:
src_input = mapped_cat_col_indices_values_df.values
src_input = torch.Tensor(src_input)

NameError: name 'mapped_cat_col_indices_values_df' is not defined

In [505]:
# Example usage:
# Define parameters
num_layers = 6
d_model = len(cat_cols)
n_heads = 1
d_ff = 2048
dropout = 0.5
src_seq_len = len(cat_cols)
batch_size = 32
vocab_size = counter

In [506]:
torch.manual_seed(42)

# Create encoder instance
encoder = Encoder(num_layers, d_model, n_heads, d_ff, dropout)

# Example input tensor
# src_input = torch.randn(batch_size, src_seq_len, d_model)

In [538]:
try:
    import torchinfo
except:
    !pip install torchinfo
    import torchinfo
    
from torchinfo import summary
summary(encoder,input_size = [src_input.shape])

Layer (type:depth-idx)                   Output Shape              Param #
Encoder                                  [2919, 43]                --
├─ModuleList: 1-1                        --                        --
│    └─EncoderLayer: 2-1                 [2919, 43]                --
│    │    └─MultiheadAttention: 3-1      [2919, 43]                7,568
│    │    └─Dropout: 3-2                 [2919, 43]                --
│    │    └─LayerNorm: 3-3               [2919, 43]                86
│    │    └─Sequential: 3-4              [2919, 43]                178,219
│    │    └─Dropout: 3-5                 [2919, 43]                --
│    │    └─LayerNorm: 3-6               [2919, 43]                86
│    └─EncoderLayer: 2-2                 [2919, 43]                --
│    │    └─MultiheadAttention: 3-7      [2919, 43]                7,568
│    │    └─Dropout: 3-8                 [2919, 43]                --
│    │    └─LayerNorm: 3-9               [2919, 43]                86
│   

In [507]:
cat_col_indices_values_df.values[0]

array(['RL', 'Pave', nan, 'Reg', 'Lvl', 'AllPub', 'Inside', 'Gtl',
       'CollgCr', 'Norm', 'Norm', '1Fam', '2Story', 'Gable', 'CompShg',
       'VinylSd', 'VinylSd', 'BrkFace', 'Gd', 'TA', 'PConc', 'Gd', 'TA',
       'No', 'GLQ', 'Unf', 'GasA', 'Ex', 'Y', 'SBrkr', 'Gd', 'Typ', nan,
       'Attchd', 'RFn', 'TA', 'TA', 'Y', nan, nan, nan, 'WD', 'Normal'],
      dtype=object)

In [508]:
# torch.manual_seed(42)
# # Example input tensor
# src_input = cat_col_indices_values_df.values
# src_input = torch.Tensor(src_input)
# print("Source Input",src_input)
# # enc=encoder(src_input)

In [509]:
def min_max_scaling(data):
    min_val = data.min()
    max_val = data.max()
    scaled_data = (data - min_val) / (max_val - min_val)
    return scaled_data


In [510]:
def z_score_scaling(data):
    mean = data.mean()
    std_dev = data.std()
    scaled_data = (data - mean) / std_dev
    return scaled_data


In [511]:
import numpy as np

In [512]:
val = np.nan
val

nan

In [513]:
if scaled_data == val:
    scaled_data = 0
    print(scaled_data)

In [514]:
scaled_data

0

In [515]:
def robust_scaling(data):

    median = np.median(data, axis=0)
    q1 = np.percentile(data, 25, axis=0)
    q3 = np.percentile(data, 75, axis=0)
    iqr = q3 - q1
    scaled_data = (data - median) / iqr
    val = np.nan
    for i in data:
        if i == val:
            i = 0
    return scaled_data

In [516]:
for i in combined_df:
    if combined_df[i].isnull().sum() == 0:
        continue
    else:
        combined_df[i].fillna((combined_df[i].mean()), inplace=True)

In [517]:
def preprocess(train_df,cat_col_indices=cat_col_indices,
               numeric_cols_indices=numeric_cols_indices,
               scaling=z_score_scaling,
               feature_map=feature_map,
               encoder=encoder,random_seed=42):
    
    cat_col_indices_values_df = pd.DataFrame(train_df.iloc[:, cat_col_indices].values, columns=train_df.iloc[:, cat_col_indices].columns)
    # print("Categorical features\n",cat_col_indices_values_df.head())
    numeric_cols_indices_values_df = pd.DataFrame(train_df.iloc[:, numeric_cols_indices].values, columns=train_df.iloc[:, numeric_cols_indices].columns)
    # print("Continouns Variable",numeric_cols_indices_values_df.head())
    
    # Map the values in train_df to numerical indices using feature_map
    mapped_cat_col_indices_values_df = cat_col_indices_values_df.applymap(lambda x: feature_map.get(x, x))
    

    # print("Mapped Cat_col",mapped_cat_col_indices_values_df.head())
   
    # Example input tensor
    src_input = mapped_cat_col_indices_values_df.values
    src_input = torch.Tensor(src_input)
    # print("Source Input",src_input)
    
    torch.manual_seed(42)
    new = []
    enc_values = []
    
    for i in np.arange(len(src_input)):
        new.append(encoder(src_input[i].unsqueeze(dim=0)))
        

    for i in new:
        enc_values.append(i.squeeze().detach().numpy())
        
    enc_values = pd.DataFrame(enc_values)
    enc_values.head()
    # print(enc_values[:5])
    enc_values.columns = cat_col_indices_values_df.columns
        
    if scaling == min_max_scaling:
        scale = numeric_cols_indices_values_df.apply(min_max_scaling)
        # print("Categorical Value Scaling",scale.head())
        
    elif scaling == z_score_scaling:
        scale = numeric_cols_indices_values_df.apply(z_score_scaling)
        # print("Numerical Value scaling",scale.head())
        
    elif scaling == robust_scaling:
        scale = numeric_cols_indices_values_df.apply(robust_scaling)
    
    else:
         scale = numeric_cols_indices_values_df.apply(scale)
        
    df = pd.concat([enc_values,scale],axis=1)
    
    return df

In [518]:
import numpy as np

In [519]:
from sklearn.model_selection import train_test_split

In [520]:
def run_model(model,X_train=X_train,y_train=y_train,X_test=X_val,y_test=y_val):

    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_r2_score = r2_score(y_train,y_train_pred).round(4)
    model_mse = mean_squared_error(y_train,y_train_pred).round(4)
    model_rmse = np.sqrt(model_mse).round(4)

    model_test_r2_score = r2_score(y_test,y_test_pred).round(4)
    model_test_mse = mean_squared_error(y_test,y_test_pred).round(4)
    model_test_rmse = np.sqrt(model_test_mse).round(4)

    print("Model Performance For Traning Set")
    print("--"*5)
    print("r2_score: ", model_r2_score)
    print("mean squared error: ", model_mse)
    print("rmse: ", model_test_rmse)
    print("--"*5)

    print("Model Performance For Test Set")
    print("--"*5)
    print("r2_score: ", model_test_r2_score)
    print("mean squared error: ", model_test_mse)
    print("rmse: ", model_test_rmse)
    print("--"*5)
    print(model)
    print("__"*5)

    model_name = model.__repr__()

    # Check if the length of the model name is greater than 20
    if len(model_name) > 20:
        # Take the first ten letters of the model name
        model_name = model_name[:10]

    return {"Model Name" : model_name,
            "r2_score" : model_test_r2_score,
            "mean squared error" : model_test_mse,
            "rmse" : model_test_rmse}


### STOP

In [524]:
from sklearn.linear_model import LassoCV
las = LassoCV(random_state = 24,cv=5)

run_model(las)
pred = las.predict(test)
# submission_df = pd.DataFrame({'Id': test_id, 'SalePrice': pred})
# submission_df.to_csv('lasso.csv', index=False)

Model Performance For Traning Set
----------
r2_score:  0.7889
mean squared error:  1281968999.7079
rmse:  36073.9351
----------
Model Performance For Test Set
----------
r2_score:  0.8142
mean squared error:  1301328793.253
rmse:  36073.9351
----------
LassoCV(cv=5, random_state=24)
__________


In [525]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

In [531]:
# for name, model in models.items():
#     print(f"Training {name}...")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for train_index, val_index in kf.split(train_df):
   
    X_train, X_val = train_df.iloc[train_index], train_df.iloc[val_index]
    y_train, y_val = label.iloc[train_index], label.iloc[val_index]

    # model.fit(X_train, y_train)
    # y_pred = model.predict(X_val)
    # rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    # scores.append(rmse)
    run_model(rfr)
    

Model Performance For Traning Set
----------
r2_score:  0.9481
mean squared error:  314962278.8715
rmse:  30715.8475
----------
Model Performance For Test Set
----------
r2_score:  0.8653
mean squared error:  943463286.5488
rmse:  30715.8475
----------
RandomForestRegressor(max_leaf_nodes=60, n_estimators=200)
__________
Model Performance For Traning Set
----------
r2_score:  0.9485
mean squared error:  312746411.3635
rmse:  30524.0059
----------
Model Performance For Test Set
----------
r2_score:  0.867
mean squared error:  931714936.0827
rmse:  30524.0059
----------
RandomForestRegressor(max_leaf_nodes=60, n_estimators=200)
__________
Model Performance For Traning Set
----------
r2_score:  0.9493
mean squared error:  307572074.4734
rmse:  30656.9444
----------
Model Performance For Test Set
----------
r2_score:  0.8658
mean squared error:  939848237.0913
rmse:  30656.9444
----------
RandomForestRegressor(max_leaf_nodes=60, n_estimators=200)
__________
Model Performance For Traning Se

In [532]:
run_model(rfr)

Model Performance For Traning Set
----------
r2_score:  0.9489
mean squared error:  310439628.5097
rmse:  30327.904
----------
Model Performance For Test Set
----------
r2_score:  0.8687
mean squared error:  919781763.0317
rmse:  30327.904
----------
RandomForestRegressor(max_leaf_nodes=60, n_estimators=200)
__________


{'Model Name': 'RandomFore',
 'r2_score': 0.8687,
 'mean squared error': 919781763.0317,
 'rmse': 30327.904}

In [None]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

In [None]:
models = {
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'XGBoost': xgb.XGBRegressor(),
    'LightGBM': lgb.LGBMRegressor(),
    'CatBoost': cb.CatBoostRegressor(silent=True)
}


In [None]:
data = preprocess(combined_df,scaling=robust_scaling)
data.head()

In [None]:
missing=data.isnull().sum().sort_values(ascending=False)
missing=missing.drop(missing[missing==0].index)
df = pd.DataFrame(missing)
data.drop(df.T.columns,inplace=True,axis=1)
# train_df.drop("Id",inplace=True,axis=1)
data.head()

In [None]:
train_df = data[:train_df.shape[0]]
test_df = data[train_df.shape[0]:]

In [None]:
train.shape

In [None]:
test.shape

In [None]:
X_train, X_val, y_train, y_val = train_test_split(train_df,label,random_state=42,test_size=0.25)

In [None]:
result = []
for name, model in models.items():
    mod = run_model(model)
    # df = pd.DataFrame([run_model(model)])
    result.append(mod)

In [None]:
result

In [None]:
for i in result:
    print(i)
    columns = i.keys()

In [None]:
columns

In [None]:
import pandas as pd
# Create DataFrame
# data = [result,random_forest]
# print(data)
# robust_scaling_df = pd.DataFrame.from_dict(data, orient='index', columns=columns)

df = pd.DataFrame(result)

# print(robust_scaling_df)
print(df.head())

In [None]:
df

In [None]:
import catboost as cb

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Define base models
base_models = [
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    xgb.XGBRegressor(),
    lgb.LGBMRegressor(),
    cb.CatBoostRegressor(silent=True)
]

# Define parameters for grid search
parameters = [
    {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]},  # RandomForest
    {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5]},  # GradientBoosting
    {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5]},  # XGBoost
    {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5]},  # LightGBM
    {'iterations': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5]}  # CatBoost
]

# Train and tune base models
tuned_base_models = []
for model, params in zip(base_models, parameters):
    grid_search = GridSearchCV(model, params, cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    tuned_model = model.set_params(**best_params)
    tuned_base_models.append(tuned_model)

# Create StackingCVRegressor with Lasso meta-learner
stacking_regressor = StackingCVRegressor(regressors=tuned_base_models,
                                         meta_regressor=Lasso(),
                                         cv=5,
                                         use_features_in_secondary=True,
                                         store_train_meta_features=True)

# Fit the stacking regressor
stacked_res = run_model(stacking_regressor)

# # Evaluate on test set
# y_pred = stacking_regressor.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# print("Mean Squared Error:", mse)


In [None]:
stacked_res

In [None]:
df2 = pd.DataFrame(stacked_res,index=[0])
df2.head()

In [None]:
df = pd.concat([df, df2], ignore_index=True)  # Combine row-wise

In [None]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [None]:
linear = run_model(lr)

In [None]:
df3 = pd.DataFrame(linear,index=[0])
df3.head()

In [None]:
df = pd.concat([df, df3], ignore_index=True)  # Combine row-wise

In [None]:
df

In [None]:
df.to_csv("Robust_Scaling_Analysis.csv")

### Plot Graphs

In [None]:
import pandas as pd
compare_results = pd.DataFrame([random_forest,cat_boost,stack,stack_model])
compare_results

In [None]:
def plot_graphs(results: Dict[str,List[float]]):
    
    loss = results['rmse']
    