In [74]:
import pandas as pd
# from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from sentence_transformers.quantization import quantize_embeddings
from tqdm import tqdm

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

from catboost import CatBoostRegressor, Pool, cv

import json
# Load the per-column data description from the JSON file we just created.
# The helpers defined below look up, for each column name, a 'type' entry
# ("Numeric" or "Categorical"), a 'description' string and, for categorical
# columns, a 'values' mapping.
with open('extracted_data_description.json', 'r') as file:
    data_desc_dict = json.load(file)

# json_loaded_data


In [2]:
def prepare_dataset(df):
    """Convert object columns to categoricals, replacing NA with "Missing".

    The frame passed in is modified in place and is also returned.

    Args:
        df: DataFrame whose ``object``-dtype columns should become
            ``category`` dtype.

    Returns:
        Tuple ``(df, categorical_columns)`` where ``categorical_columns`` is
        the Index of every ``category``-dtype column after the conversion.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    for key in object_columns:
        as_category = df[key].astype('category')
        # add_categories raises ValueError when the label is already a
        # category, which happens if the data itself contains "Missing".
        if "Missing" not in as_category.cat.categories:
            as_category = as_category.cat.add_categories("Missing")
        df[key] = as_category.fillna("Missing")
    categorical_columns = df.select_dtypes(include=['category']).columns
    return df, categorical_columns


def get_text_dataset(dataset_df, data_desc_dict):
    """Render every described column as "<description> is <value>" sentences.

    Columns that have no entry in ``data_desc_dict`` are left out of the result.

    Args:
        dataset_df: Source DataFrame.
        data_desc_dict: ``{column: {'type', 'description', 'values'}}`` where
            'type' is "Numeric" or "Categorical" and 'values' (categorical
            only) maps raw codes to readable labels.

    Returns:
        DataFrame of sentences with one column per described input column.

    Raises:
        ValueError: If a column's 'type' is neither "Numeric" nor "Categorical".
    """
    text_frame = pd.DataFrame()
    for name in dataset_df.columns:
        if name not in data_desc_dict:
            continue
        spec = data_desc_dict[name]
        kind = spec['type']
        if kind == "Numeric":
            rendered = dataset_df[name].astype(str)
        elif kind == "Categorical":
            # Codes without a label in 'values' (including NA) become "Missing".
            rendered = dataset_df[name].astype(str).map(spec['values']).fillna("Missing")
        else:
            raise ValueError("Unknown type: {}".format(kind))
        text_frame[name] = rendered.map(
            lambda value, spec=spec: spec['description'] + ' is ' + value
        )
    return text_frame
            
def get_text_embeddings(dataset_df_text, embedding_model):
    """Embed the distinct values of every column of a text DataFrame.

    Each distinct value is encoded once per column and the result is mapped
    back onto the rows. A column whose embedding fails is skipped and reported
    instead of aborting the whole run.

    Args:
        dataset_df_text: DataFrame of text values.
        embedding_model: Object exposing ``encode(text)``.

    Returns:
        Tuple ``(dataset_df_embedding, embeddings_dict)``: a DataFrame holding
        the embedding of every cell, and ``{column: {value: embedding}}`` for
        the columns that were embedded successfully.
    """

    dataset_df_embedding = pd.DataFrame()
    bad_columns = []
    failure_reasons = {}
    embeddings_dict = {}
    print("embedding columns")
    for col in tqdm(dataset_df_text.columns):
        try:
            unique_values = dataset_df_text[col].unique()
            unique_values_embeddings = {x: embedding_model.encode(x) for x in unique_values}
            dataset_df_embedding[col] = dataset_df_text[col].map(unique_values_embeddings)
            embeddings_dict[col] = unique_values_embeddings
        except Exception as exc:
            # Only ordinary errors are treated as "skip this column";
            # KeyboardInterrupt / SystemExit must still stop the loop.
            bad_columns.append(col)
            failure_reasons[col] = exc
    print("Skip embedding on these columns: {}".format(bad_columns))
    # Surface why each column was skipped so failures are not silently lost.
    for col, exc in failure_reasons.items():
        print("  {}: {!r}".format(col, exc))

    return dataset_df_embedding, embeddings_dict

class EmbeddingPCA():
    """Per-column standardisation followed by PCA over value embeddings.

    One ``StandardScaler`` and one ``PCA`` are stored per column name so that
    embeddings supplied later can be projected with the models fitted earlier.
    Only the first component of every projected embedding is returned.
    """

    def __init__(self):
        # Both registries are keyed by column name and filled by fit_transform.
        self.pca_model = {}
        self.scalar_model = {}

    @staticmethod
    def _first_components(value_to_vector, projected):
        """Pair each value with the first component of its projected embedding."""
        return {
            value: components[0]
            for value, components in zip(value_to_vector.keys(), projected)
        }

    def fit_transform(self, embeddings_dict, dimensions = 1):
        """Fit a scaler and a PCA for every column, then project its embeddings.

        Args:
            embeddings_dict: ``{column: {value: embedding}}``.
            dimensions: ``n_components`` given to each column's PCA.

        Returns:
            ``{column: {value: first projected component}}``.
        """
        reduced_by_column = {}
        for column, value_to_vector in embeddings_dict.items():
            scaler = StandardScaler()
            reducer = PCA(n_components=dimensions)
            self.scalar_model[column] = scaler
            self.pca_model[column] = reducer
            matrix = np.array(list(value_to_vector.values()))
            projected = reducer.fit_transform(scaler.fit_transform(matrix))
            reduced_by_column[column] = self._first_components(value_to_vector, projected)
        return reduced_by_column

    def transform(self, embeddings_dict):
        """Project embeddings with the models stored by ``fit_transform``.

        Args:
            embeddings_dict: ``{column: {value: embedding}}``.

        Returns:
            ``{column: {value: first projected component}}``.

        Raises:
            ValueError: If no PCA model has been fitted yet.
        """
        if self.pca_model == {}:
            raise ValueError("PCA model not fitted")

        reduced_by_column = {}
        for column, value_to_vector in embeddings_dict.items():
            matrix = np.array(list(value_to_vector.values()))
            scaled = self.scalar_model[column].transform(matrix)
            projected = self.pca_model[column].transform(scaled)
            reduced_by_column[column] = self._first_components(value_to_vector, projected)
        return reduced_by_column


    
def get_pca_embedding(dataset_df_text, pca_embeddings_dict):
    """Replace every text value by its reduced embedding.

    Args:
        dataset_df_text: DataFrame of text values.
        pca_embeddings_dict: ``{column: {value: reduced embedding}}``; must
            contain an entry for every column of ``dataset_df_text``.

    Returns:
        DataFrame with one ``<column>_emb`` column per input column.
    """
    reduced_frame = pd.DataFrame()
    for name in dataset_df_text.columns:
        value_to_component = pca_embeddings_dict[name]
        reduced_frame[name + '_emb'] = dataset_df_text[name].map(value_to_component)
    return reduced_frame
    

In [4]:
# Load the training data and report its shape.
train_file_path = "../data/house-prices/train.csv"
dataset_df = pd.read_csv(train_file_path)
print("Full train dataset shape is {}".format(dataset_df.shape))
# NOTE(review): this is not the last expression of the cell, so the preview is not displayed.
dataset_df.head(3)

# Split off the target; pop() also removes 'SalePrice' from dataset_df itself.
target_column = 'SalePrice'
y = dataset_df.pop(target_column)
# Features are every remaining column except the 'Id' identifier.
X = dataset_df.drop('Id', axis=1)

Full train dataset shape is (1460, 81)


In [5]:
# Turn every column of X that has a data description into
# "<description> is <value>" sentences, then preview the first two rows.
dataset_df_text = get_text_dataset(X, data_desc_dict)
dataset_df_text.head(2)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,Identifies the type of dwelling involved in th...,Identifies the general zoning classification o...,Linear feet of street connected to property is...,Lot size in square feet is 8450,Type of road access to property is Paved,Type of alley access to property is Missing,General shape of property is Regular,Flatness of the property is Near Flat/Level,Type of utilities available is All public Util...,Lot configuration is Inside lot,...,Screen porch area in square feet is 0,Pool area in square feet is 0,Pool quality is Missing,Fence quality is Missing,Miscellaneous feature not covered in other cat...,$Value of miscellaneous feature is 0,Month Sold (MM) is 2,Year Sold (YYYY) is 2008,Type of sale is Warranty Deed - Conventional,Condition of sale is Normal Sale
1,Identifies the type of dwelling involved in th...,Identifies the general zoning classification o...,Linear feet of street connected to property is...,Lot size in square feet is 9600,Type of road access to property is Paved,Type of alley access to property is Missing,General shape of property is Regular,Flatness of the property is Near Flat/Level,Type of utilities available is All public Util...,Lot configuration is Frontage on 2 sides of pr...,...,Screen porch area in square feet is 0,Pool area in square feet is 0,Pool quality is Missing,Fence quality is Missing,Miscellaneous feature not covered in other cat...,$Value of miscellaneous feature is 0,Month Sold (MM) is 5,Year Sold (YYYY) is 2007,Type of sale is Warranty Deed - Conventional,Condition of sale is Normal Sale


In [76]:
# `dimensions` is passed to SentenceTransformer as truncate_dim.
dimensions = 512
text_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=dimensions)
# Encode each distinct sentence once per column; embeddings_dict keeps the
# {column: {sentence: embedding}} lookup that the PCA step consumes.
dataset_df_embedding, embeddings_dict  = get_text_embeddings(dataset_df_text, text_model)

embedding columns


100%|██████████| 77/77 [02:15<00:00,  1.76s/it]

Skip embedding on these columns: []





In [77]:
# Fit one scaler + PCA per column on the training embeddings (default
# dimensions=1) and keep the fitted object for projecting the test set later.
embedding_pca = EmbeddingPCA()
pca_embeddings_dict = embedding_pca.fit_transform(embeddings_dict)
# One '<column>_emb' feature per text column.
X_emb = get_pca_embedding(dataset_df_text, pca_embeddings_dict)

# Raw features and embedding features side by side.
# NOTE(review): X_new is not referenced by the other visible cells; the
# following cell rebuilds the same concat — confirm whether it is still needed.
X_new = pd.concat([X, X_emb], axis=1)

In [78]:
# Build the training matrix: raw + embedding features, with object columns
# converted to categoricals whose NA values become "Missing".
X_train, categorical_columns = prepare_dataset(pd.concat([X, X_emb], axis=1))
# Preview only the categorical columns.
X_train[categorical_columns].head(10)

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
1,RL,Pave,Missing,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
2,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
3,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,Missing,Missing,Missing,WD,Abnorml
4,RL,Pave,Missing,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
5,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,...,Attchd,Unf,TA,TA,Y,Missing,MnPrv,Shed,WD,Normal
6,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
7,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,PosN,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Shed,WD,Normal
8,RM,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Artery,...,Detchd,Unf,Fa,TA,Y,Missing,Missing,Missing,WD,Abnorml
9,RL,Pave,Missing,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Artery,...,Attchd,RFn,Gd,TA,Y,Missing,Missing,Missing,WD,Normal


In [79]:
# Prepare dataset
# target_column = 'SalePrice'
# y = dataset_df.pop(target_column)
# X = dataset_df.drop('Id', axis=1)



In [80]:
# # Define model parameters
# params = {
#     'iterations': 1000,
#     'depth': 6,
#     'loss_function': 'RMSE',
#     'verbose': 200,
#     'early_stopping_rounds': 50
# }

# # Cross-validation parameters
# cv_params = {
#     'fold_count': 5,     # Number of folds in CV
#     'shuffle': True,     # Shuffle data before splitting into batches
#     'partition_random_seed': 0,  # Random seed for shuffling
#     'stratified': False,  # Whether to perform stratified sampling
#     'plot': True         # Whether to plot curve of metrics during training
# }

In [81]:
# # Perform cross-validation
# pool_data = Pool(data=X, label=y, cat_features=categorical_columns.to_list())
# cv_results = cv(pool=pool_data, params=params, fold_count=cv_params['fold_count'],
#                 shuffle=cv_params['shuffle'], partition_random_seed=cv_params['partition_random_seed'],
#                 stratified=cv_params['stratified'], plot=cv_params['plot'])

# # Output results
# print(cv_results)

In [82]:
# # Grid Search
# from sklearn.model_selection import GridSearchCV

# # Define the model
# model = CatBoostRegressor()

# # Set up the parameter grid
# param_grid = {
#     'depth': [4, 6, 8, 10],
#     'learning_rate': [0.01, 0.03, 0.05, 0.1],
#     'iterations': [2000]
# }

# # Configure GridSearchCV
# # When cv=None, default is 5-fold cross validation
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
#                            cv=None, scoring='neg_mean_squared_error', verbose=2)

# # Fit GridSearchCV
# grid_search.fit(X, y, cat_features=categorical_columns.to_list(), verbose=200)

# # Best parameters and best score
# print("Best parameters:", grid_search.best_params_)
# print("Best RMSE:", (-grid_search.best_score_) ** 0.5)

In [89]:
# Initialize CatBoostRegressor
# cat_features: names of the category-dtype columns returned by prepare_dataset.
# NOTE(review): no eval set or early stopping is configured here, so the fit
# appears to run all 30000 iterations against the training data only —
# confirm this is intended.
model = CatBoostRegressor(
    cat_features=categorical_columns.to_list(),
    verbose=200,
    depth=6,
    iterations=30000,
    # iterations=300,
    learning_rate=0.03,
)

# Train the model on the full training matrix and target.
model.fit(X_train, y)

0:	learn: 77838.1862318	total: 111ms	remaining: 55m 41s
200:	learn: 19250.8946561	total: 15.8s	remaining: 39m 8s
400:	learn: 15123.7771654	total: 30.1s	remaining: 37m 4s
600:	learn: 12567.2579072	total: 44.7s	remaining: 36m 27s
800:	learn: 10494.0051396	total: 1m 1s	remaining: 37m 39s
1000:	learn: 8993.8069509	total: 1m 19s	remaining: 38m 19s
1200:	learn: 7826.6188478	total: 1m 36s	remaining: 38m 37s
1400:	learn: 6864.1057234	total: 1m 53s	remaining: 38m 37s
1600:	learn: 6160.3044644	total: 2m 10s	remaining: 38m 32s
1800:	learn: 5513.0450970	total: 2m 25s	remaining: 37m 55s
2000:	learn: 4971.2665789	total: 2m 41s	remaining: 37m 40s
2200:	learn: 4544.8476530	total: 2m 58s	remaining: 37m 40s
2400:	learn: 4140.0636278	total: 3m 15s	remaining: 37m 24s
2600:	learn: 3771.3473153	total: 3m 32s	remaining: 37m 18s
2800:	learn: 3446.8791492	total: 3m 48s	remaining: 37m 1s
3000:	learn: 3167.8855682	total: 4m 4s	remaining: 36m 44s
3200:	learn: 2925.4862845	total: 4m 21s	remaining: 36m 30s
3400:	le

<catboost.core.CatBoostRegressor at 0x7ff4c0320890>

In [90]:
# Pair each training column with its importance score and list them from
# most to least important.
feature_importances = model.get_feature_importance()
print(pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False))

              Feature  Importance
45          GrLivArea   13.652408
16        OverallQual   12.922771
95    OverallQual_emb    9.621930
37        TotalBsmtSF    4.356372
137    GarageCars_emb    3.736317
..                ...         ...
113  BsmtFinType2_emb    0.000230
8           Utilities    0.000108
4              Street    0.000056
87      Utilities_emb    0.000033
83         Street_emb    0.000003

[156 rows x 2 columns]


In [91]:
# Load the test data.
test_file_path = "../data/house-prices/test.csv"
test_data = pd.read_csv(test_file_path)
# Keep the Ids aside so they are not used as a feature.
ids = test_data.pop('Id')

# Same NA -> "Missing" categorical handling as the training frame; the
# returned list of categorical columns is discarded here.
test_data, _ = prepare_dataset(test_data)
test_data.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,Inside,...,120,0,Missing,MnPrv,Missing,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,Corner,...,0,0,Missing,Missing,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,Inside,...,0,0,Missing,MnPrv,Missing,0,3,2010,WD,Normal


In [92]:
# get text embedding as well
# Build the description sentences and their embeddings for the test rows.
test_data_text = get_text_dataset(test_data, data_desc_dict)
_, test_embeddings_dict  = get_text_embeddings(test_data_text, text_model)

# Project with the scalers/PCAs already fitted on the training embeddings
# (transform, not fit_transform), then append the '<column>_emb' features.
test_pca_embeddings_dict = embedding_pca.transform(test_embeddings_dict)
X_test_emb = get_pca_embedding(test_data_text, test_pca_embeddings_dict)
X_test = pd.concat([test_data, X_test_emb], axis=1)
X_test.head(3)

embedding columns


  0%|          | 0/77 [00:00<?, ?it/s]

100%|██████████| 77/77 [01:52<00:00,  1.46s/it]


Skip embedding on these columns: []


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch_emb,PoolArea_emb,PoolQC_emb,Fence_emb,MiscFeature_emb,MiscVal_emb,MoSold_emb,YrSold_emb,SaleType_emb,SaleCondition_emb
0,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,Inside,...,2.65273,26.343601,24.747883,15.385204,-17.585976,38.915447,-5.000162,-12.179186,15.406698,-14.946821
1,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,Corner,...,-0.670933,26.343601,24.747883,-13.560848,-0.83734,-0.449458,-5.000162,-12.179186,15.406698,-14.946821
2,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,Inside,...,-0.670933,26.343601,24.747883,15.385204,-17.585976,38.915447,-7.453216,-12.179186,15.406698,-14.946821


In [93]:
# Write the submission file using the sample submission as a template.
sample_submission_df = pd.read_csv('../data/house-prices/sample_submission.csv')
# Predictions are assigned by position, so the template must list the same
# Ids in the same order as test.csv; fail loudly instead of silently
# attaching prices to the wrong houses.
if sample_submission_df['Id'].tolist() != ids.tolist():
    raise ValueError("sample_submission.csv Ids do not match the Ids of test.csv")
sample_submission_df['SalePrice'] = model.predict(X_test)
sample_submission_df.to_csv('../working/catboost-cv-rattana.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,120669.143406
1,1462,160649.951146
2,1463,188243.018352
3,1464,187030.209516
4,1465,189642.307195
