In [6]:
import pandas as pd
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


import json
# Read the column-description JSON produced earlier.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that the
# parsed text does not depend on the platform's locale default.
with open('extracted_data_description.json', 'r', encoding='utf-8') as desc_file:
    data_desc_dict = json.load(desc_file)

# json_loaded_data


In [3]:
def prepare_dataset(df):
    """Convert object columns to categoricals with NA replaced by "Missing".

    Works on a copy, so the frame passed in is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Every column of dtype ``object`` is converted to the
        ``category`` dtype and its missing entries are filled with the
        literal category "Missing".

    Returns
    -------
    tuple
        ``(prepared_df, categorical_columns)`` where ``prepared_df`` is the
        converted copy and ``categorical_columns`` is the Index of all
        columns whose dtype is ``category`` after conversion.
    """
    df = df.copy()
    for key in df.select_dtypes(include=['object']).columns:
        column = df[key].astype('category')
        # add_categories raises ValueError if the category already exists,
        # which happens when the raw data itself contains "Missing".
        if "Missing" not in column.cat.categories:
            column = column.cat.add_categories("Missing")
        df[key] = column.fillna("Missing")
    categorical_columns = df.select_dtypes(include=['category']).columns
    return df, categorical_columns

In [33]:
def get_text_dataset(dataset_df, dataset_desc):
    """Return a copy of ``dataset_df`` with described columns rendered as text.

    Every column that has an entry in ``dataset_desc`` is rewritten so each
    cell reads ``"<description> is <value>"``. Columns without an entry are
    left as they are, and ``dataset_df`` itself is not modified.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Source data.
    dataset_desc : dict
        Maps a column name to a dict with the keys ``'type'`` (either
        "Numeric" or "Categorical") and ``'description'``; categorical
        entries also need ``'values'``, a mapping from the stringified raw
        value to its label.

    Returns
    -------
    pandas.DataFrame
        The text-rendered copy.

    Raises
    ------
    ValueError
        If a described column has a type other than "Numeric" or
        "Categorical".
    """
    text_df = dataset_df.copy()
    for col in text_df.columns:
        # Look the column up in the dictionary that was passed in, not in a
        # notebook-level global, so the function works with any description.
        if col not in dataset_desc:
            continue
        col_desc = dataset_desc[col]
        col_type = col_desc['type']
        if col_type == "Numeric":
            values = text_df[col].astype(str)
        elif col_type == "Categorical":
            # Raw values without a label (including NA) map to NaN and are
            # then reported as "Missing".
            values = text_df[col].astype(str).map(col_desc['values']).fillna("Missing")
        else:
            raise ValueError("Unknown type: {}".format(col_type))
        text_df[col] = col_desc['description'] + ' is ' + values
    return text_df
            
        
    

In [36]:
# Load the training CSV, report its shape, and preview the first rows.
train_file_path = "../data/house-prices/train.csv"
dataset_df = pd.read_csv(filepath_or_buffer=train_file_path)
print("Full train dataset shape is {}".format(dataset_df.shape))
dataset_df.head(n=3)

Full train dataset shape is (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [37]:
# Render the training frame as descriptive text and preview the result.
dataset_df_text = get_text_dataset(
    dataset_df=dataset_df,
    dataset_desc=data_desc_dict,
)
dataset_df_text.head(n=3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,Identifies the type of dwelling involved in th...,Identifies the general zoning classification o...,Linear feet of street connected to property is...,Lot size in square feet is 8450,Type of road access to property is Paved,Type of alley access to property is Missing,General shape of property is Regular,Flatness of the property is Near Flat/Level,Type of utilities available is All public Util...,...,Pool area in square feet is 0,Pool quality is Missing,Fence quality is Missing,Miscellaneous feature not covered in other cat...,$Value of miscellaneous feature is 0,Month Sold (MM) is 2,Year Sold (YYYY) is 2008,Type of sale is Warranty Deed - Conventional,Condition of sale is Normal Sale,208500
1,2,Identifies the type of dwelling involved in th...,Identifies the general zoning classification o...,Linear feet of street connected to property is...,Lot size in square feet is 9600,Type of road access to property is Paved,Type of alley access to property is Missing,General shape of property is Regular,Flatness of the property is Near Flat/Level,Type of utilities available is All public Util...,...,Pool area in square feet is 0,Pool quality is Missing,Fence quality is Missing,Miscellaneous feature not covered in other cat...,$Value of miscellaneous feature is 0,Month Sold (MM) is 5,Year Sold (YYYY) is 2007,Type of sale is Warranty Deed - Conventional,Condition of sale is Normal Sale,181500
2,3,Identifies the type of dwelling involved in th...,Identifies the general zoning classification o...,Linear feet of street connected to property is...,Lot size in square feet is 11250,Type of road access to property is Paved,Type of alley access to property is Missing,General shape of property is Slightly irregular,Flatness of the property is Near Flat/Level,Type of utilities available is All public Util...,...,Pool area in square feet is 0,Pool quality is Missing,Fence quality is Missing,Miscellaneous feature not covered in other cat...,$Value of miscellaneous feature is 0,Month Sold (MM) is 9,Year Sold (YYYY) is 2008,Type of sale is Warranty Deed - Conventional,Condition of sale is Normal Sale,223500


In [40]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from sentence_transformers.quantization import quantize_embeddings

# 1. Specify preferred dimensions
dimensions = 512

# 2. Load the embedding model.
# It is bound to `embedding_model` rather than `model` so that running this
# cell cannot overwrite the CatBoost regressor, which is also stored in the
# notebook-level name `model`.
embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=dimensions)

# For retrieval you need to pass this prompt.
query = 'Represent this sentence for searching relevant passages: A man is eating a piece of bread'

# The query is the first entry so that it is encoded together with the
# candidate sentences it is compared against below.
docs = [
    query,
    "A man is eating food.",
    "A man is eating pasta.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
]

# 3. Encode
embeddings = embedding_model.encode(docs)

# Optional: Quantize the embeddings
binary_embeddings = quantize_embeddings(embeddings, precision="ubinary")

# Cosine similarity between the query (row 0) and every other sentence.
similarities = cos_sim(embeddings[0], embeddings[1:])
print('similarities:', similarities)


similarities: tensor([[0.7634, 0.5918, 0.0685, 0.2835]])


In [38]:
# # create a new dataset that represents dataset_df with text descriptions

# dataset_df_copy = dataset_df.copy()
# for col in dataset_df_copy.columns:
#     if col in data_desc_dict:
#         # print(col)
#         if data_desc_dict[col]['type'] == "Numeric":
#             dataset_df_copy[col] =  dataset_df_copy[col].astype(str)
#             dataset_df_copy[col] = dataset_df_copy[col].map(lambda x: data_desc_dict[col]['description'] + ' is ' + x)
#         elif data_desc_dict[col]['type'] == "Categorical":
#             dataset_df_copy[col] = (dataset_df_copy[col].astype(str)).map(data_desc_dict[col]['values'])
#             # fill na with missing
#             dataset_df_copy[col] = dataset_df_copy[col].fillna("Missing")
#             dataset_df_copy[col] = dataset_df_copy[col].map(lambda x: data_desc_dict[col]['description'] + ' is ' + x)
#         else:
#             raise ValueError("Unknown type: {}".format(data_desc_dict[col]['type']))
        
    

In [39]:
# dataset_df_copy

In [13]:
# Convert the text columns to categoricals and preview just those columns.
dataset_df, categorical_columns = prepare_dataset(df=dataset_df)
dataset_df[categorical_columns].head(n=10)

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
1,RL,Pave,Missing,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
2,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
3,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,Missing,Missing,Missing,WD,Abnorml
4,RL,Pave,Missing,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
5,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,...,Attchd,Unf,TA,TA,Y,Missing,MnPrv,Shed,WD,Normal
6,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
7,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,PosN,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Shed,WD,Normal
8,RM,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Artery,...,Detchd,Unf,Fa,TA,Y,Missing,Missing,Missing,WD,Abnorml
9,RL,Pave,Missing,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Artery,...,Attchd,RFn,Gd,TA,Y,Missing,Missing,Missing,WD,Normal


In [14]:
# Prepare dataset
target_column = 'SalePrice'
# Select the target instead of popping it: dataset_df is left unchanged, so
# this cell can be re-run without a KeyError for an already-removed column.
y = dataset_df[target_column]
# Features are every column except the row identifier and the target.
X = dataset_df.drop(columns=['Id', target_column])

In [16]:
# Display the (rows, columns) shape of the feature frame X.
X.shape

(1460, 79)

In [17]:
# Model hyper-parameters
params = dict(
    iterations=1000,
    depth=6,
    loss_function='RMSE',
    verbose=200,
    early_stopping_rounds=50,
)

# Settings for cross-validation
cv_params = dict(
    fold_count=5,             # number of CV folds
    shuffle=True,             # shuffle the data before it is split
    partition_random_seed=0,  # seed used for that shuffle
    stratified=False,         # whether sampling is stratified
    plot=True,                # whether to plot metric curves while training
)

In [18]:
# # Perform cross-validation
# pool_data = Pool(data=X, label=y, cat_features=categorical_columns.to_list())
# cv_results = cv(pool=pool_data, params=params, fold_count=cv_params['fold_count'],
#                 shuffle=cv_params['shuffle'], partition_random_seed=cv_params['partition_random_seed'],
#                 stratified=cv_params['stratified'], plot=cv_params['plot'])

# # Output results
# print(cv_results)

In [19]:
# # Grid Search
# from sklearn.model_selection import GridSearchCV

# # Define the model
# model = CatBoostRegressor()

# # Set up the parameter grid
# param_grid = {
#     'depth': [4, 6, 8, 10],
#     'learning_rate': [0.01, 0.03, 0.05, 0.1],
#     'iterations': [2000]
# }

# # Configure GridSearchCV
# # When cv=None, default is 5-fold cross validation
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
#                            cv=None, scoring='neg_mean_squared_error', verbose=2)

# # Fit GridSearchCV
# grid_search.fit(X, y, cat_features=categorical_columns.to_list(), verbose=200)

# # Best parameters and best score
# print("Best parameters:", grid_search.best_params_)
# print("Best RMSE:", (-grid_search.best_score_) ** 0.5)

In [20]:
# Configure the CatBoost regressor
model = CatBoostRegressor(
    # iterations=30000,
    iterations=300,
    learning_rate=0.03,
    depth=6,
    cat_features=list(categorical_columns),
    verbose=200,
)

# Fit on the full X / y training data
model.fit(X=X, y=y)

0:	learn: 77830.9231784	total: 184ms	remaining: 55s
200:	learn: 19621.1566880	total: 17.8s	remaining: 8.78s
299:	learn: 17312.1289993	total: 26.7s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7fdc933d3c90>

In [21]:
# Print the features ordered from most to least important.
feature_importances = model.get_feature_importance()
print(
    pd.DataFrame(dict(Feature=X.columns, Importance=feature_importances))
    .sort_values(by='Importance', ascending=False)
)

         Feature  Importance
16   OverallQual   21.846928
45     GrLivArea   14.556685
33    BsmtFinSF1    5.525413
60    GarageCars    4.513654
42      1stFlrSF    4.450717
..           ...         ...
34  BsmtFinType2    0.000000
22   Exterior1st    0.000000
72         Fence    0.000000
73   MiscFeature    0.000000
8      Utilities    0.000000

[79 rows x 2 columns]


In [24]:
# Load the test CSV and set its Id column aside.
test_file_path = "../data/house-prices/test.csv"
test_data = pd.read_csv(filepath_or_buffer=test_file_path)
ids = test_data.pop(item='Id')

# Apply the same categorical preparation as for the training data.
test_data, _ = prepare_dataset(df=test_data)
test_data.head(n=3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,Inside,...,120,0,Missing,MnPrv,Missing,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,Corner,...,0,0,Missing,Missing,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,Inside,...,0,0,Missing,MnPrv,Missing,0,3,2010,WD,Normal


In [25]:
# Fill the sample submission with the model's predictions and save it.
sample_submission_df = pd.read_csv(filepath_or_buffer='../data/house-prices/sample_submission.csv')
sample_submission_df['SalePrice'] = model.predict(test_data)
sample_submission_df.to_csv(path_or_buf='../working/catboost-cv-rattana.csv', index=False)
sample_submission_df.head(n=5)

Unnamed: 0,Id,SalePrice
0,1461,118568.634085
1,1462,163302.73777
2,1463,179491.307826
3,1464,185359.550748
4,1465,198235.835777
