In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [64]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn import metrics
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import warnings
from sklearn.metrics import r2_score
from PIL import Image
import re

warnings.filterwarnings('ignore')
import catboost as cb
from catboost import CatBoostRegressor

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Using CLIP to create image-text embeddings, then classify each image to provided text category

In [59]:
from transformers import CLIPProcessor, CLIPModel

clipModel = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [89]:
#stores images and ids
image_list = []
image_homeids = []

In [87]:
#test categories to be used with CLIP model, for house quality scoring
sizes = ["Small house", "Medium house", "Large house"]
styles = ["Modern house", "Traditional house"]
conditions = [
    "Newly constructed house",
    "Well-maintained house",
    # "House in need of minor repairs",
    "House in need of major repairs or renovations"
]

categories = [
    f"{size}, {style}, {condition}"
    for size in sizes
    for style in styles
    for condition in conditions
]

for i, combination in enumerate(categories, 1):
    print(f"{i}. {combination}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clipModel.to(device)
text_inputs = processor(text=categories, return_tensors="pt", padding=True)
text_inputs = {key: value.to(device) for key, value in text_inputs.items()}

1. Small house, Modern house, Newly constructed house
2. Small house, Modern house, Well-maintained house
3. Small house, Modern house, House in need of minor repairs
4. Small house, Modern house, House in need of major renovations
5. Small house, Traditional house, Newly constructed house
6. Small house, Traditional house, Well-maintained house
7. Small house, Traditional house, House in need of minor repairs
8. Small house, Traditional house, House in need of major renovations
9. Medium house, Modern house, Newly constructed house
10. Medium house, Modern house, Well-maintained house
11. Medium house, Modern house, House in need of minor repairs
12. Medium house, Modern house, House in need of major renovations
13. Medium house, Traditional house, Newly constructed house
14. Medium house, Traditional house, Well-maintained house
15. Medium house, Traditional house, House in need of minor repairs
16. Medium house, Traditional house, House in need of major renovations
17. Large house, 

In [90]:
count = 0
for anImg in os.listdir('/content/drive/MyDrive/!mlData/info558Kaggle/SimPictures'):
  imgHomeId = anImg[5:10]
  #remove trailing 0s
  imgHomeId = imgHomeId.lstrip('0')
  if imgHomeId not in image_homeids:
    image_list.append(Image.open(f'/content/drive/MyDrive/!mlData/info558Kaggle/SimPictures/{anImg}').convert("RGB"))
    image_homeids.append(imgHomeId)
    count += 1
print(count)


1200


In [91]:
#run clip model on text-image inputs to get text-image embeddings
inputs = processor(text=categories, images=image_list, return_tensors="pt", padding=True)
inputs = {key: value.to(device) for key, value in inputs.items()}
with torch.no_grad():
    outputs = clipModel(**inputs)
#get logies
logits_per_image = outputs.logits_per_image
#see category mapping
predicted_indices = torch.argmax(logits_per_image, dim=1)
predicted_categories = [categories[idx] for idx in predicted_indices]

In [99]:
clip_results_df = pd.DataFrame({
    "homeid": [int(homeid) for homeid in image_homeids],
    "predicted_category": predicted_categories
})
clip_results_df.head()

Unnamed: 0,homeid,predicted_category
0,5734,"Medium house, Modern house, House in need of m..."
1,4744,"Large house, Modern house, House in need of ma..."
2,7704,"Large house, Modern house, House in need of mi..."
3,6774,"Large house, Traditional house, House in need ..."
4,1626,"Medium house, Modern house, House in need of m..."


In [117]:
#load in original data first
data = pd.read_csv('/content/drive/MyDrive/!mlData/info558Kaggle/df-train_1000x12.csv')
test_data = pd.read_csv('/content/drive/MyDrive/!mlData/info558Kaggle/df-test_200x11.csv')
print(data.dtypes)
print(data.isnull().sum())
print(len(data))

homeid                int64
feat_yrbuilt          int64
feat_nrooms           int64
feat_nbed             int64
feat_nbath          float64
feat_sqft             int64
feat_proptype        object
feat_basement        object
feat_stories         object
feat_solar           object
feat_nsales           int64
target_homevalue      int64
dtype: object
homeid              0
feat_yrbuilt        0
feat_nrooms         0
feat_nbed           0
feat_nbath          0
feat_sqft           0
feat_proptype       0
feat_basement       0
feat_stories        0
feat_solar          0
feat_nsales         0
target_homevalue    0
dtype: int64
1000


In [122]:
data = data.merge(clip_results_df, on="homeid", how="left")
data.head()

Unnamed: 0,homeid,feat_yrbuilt,feat_nrooms,feat_nbed,feat_nbath,feat_sqft,feat_proptype,feat_basement,feat_stories,feat_solar,feat_nsales,target_homevalue,predicted_category_x,predicted_category_y
0,5936,2024,16,2,7.0,3456,Condominium,no,single,no,2,151512,"Large house, Modern house, House in need of mi...","Large house, Modern house, House in need of mi..."
1,7314,1899,13,2,1.0,500,Condominium,no,single,no,1,356344,"Large house, Traditional house, House in need ...","Large house, Traditional house, House in need ..."
2,5448,2024,7,2,9.0,7414,SingleFam,no,single,no,2,78112,"Small house, Traditional house, Newly construc...","Small house, Traditional house, Newly construc..."
3,8560,2007,11,2,5.5,6619,SingleFam,no,single,no,3,872768,"Large house, Modern house, House in need of ma...","Large house, Modern house, House in need of ma..."
4,5783,2024,15,3,4.5,3226,MultiFam,no,multiple,no,1,946086,"Medium house, Modern house, Newly constructed ...","Medium house, Modern house, Newly constructed ..."


In [119]:
test_data = test_data.merge(clip_results_df, on="homeid", how="left")
test_data.head()

Unnamed: 0,homeid,feat_yrbuilt,feat_nrooms,feat_nbed,feat_nbath,feat_sqft,feat_proptype,feat_basement,feat_stories,feat_solar,feat_nsales,predicted_category
0,4997,1734,8,7,6.5,4500,MultiFam,yes,multiple,no,2,"Medium house, Modern house, House in need of m..."
1,7647,2024,10,3,4.0,4470,Condominium,no,single,no,0,"Large house, Modern house, House in need of ma..."
2,8202,1692,13,2,3.0,3985,SingleFam,no,single,no,2,"Small house, Traditional house, House in need ..."
3,5573,2024,8,2,6.0,2079,SingleFam,no,single,no,3,"Small house, Modern house, Newly constructed h..."
4,4612,2024,3,2,7.5,500,MultiFam,yes,multiple,no,2,"Medium house, Modern house, Newly constructed ..."


In [123]:
#merge data with clip_results_df
dissimilar_rows = data[data["predicted_category_x"] != data["predicted_category_y"]]
if not dissimilar_rows.empty:
    print("Dissimilar rows found:")
    print(dissimilar_rows)
else:
    data = data.drop("predicted_category_y", axis=1)
    data = data.rename(columns={"predicted_category_x": "predicted_category"})
    print("No dissimilar rows found.")
print(data.dtypes)
data.head()

No dissimilar rows found.
homeid                  int64
feat_yrbuilt            int64
feat_nrooms             int64
feat_nbed               int64
feat_nbath            float64
feat_sqft               int64
feat_proptype          object
feat_basement          object
feat_stories           object
feat_solar             object
feat_nsales             int64
target_homevalue        int64
predicted_category     object
dtype: object


Unnamed: 0,homeid,feat_yrbuilt,feat_nrooms,feat_nbed,feat_nbath,feat_sqft,feat_proptype,feat_basement,feat_stories,feat_solar,feat_nsales,target_homevalue,predicted_category
0,5936,2024,16,2,7.0,3456,Condominium,no,single,no,2,151512,"Large house, Modern house, House in need of mi..."
1,7314,1899,13,2,1.0,500,Condominium,no,single,no,1,356344,"Large house, Traditional house, House in need ..."
2,5448,2024,7,2,9.0,7414,SingleFam,no,single,no,2,78112,"Small house, Traditional house, Newly construc..."
3,8560,2007,11,2,5.5,6619,SingleFam,no,single,no,3,872768,"Large house, Modern house, House in need of ma..."
4,5783,2024,15,3,4.5,3226,MultiFam,no,multiple,no,1,946086,"Medium house, Modern house, Newly constructed ..."


In [124]:
def encode_cols(input_data):
  encoder = LabelEncoder()
  for col in input_data.columns:
      if input_data[col].dtype == 'object':
          input_data[col] = encoder.fit_transform(input_data[col])
  print(input_data.head())
  return input_data

In [125]:
data = encode_cols(data)

   homeid  feat_yrbuilt  feat_nrooms  feat_nbed  feat_nbath  feat_sqft  \
0    5936          2024           16          2         7.0       3456   
1    7314          1899           13          2         1.0        500   
2    5448          2024            7          2         9.0       7414   
3    8560          2007           11          2         5.5       6619   
4    5783          2024           15          3         4.5       3226   

   feat_proptype  feat_basement  feat_stories  feat_solar  feat_nsales  \
0              0              0             1           0            2   
1              0              0             1           0            1   
2              2              0             1           0            2   
3              2              0             1           0            3   
4              1              0             0           0            1   

   target_homevalue  predicted_category  
0            151512                   1  
1            356344       

In [126]:
X=data.drop(['target_homevalue','homeid'],axis=1)
y=data['target_homevalue']
print(X.head())
print(y.head())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=412)

train_dataset = cb.Pool(X_train, y_train)
test_dataset = cb.Pool(X_test, y_test)

   feat_yrbuilt  feat_nrooms  feat_nbed  feat_nbath  feat_sqft  feat_proptype  \
0          2024           16          2         7.0       3456              0   
1          1899           13          2         1.0        500              0   
2          2024            7          2         9.0       7414              2   
3          2007           11          2         5.5       6619              2   
4          2024           15          3         4.5       3226              1   

   feat_basement  feat_stories  feat_solar  feat_nsales  predicted_category  
0              0             1           0            2                   1  
1              0             1           0            1                   3  
2              0             1           0            2                  15  
3              0             1           0            3                   0  
4              0             0           0            1                   8  
0    151512
1    356344
2     78112
3    8727

In [127]:
catBoostModel = cb.CatBoostRegressor(loss_function="RMSE", task_type="GPU")

In [128]:
grid = {'iterations': [25, 50, 100, 150],
        'learning_rate': [0.001, 0.03, 0.1],
        'depth': [2, 4, 6, 8, 10],
        'l2_leaf_reg': [0.2, 0.5, 1, 3, 10, 15]}
catBoostModel.grid_search(grid, train_dataset)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
33:	learn: 415370.1412629	test: 443923.0298113	best: 443923.0298113 (33)	total: 596ms	remaining: 281ms
34:	learn: 411938.1902529	test: 441224.5072577	best: 441224.5072577 (34)	total: 606ms	remaining: 260ms
35:	learn: 408717.5133375	test: 439380.0124794	best: 439380.0124794 (35)	total: 613ms	remaining: 238ms
36:	learn: 405190.5050813	test: 437131.0906939	best: 437131.0906939 (36)	total: 644ms	remaining: 226ms
37:	learn: 402012.9971391	test: 435308.0035396	best: 435308.0035396 (37)	total: 686ms	remaining: 217ms
38:	learn: 398793.6743490	test: 433722.0721448	best: 433722.0721448 (38)	total: 696ms	remaining: 196ms
39:	learn: 396068.8841260	test: 431857.2649531	best: 431857.2649531 (39)	total: 703ms	remaining: 176ms
40:	learn: 393242.8490834	test: 430580.5848665	best: 430580.5848665 (40)	total: 710ms	remaining: 156ms
41:	learn: 390178.4341760	test: 427274.4454086	best: 427274.4454086 (41)	total: 718ms	remaining: 137ms
42:	lear



Training on fold [0/3]
0:	learn: 582012.9224317	test: 609062.5501368	best: 609062.5501368 (0)	total: 17.2ms	remaining: 2.56s
1:	learn: 551965.8027577	test: 578760.6618712	best: 578760.6618712 (1)	total: 24.2ms	remaining: 1.79s
2:	learn: 521883.6275150	test: 552305.9236036	best: 552305.9236036 (2)	total: 40.7ms	remaining: 1.99s
3:	learn: 491720.3404095	test: 525072.9105833	best: 525072.9105833 (3)	total: 50.4ms	remaining: 1.84s
4:	learn: 470960.1548120	test: 507230.5986522	best: 507230.5986522 (4)	total: 57ms	remaining: 1.65s
5:	learn: 447258.2828101	test: 487046.1018325	best: 487046.1018325 (5)	total: 74.1ms	remaining: 1.78s
6:	learn: 429722.8560178	test: 471980.6205825	best: 471980.6205825 (6)	total: 89.7ms	remaining: 1.83s
7:	learn: 414748.0699277	test: 457292.3681205	best: 457292.3681205 (7)	total: 105ms	remaining: 1.86s
8:	learn: 397939.9147207	test: 444604.4683648	best: 444604.4683648 (8)	total: 121ms	remaining: 1.89s
9:	learn: 389070.2478315	test: 435187.3526605	best: 435187.3526



{'params': {'depth': 8,
  'learning_rate': 0.1,
  'l2_leaf_reg': 0.2,
  'iterations': 150},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               4

In [129]:
pred = catBoostModel.predict(X_test)
rmse = (np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)
print("Testing performance")
print("RMSE: {:.2f}".format(rmse))
print("R2: {:.2f}".format(r2))

Testing performance
RMSE: 394045.23
R2: 0.23


In [130]:
def submit_csv(test_data, model):
  test_data_input = test_data.drop(['homeid'], axis=1)
  test_data_input = encode_cols(test_data_input)
  test_pred = model.predict(test_data_input)
  submission = pd.DataFrame({'homeid': test_data['homeid'], 'target_homevalue': test_pred})
  submission.to_csv('submission.csv', index=False)

In [131]:
submit_csv(test_data, catBoostModel)

   feat_yrbuilt  feat_nrooms  feat_nbed  feat_nbath  feat_sqft  feat_proptype  \
0          1734            8          7         6.5       4500              1   
1          2024           10          3         4.0       4470              0   
2          1692           13          2         3.0       3985              2   
3          2024            8          2         6.0       2079              2   
4          2024            3          2         7.5        500              1   

   feat_basement  feat_stories  feat_solar  feat_nsales  predicted_category  
0              1             0           0            2                   7  
1              0             1           0            0                   0  
2              0             1           0            2                  15  
3              0             1           0            3                  14  
4              1             0           0            2                   9  


Linear Regression performance

In [132]:
def linear_reg_results(X_train, y_train, X_test, y_test):
  from sklearn.linear_model import LinearRegression
  from sklearn.metrics import mean_squared_error, r2_score
  reg = LinearRegression().fit(X_train, y_train)
  score = reg.score(X_train, y_train)
  y_pred = reg.predict(X_test)
  rmse = (np.sqrt(mean_squared_error(y_test, y_pred)))
  r2 = r2_score(y_test, y_pred)
  print("Testing performance")
  print("RMSE: {:.2f}".format(rmse))
  print("R2: {:.2f}".format(r2))

In [133]:
linear_reg_results(X_train, y_train, X_test, y_test)

Testing performance
RMSE: 421623.24
R2: 0.12
