In [1]:
import pandas as pd
import numpy as np
import os
import sys
import datetime
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [2]:
data = pd.read_csv('/Users/shaheen/Documents/job_search_jan_2024/home_tasks/nft-valuations/data/data_with_features.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21329 entries, 0 to 21328
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   token_index                21329 non-null  int64  
 1   timestamp                  21329 non-null  object 
 2   eth                        21329 non-null  float64
 3   usd                        21329 non-null  float64
 4   date                       21329 non-null  object 
 5   Skin Tone                  21329 non-null  object 
 6   Type                       21329 non-null  object 
 7   Hair                       21329 non-null  object 
 8   Eyewear                    21329 non-null  object 
 9   Mouth                      21329 non-null  object 
 10  Headwear                   21329 non-null  object 
 11  Facial Hair                21329 non-null  object 
 12  Smoking Device             21329 non-null  object 
 13  Other:Earring              21329 non-null  obj

# Modeling
- Steps:
    - Split the data into time based train and validation i.e. year < 2023 => train data and year = 2023 => test data
    - We will divide the token index into two groups sold, never_sold
        - for 'sold' group we will train a regression model to predict the evaluation based on features calculated in previous notebook
        - for 'never_sold', I will use a clustering approach based on metadata and use cluster assignment to predict valuation

### Model Training for group: 'Sold'

In [3]:
data.head()

Unnamed: 0,token_index,timestamp,eth,usd,date,Skin Tone,Type,Hair,Eyewear,Mouth,...,Trait Count,rarest_property_name,rarity_score,open,year,month,day_of_month,day_of_week,week_of_month,sum_last_7_days_eth_sales
0,544,2017-06-23 21:05:06,0.01,3.2697,2017-06-23,Medium,Male,Crazy Hair,Unknown,Unknown,...,3,Hair,59.139764,320.97,2017,6,23,4,4,0.0
1,1841,2017-06-23 23:40:04,0.25,82.85,2017-06-23,Darker,Female,Mohawk Thin,Green Eye Shadow,Black Lipstick,...,5,Eyewear,84.150533,320.97,2017,6,23,4,4,0.01
2,5916,2017-06-23 23:35:25,0.3,99.42,2017-06-23,Medium,Male,Mohawk Dark,VR,Unknown,...,4,Eyewear,61.194638,320.97,2017,6,23,4,4,0.26
3,4641,2017-06-23 23:31:47,0.2,66.28,2017-06-23,Lighter,Female,Wild Blonde,Purple Eye Shadow,Unknown,...,5,Hair,126.170786,320.97,2017,6,23,4,4,0.56
4,6318,2017-06-23 23:26:08,0.107,34.97295,2017-06-23,Lighter,Male,Mohawk,Eye Mask,Unknown,...,6,Eyewear,88.424911,320.97,2017,6,23,4,4,0.76


In [4]:
data = data.drop(['usd', 'date', 'timestamp'], axis=1)
data.head()

Unnamed: 0,token_index,eth,Skin Tone,Type,Hair,Eyewear,Mouth,Headwear,Facial Hair,Smoking Device,...,Trait Count,rarest_property_name,rarity_score,open,year,month,day_of_month,day_of_week,week_of_month,sum_last_7_days_eth_sales
0,544,0.01,Medium,Male,Crazy Hair,Unknown,Unknown,Unknown,Unknown,Unknown,...,3,Hair,59.139764,320.97,2017,6,23,4,4,0.0
1,1841,0.25,Darker,Female,Mohawk Thin,Green Eye Shadow,Black Lipstick,Unknown,Unknown,Unknown,...,5,Eyewear,84.150533,320.97,2017,6,23,4,4,0.01
2,5916,0.3,Medium,Male,Mohawk Dark,VR,Unknown,Unknown,Unknown,Unknown,...,4,Eyewear,61.194638,320.97,2017,6,23,4,4,0.26
3,4641,0.2,Lighter,Female,Wild Blonde,Purple Eye Shadow,Unknown,Unknown,Unknown,Cigarette,...,5,Hair,126.170786,320.97,2017,6,23,4,4,0.56
4,6318,0.107,Lighter,Male,Mohawk,Eye Mask,Unknown,Unknown,Unknown,Unknown,...,6,Eyewear,88.424911,320.97,2017,6,23,4,4,0.76


In [5]:
train = data[~data.year.isin([2023])]
test = data[data.year.isin([2023])]
print(train.shape)
print(test.shape)

(19090, 25)
(2239, 25)


### XGBoost

In [7]:
# categorical columns
categorical_cols = ['Skin Tone', 'Type', 'Hair', 'Eyewear', 'Mouth', 'Headwear',
       'Facial Hair', 'Smoking Device', 'Other:Earring', 'Neckwear',
       'Skin Feature', 'Other:Medical Mask', 'Other:Clown Nose', 'Trait Count',
       'rarest_property_name', 'year', 'month',
       'day_of_month', 'day_of_week', 'week_of_month']

# Separate features and target
X = train.drop(['eth', 'token_index'], axis=1)
y = train['eth']
X_test = test.drop(['eth', 'token_index'], axis=1)
y_test = test['eth']

# Split the data into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

for col in categorical_cols:
    X_train[col] = X_train[col].astype('category')
    X_valid[col] = X_valid[col].astype('category')
    X_test[col] = X_test[col].astype('category')

# One-hot encode categorical columns
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_valid_encoded = encoder.transform(X_valid[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Get column names for one-hot encoded columns
encoded_cols = encoder.get_feature_names_out(categorical_cols)

# Create DataFrames from one-hot encoded arrays
X_train_encoded = pd.DataFrame(X_train_encoded.toarray(), columns=encoded_cols, index=X_train.index)
X_valid_encoded = pd.DataFrame(X_valid_encoded.toarray(), columns=encoded_cols, index=X_valid.index)
X_test_encoded = pd.DataFrame(X_test_encoded.toarray(), columns=encoded_cols, index=X_test.index)

# Drop original categorical columns
X_train.drop(categorical_cols, axis=1, inplace=True)
X_valid.drop(categorical_cols, axis=1, inplace=True)
X_test.drop(categorical_cols, axis=1, inplace=True)

# Concatenate the encoded columns
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_valid = pd.concat([X_valid, X_valid_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)


# Create the XGBoost DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest = xgb.DMatrix(X_test)

In [16]:


# Define the XGBRegressor
xgb_model = XGBRegressor()

# Parameters for Grid Search
params_grid = {
    'max_depth': [1, 3, 6, 9],
    'subsample': [0.6, 0.8, 1.0, 1.2, 1.4],
    'colsample_bytree': [0.6, 0.8, 1.0, 1.2],
    'learning_rate': [0.01, 0.1, 0.3, 0.4, 0.5]
}

# Perform Grid Search CV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params_grid, scoring='neg_mean_squared_error', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters from Grid Search
print("Best Parameters:", grid_search.best_params_)

# Use the best model
best_xgb_model = grid_search.best_estimator_

Fitting 3 folds for each of 400 candidates, totalling 1200 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.4, 'max_depth': 3, 'subsample': 1.0}


In [17]:
# Predict on test data
y_pred_test = best_xgb_model.predict(X_test)

# Calculate RMSE on test data
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print("RMSE on test data:", rmse_test)
test['eth_pred'] = y_pred
test[['token_index', 'eth', 'eth_pred']].head(10)

RMSE on test data: 49.40150617317731


Unnamed: 0,token_index,eth,eth_pred
19090,5064,66.66,25.671949
19091,6357,69.99,26.302702
19092,5337,68.0,24.706207
19093,7526,68.5,23.6789
19094,4453,65.0,24.706207
19095,7206,76.5,32.208763
19096,8334,75.0,26.870445
19097,8091,64.95,24.505318
19098,1334,67.5,24.706207
19099,5537,73.9,24.709898
