In [None]:
# Install the notebook's extra dependencies into the runtime.
# NOTE(review): neither package is version-pinned and --pre allows pre-release
# PyCaret builds, so results may differ between runs — consider pinning versions.
!pip install -q seaborn
!pip install --pre pycaret

In [None]:
import requests
import json
import time
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers

# For ML Models
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from pycaret.regression import RegressionExperiment

from pycaret.time_series import TSForecastingExperiment

from google.colab import files
from datetime import date, datetime

## Training ML Models

In [None]:
# Open the Colab upload widget; the result of files.upload() is kept in `uploaded`.
# NOTE(review): the next cell reads 'collectible_sales.csv' from the working
# directory, so the uploaded file presumably needs that exact name — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the sales records from the CSV in the current working directory.
dataset=pd.read_csv('collectible_sales.csv')

In [None]:
# Preview the first rows to sanity-check the load.
dataset.head()

In [None]:
# List the available columns before deciding which ones to drop.
dataset.columns

In [None]:
# Remove the token id and the token-denominated project aggregates in a single call.
columns_to_drop = [
    'tokenId',
    'project-salesToken',
    'project-avgPriceToken',
    'prev-project-salesToken',
    'prev-project-avgPriceToken',
]
dataset = dataset.drop(columns=columns_to_drop)

In [None]:
# Count missing values per column.
dataset.isnull().sum()

In [None]:
# Discard every row that has at least one missing value.
dataset=dataset.dropna()

In [None]:
# Keep only sales whose current and previous price are at least 1 USD
# and whose hold time is strictly positive.
valid_rows = (
    (dataset['usdPrice'] >= 1)
    & (dataset['prev-usdPrice'] >= 1)
    & (dataset['holdTime'] > 0)
)
dataset = dataset[valid_rows]

In [None]:
# Summary statistics of the numeric columns after cleaning.
dataset.describe()

### Data Analysis
Reference: [Transforming skewed data](https://towardsdatascience.com/transforming-skewed-data-73da4c2d0d16)



In [None]:
# Draw one 200-bin histogram per numeric column to inspect its distribution.
# Text / date-string columns are skipped: a histogram of raw strings is
# treated as categorical data, which is slow and not informative here.
for feat in dataset.select_dtypes(include='number').columns:
  fig, ax = plt.subplots()
  ax.hist(dataset[feat], bins=200)
  ax.set_xlabel(feat)
  ax.set_ylabel('frequency')
  ax.set_title('histogram of '+feat)
  plt.show()
  # Release the figure so a long column list does not accumulate open figures.
  plt.close(fig)
  print()

We can observe that the USD price follows a power-law distribution. To make it closer to a Gaussian distribution, we convert the values to log form, i.e. we replace the price values with log(price+1). (Note: the code below applies a plain `np.log(price)`; this is safe because rows with a price below 1 USD were filtered out earlier.)

### Train Test data split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation.
# A fixed random_state makes the split — and therefore every score below —
# reproducible across kernel restarts (123 matches the PyCaret session_id).
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=123)

In [None]:
# Work on copies so the split frames stay untouched, then move the target
# column 'usdPrice' out of the feature frames and into the label series.
train_features, test_features = train_dataset.copy(), test_dataset.copy()
train_labels, test_labels = train_features.pop('usdPrice'), test_features.pop('usdPrice')

In [None]:
# Collects the test-set score of each model, keyed by a model label.
score = dict()

### Training on multiple models

#### 1. Sklearn linear regression

In [None]:
# Encode the date-like columns as Unix seconds so LinearRegression receives
# numeric input. 'prev-timestamp' is encoded as well: it is a date string like
# the other two and would otherwise make the fit fail.
DATE_COLUMNS = ['timestamp', 'prev-timestamp', 'date']
UNIX_EPOCH = pd.Timestamp('1970-01-01')


def dates_to_unix_seconds(values):
    """Convert 'YYYY-MM-DD[T...]' strings to whole seconds since the Unix epoch.

    Only the calendar-date part (before any 'T') is used; the time of day is
    dropped. The date is read as UTC midnight, so the result does not depend
    on the machine's local timezone.
    """
    days = pd.to_datetime(values.str.split('T').str[0], format='%Y-%m-%d')
    return (days - UNIX_EPOCH) // pd.Timedelta(seconds=1)


for features in (train_features, test_features):
    for column in DATE_COLUMNS:
        # Skip absent or already-numeric columns so the cell can be re-run safely.
        if column in features.columns and not pd.api.types.is_numeric_dtype(features[column]):
            features[column] = dates_to_unix_seconds(features[column])

# LinearRegression cannot consume text columns (presumably 'project', which the
# PyCaret section also drops — confirm); keep numeric/boolean features only.
train_features = train_features.select_dtypes(include=['number', 'bool'])
test_features = test_features[train_features.columns]

In [None]:
# Ordinary least-squares baseline fitted on the raw (untransformed) prices.
reg = LinearRegression()
reg.fit(train_features, train_labels)

In [None]:
# Record the baseline's score on the held-out rows.
raw_test_score = reg.score(test_features, test_labels)
score['sklearn-regression'] = raw_test_score

In [None]:
# Get feature importance

normalized_coeff = np.abs(reg.coef_/ np.linalg.norm(reg.coef_))
for i in range(len(train_features.columns)):
  print(train_features.columns[i],normalized_coeff[i])

In [None]:
# Bar chart of the normalized coefficient magnitude for every feature.
fig = plt.figure(figsize=(15, 8))
ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure
ax.bar(x=train_features.columns, height=normalized_coeff)
ax.set(xlabel="Features", ylabel="Relative Importance")
plt.show()

In [None]:
# Move the target and the previous sale price to the log scale to tame their
# right-skewed distributions. Plain log is safe here: rows with either price
# below 1 USD were filtered out earlier.
# The values are taken from the untouched split frames (same index as the
# feature frames), so re-running this cell never applies the log twice.
# The previous-price column is named 'prev-usdPrice' throughout the dataset.
train_labels = np.log(train_dataset['usdPrice'])
train_features['prev-usdPrice'] = np.log(train_dataset['prev-usdPrice'])

test_labels = np.log(test_dataset['usdPrice'])
test_features['prev-usdPrice'] = np.log(test_dataset['prev-usdPrice'])

In [None]:
# Refit the linear model, this time on the log-transformed prices.
reg = LinearRegression()
reg.fit(train_features, train_labels)

In [None]:
# Record the held-out score of the log-scale model (measured on log prices,
# so it is not on the same scale as the raw-price score).
log_test_score = reg.score(test_features, test_labels)
score['sklearn-reg-log-label'] = log_test_score

In [None]:
# Show the scores collected so far.
score

#### 2. PyCaret - Regression (AutoML Models)

In [None]:
# Display the cleaned frame before the PyCaret-specific preprocessing.
dataset

In [None]:
# Converting string to datetime
dataset['timestamp'] = pd.to_datetime(dataset['timestamp'], format='%Y-%m-%d')
dataset['prev-timestamp'] = pd.to_datetime(dataset['prev-timestamp'], format='%Y-%m-%d')
dataset['date'] = pd.to_datetime(dataset['date'], format='%Y-%m-%d')

In [None]:
# Converting datetime to unix
dataset['timestamp'] = dataset['timestamp'].apply(lambda x: int(datetime.timestamp(x))*1000)
dataset['prev-timestamp'] = dataset['prev-timestamp'].apply(lambda x: int(datetime.timestamp(x))*1000)
dataset['date'] = dataset['date'].apply(lambda x: int(datetime.timestamp(x))*1000)

In [None]:
# The 'project' column is not used as a model input; remove it.
dataset = dataset.drop(columns=['project'])

In [None]:
# Derived feature: the token's previous sale price relative to its project's
# previous average USD price.
dataset['popularity'] = dataset['prev-usdPrice'].div(dataset['prev-project-avgPriceUSD'])

In [None]:
# Display the frame that is handed to PyCaret.
dataset

In [None]:
# Create the PyCaret regression experiment with 'usdPrice' as the target;
# session_id fixes the experiment's random seed.
s = RegressionExperiment()
s.setup(data=dataset, target='usdPrice', session_id=123)

In [None]:
# Let PyCaret compare its candidate regressors and keep the returned model as `best`.
best = s.compare_models()

In [None]:
# Show which estimator (and hyperparameters) was selected.
print(best)

In [None]:
# Open PyCaret's evaluation view for the selected model.
s.evaluate_model(best)

In [None]:
# Residuals plot for the selected model.
s.plot_model(best, plot = 'residuals')

In [None]:
# Feature plot for the selected model.
s.plot_model(best, plot = 'feature')

In [None]:
# Generate predictions with the selected model; no data is passed, so PyCaret
# presumably scores its own hold-out split — confirm against the PyCaret docs.
s.predict_model(best)

In [None]:
# Persist the selected model under the name 'my_best_pipeline'.
s.save_model(best, 'my_best_pipeline')

In [None]:
# Reload the saved model by name and print it to confirm the round trip worked.
loaded_model = s.load_model('my_best_pipeline')
print(loaded_model)