This Notebook will create a cosine similarity matrix and output it so that we may use it for our ML project

In [1]:
#Import the libraries we need
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

Import NFT dataset. It can be downloaded from this URL : 
https://www.kaggle.com/datasets/hemil26/nft-collections-dataset?resource=download

In [2]:
data_frame = pd.read_csv('nft_sales.csv', error_bad_lines=False)

In [3]:
data_frame.head()

Unnamed: 0,Collections,Sales,Buyers,Txns,Owners
0,Axie Infinity,"$4,090,222,023",1790587,17670824,2130467
1,Bored Ape Yacht Club,"$2,439,754,017",12052,32670,6586
2,CryptoPunks,"$2,388,467,992",6076,22269,3804
3,Mutant Ape Yacht Club,"$1,744,822,678",23768,51775,13121
4,Art Blocks,"$1,310,734,558",33549,184470,36091


In [4]:
#Remove NaN rows
data_frame = data_frame.dropna()

Since we are using Cosine Similairty for our first recommendations model, we will need to find a way to convert our number inputs into text. We will do this the following way. We first will clean the data by converting each element in a column to a integer value. Then we will find the percentiles of the values in each column. Then we will replace the value with a string denoting what percentile the value belonged in.

In [5]:
#Remove Commas
data_frame["Owners"]=data_frame["Owners"].str.replace(',','')
#Remove empty rows

#turn into int
data_frame["Owners"] = data_frame["Owners"].astype(int)

In [6]:
#Get Percentiles
print(data_frame["Owners"].quantile(0.25))
print(data_frame["Owners"].quantile(0.50))
print(data_frame["Owners"].quantile(0.75))

3253.0
4411.0
5719.5


In [7]:
#Convert number of owners to text for cosine similarity. This is not the size of the collection. This is the number of unique wallets which own a part of the collection
def round_numbers_oweners(number_input):
    if number_input < 3253:
        return "small"
    elif number_input < 4411:
        return "medium"
    elif number_input < 5719:
        return "large"
    else:
        return "huge"

In [8]:
#Remove Commas
data_frame["Txns"]=data_frame["Txns"].str.replace(',','')
#Remove empty rows

#turn into int
data_frame["Txns"] = data_frame["Txns"].astype(int)

#Get Percentiles
print(data_frame["Txns"].quantile(0.25))
print(data_frame["Txns"].quantile(0.50))
print(data_frame["Txns"].quantile(0.75))

13111.0
18437.0
24606.5


In [9]:
def round_numbers_transactions(number_input):
    if number_input < 13111:
        return "pequeno"
    elif number_input < 18437:
        return "medio"
    elif number_input < 24606:
        return "grande"
    else:
        return "enorme"

In [10]:
#Remove Commas
data_frame["Buyers"]=data_frame["Buyers"].str.replace(',','')
#Remove empty rows

#turn into int
data_frame["Buyers"] = data_frame["Buyers"].astype(int)

#Get Percentiles
print(data_frame["Buyers"].quantile(0.25))
print(data_frame["Buyers"].quantile(0.50))
print(data_frame["Buyers"].quantile(0.75))

5324.0
8239.0
11204.5


In [11]:
def round_numbers_buyers(number_input):
    if number_input < 5324:
        return "jageun"
    elif number_input < 8239:
        return "junggan"
    elif number_input < 11204:
        return "keun"
    else:
        return "eomcheongnan"

In [12]:
#Remove Commas
data_frame["Sales"]=data_frame["Sales"].str.replace(',','')
#Remove Money Sign
data_frame["Sales"]=data_frame["Sales"].str.replace('$','')
#Remove empty rows

#turn into int
data_frame["Sales"] = data_frame["Sales"].astype('int64')

#Get Percentiles
print(data_frame["Sales"].quantile(0.25))
print(data_frame["Sales"].quantile(0.50))
print(data_frame["Sales"].quantile(0.75))

29761266.5
46444604.0
87674392.0


In [13]:
def round_numbers_sales(number_input):
    if number_input < 29761266:
        return "petit"
    elif number_input < 46444604:
        return "moyen"
    elif number_input < 87674392:
        return "enorme"
    else:
        return "massif"

In [14]:
# Convert Number input into text defining the percentile the value was in
data_frame["Sales"]= data_frame["Sales"].apply(round_numbers_sales)
data_frame["Buyers"]= data_frame["Buyers"].apply(round_numbers_buyers)
data_frame["Txns"]= data_frame["Txns"].apply(round_numbers_transactions)
data_frame["Owners"]= data_frame["Owners"].apply(round_numbers_oweners)

In [15]:
#This creates a new features column which has text from all of the other coloumns
data_frame["Features"] = data_frame["Collections"] + " " + (data_frame["Sales"]) + " " + (data_frame["Buyers"]) + " " + (data_frame["Txns"]) + " " + (data_frame["Owners"])

In [16]:
data_frame.head()

Unnamed: 0,Collections,Sales,Buyers,Txns,Owners,Features
0,Axie Infinity,massif,eomcheongnan,enorme,huge,Axie Infinity massif eomcheongnan enorme huge
1,Bored Ape Yacht Club,massif,eomcheongnan,enorme,huge,Bored Ape Yacht Club massif eomcheongnan enorm...
2,CryptoPunks,massif,junggan,grande,medium,CryptoPunks massif junggan grande medium
3,Mutant Ape Yacht Club,massif,eomcheongnan,enorme,huge,Mutant Ape Yacht Club massif eomcheongnan enor...
4,Art Blocks,massif,eomcheongnan,enorme,huge,Art Blocks massif eomcheongnan enorme huge


In [34]:
data_frame[0:100]

Unnamed: 0,Collections,Sales,Buyers,Txns,Owners,Features
0,Axie Infinity,massif,eomcheongnan,enorme,huge,Axie Infinity massif eomcheongnan enorme huge
1,Bored Ape Yacht Club,massif,eomcheongnan,enorme,huge,Bored Ape Yacht Club massif eomcheongnan enorm...
2,CryptoPunks,massif,junggan,grande,medium,CryptoPunks massif junggan grande medium
3,Mutant Ape Yacht Club,massif,eomcheongnan,enorme,huge,Mutant Ape Yacht Club massif eomcheongnan enor...
4,Art Blocks,massif,eomcheongnan,enorme,huge,Art Blocks massif eomcheongnan enorme huge
...,...,...,...,...,...,...
106,Kiwami,enorme,eomcheongnan,enorme,medium,Kiwami enorme eomcheongnan enorme medium
108,CryptoBatz by Ozzy Osbourne,enorme,eomcheongnan,grande,huge,CryptoBatz by Ozzy Osbourne enorme eomcheongna...
109,Lil Heroes,enorme,jageun,pequeno,medium,Lil Heroes enorme jageun pequeno medium
111,Crypto Bull Society,enorme,junggan,pequeno,medium,Crypto Bull Society enorme junggan pequeno medium


In [17]:
#convert text from features to matrix
cm = CountVectorizer().fit_transform(data_frame["Features"])

In [18]:
#Get the cosine similarity matrix from the count matrix
cs = cosine_similarity(cm)
print(cs)

[[1.         0.57735027 0.18257419 ... 0.         0.         0.16666667]
 [0.57735027 1.         0.15811388 ... 0.         0.         0.14433757]
 [0.18257419 0.15811388 1.         ... 0.36514837 0.18257419 0.18257419]
 ...
 [0.         0.         0.36514837 ... 1.         0.33333333 0.33333333]
 [0.         0.         0.18257419 ... 0.33333333 1.         0.33333333]
 [0.16666667 0.14433757 0.18257419 ... 0.33333333 0.33333333 1.        ]]


In [24]:
#Turn into pandas dataFrame
similarity_dataframe = pd.DataFrame(cs)

In [25]:
similarity_dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,221,222,223,224,225,226,227,228,229,230
0,1.0,0.57735,0.182574,0.57735,0.666667,0.730297,0.617213,0.547723,0.547723,0.547723,...,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.0,0.0,0.166667
1,0.57735,1.0,0.158114,0.875,0.57735,0.632456,0.534522,0.474342,0.474342,0.474342,...,0.0,0.0,0.0,0.0,0.0,0.0,0.158114,0.0,0.0,0.144338
2,0.182574,0.158114,1.0,0.158114,0.182574,0.2,0.169031,0.2,0.4,0.4,...,0.4,0.0,0.4,0.0,0.182574,0.2,0.2,0.365148,0.182574,0.182574
3,0.57735,0.875,0.158114,1.0,0.57735,0.632456,0.534522,0.474342,0.474342,0.474342,...,0.0,0.0,0.0,0.0,0.0,0.0,0.158114,0.0,0.0,0.144338
4,0.666667,0.57735,0.182574,0.57735,1.0,0.730297,0.617213,0.547723,0.547723,0.547723,...,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.0,0.0,0.166667


In [26]:
#Rename columns and rows
similarity_dataframe.columns = data_frame.Collections

In [27]:
similarity_dataframe.head()

Collections,Axie Infinity,Bored Ape Yacht Club,CryptoPunks,Mutant Ape Yacht Club,Art Blocks,Otherdeed,NBA Top Shot,Azuki,CloneX,Moonbirds,...,CryptoonGoonz,Deafbeef,Illuminati,Bastard Gan Punks V2,Fishy Fam,Potatoz,Mindblowon,Sipherian Surge,Wool Pouch,Los Muertos
0,1.0,0.57735,0.182574,0.57735,0.666667,0.730297,0.617213,0.547723,0.547723,0.547723,...,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.0,0.0,0.166667
1,0.57735,1.0,0.158114,0.875,0.57735,0.632456,0.534522,0.474342,0.474342,0.474342,...,0.0,0.0,0.0,0.0,0.0,0.0,0.158114,0.0,0.0,0.144338
2,0.182574,0.158114,1.0,0.158114,0.182574,0.2,0.169031,0.2,0.4,0.4,...,0.4,0.0,0.4,0.0,0.182574,0.2,0.2,0.365148,0.182574,0.182574
3,0.57735,0.875,0.158114,1.0,0.57735,0.632456,0.534522,0.474342,0.474342,0.474342,...,0.0,0.0,0.0,0.0,0.0,0.0,0.158114,0.0,0.0,0.144338
4,0.666667,0.57735,0.182574,0.57735,1.0,0.730297,0.617213,0.547723,0.547723,0.547723,...,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.0,0.0,0.166667


In [29]:
similarity_dataframe.index = data_frame.Collections

In [32]:
similarity_dataframe.head()

Collections,Axie Infinity,Bored Ape Yacht Club,CryptoPunks,Mutant Ape Yacht Club,Art Blocks,Otherdeed,NBA Top Shot,Azuki,CloneX,Moonbirds,...,CryptoonGoonz,Deafbeef,Illuminati,Bastard Gan Punks V2,Fishy Fam,Potatoz,Mindblowon,Sipherian Surge,Wool Pouch,Los Muertos
Collections,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Axie Infinity,1.0,0.57735,0.182574,0.57735,0.666667,0.730297,0.617213,0.547723,0.547723,0.547723,...,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.0,0.0,0.166667
Bored Ape Yacht Club,0.57735,1.0,0.158114,0.875,0.57735,0.632456,0.534522,0.474342,0.474342,0.474342,...,0.0,0.0,0.0,0.0,0.0,0.0,0.158114,0.0,0.0,0.144338
CryptoPunks,0.182574,0.158114,1.0,0.158114,0.182574,0.2,0.169031,0.2,0.4,0.4,...,0.4,0.0,0.4,0.0,0.182574,0.2,0.2,0.365148,0.182574,0.182574
Mutant Ape Yacht Club,0.57735,0.875,0.158114,1.0,0.57735,0.632456,0.534522,0.474342,0.474342,0.474342,...,0.0,0.0,0.0,0.0,0.0,0.0,0.158114,0.0,0.0,0.144338
Art Blocks,0.666667,0.57735,0.182574,0.57735,1.0,0.730297,0.617213,0.547723,0.547723,0.547723,...,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.0,0.0,0.166667


In [31]:
#Note : toexcel writes object to an Excel Sheet
similarity_dataframe.to_excel('CosineSimilairtyDataFrame.xlsx')

In [33]:
similarity_dataframe.to_csv('CosineSimilairtyDataFrame_csv.csv')