In [2]:
import pandas as pd
import numpy as np
import requests
import json

from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Loading Data

In [3]:
file_url = "https://raw.githubusercontent.com/TawerV10/dropshub/main/data/alphadrops.json?token=GHSAT0AAAAAACL3E3A5XMFFEA2XTEIM6A2AZNZBB6Q"
response = requests.get(file_url)
if response.status_code == 200:
    data = json.loads(response.text)
    print(f'Count: {len(data)}')
else:
  print(response.status_code)

Count: 138


In [4]:
df = pd.DataFrame(data)
df.shape

(138, 10)

In [5]:
df.head()

Unnamed: 0,title,tags,invest,network,status,description,strategy,website,discord,logo
0,Renzo,Restaking,3.2M,Ethereum,Mainnet,Renzo is a Liquid Restaking Token (LRT) and St...,✅ Stake ETH to obtain ezETH and earn Eigenlaye...,https://app.renzoprotocol.com/?ref=0x4bb12cc38...,,https://api.typedream.com/v0/document/public/1...
1,Butter,Dex,,Mantle,Mainnet,Butter Swap is the premier decentralized liqui...,✅ Make swaps to earn points & fishing attempts...,https://butter.xyz/s/0T1WCK,https://discord.com/invite/butterxyz,https://api.typedream.com/v0/document/public/1...
2,Parcl,Derivatives,11.6M,Solana,Mainnet,Parcl v3 is a perpetuals exchange designed for...,✅ Trade (1 point per $)\n✅ Provide liquidity (...,https://app.parcl.co/referrals,https://twitter.com/Parcl,https://api.typedream.com/v0/document/public/1...
3,Derivio,"Derivatives, Dex",,zkSync,Mainnet,Derivio is an ecosystem of derivative protocol...,✅ Trade perps\n✅ Trade binary options (predict...,https://derivio.xyz/,https://discord.gg/RYfV4ahPeQ,https://api.typedream.com/v0/document/public/1...
4,Ambient,"Dex, Defi",6.5M,"Ethereum, Scroll",Mainnet,Ambient (formerly CrocSwap) is a decentralized...,✅ Trade\n✅ Provide Liquidity,https://ambient.finance,https://discord.com/invite/ambient-finance,https://api.typedream.com/v0/document/public/1...


## Initial Data Analysis

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        138 non-null    object
 1   tags         133 non-null    object
 2   invest       62 non-null     object
 3   network      136 non-null    object
 4   status       130 non-null    object
 5   description  138 non-null    object
 6   strategy     138 non-null    object
 7   website      109 non-null    object
 8   discord      134 non-null    object
 9   logo         138 non-null    object
dtypes: object(10)
memory usage: 10.9+ KB


In [7]:
df.columns

Index(['title', 'tags', 'invest', 'network', 'status', 'description',
       'strategy', 'website', 'discord', 'logo'],
      dtype='object')

Adding new column as id for each project

In [8]:
df['id'] = range(1, len(df) + 1)
df.shape

(138, 11)

Handling only useful columns to build similarity model

In [9]:
columns = ['id', 'title', 'tags', 'invest', 'network', 'status', 'description', 'strategy']
df = df[columns]

In [10]:
df.head()

Unnamed: 0,id,title,tags,invest,network,status,description,strategy
0,1,Renzo,Restaking,3.2M,Ethereum,Mainnet,Renzo is a Liquid Restaking Token (LRT) and St...,✅ Stake ETH to obtain ezETH and earn Eigenlaye...
1,2,Butter,Dex,,Mantle,Mainnet,Butter Swap is the premier decentralized liqui...,✅ Make swaps to earn points & fishing attempts...
2,3,Parcl,Derivatives,11.6M,Solana,Mainnet,Parcl v3 is a perpetuals exchange designed for...,✅ Trade (1 point per $)\n✅ Provide liquidity (...
3,4,Derivio,"Derivatives, Dex",,zkSync,Mainnet,Derivio is an ecosystem of derivative protocol...,✅ Trade perps\n✅ Trade binary options (predict...
4,5,Ambient,"Dex, Defi",6.5M,"Ethereum, Scroll",Mainnet,Ambient (formerly CrocSwap) is a decentralized...,✅ Trade\n✅ Provide Liquidity


## Exploratory Data Analysis

Getting missed data

In [11]:
missed = df.isnull().sum()
missed[missed > 1]

tags        5
invest     76
network     2
status      8
dtype: int64

Checking for duplicates

In [12]:
df.duplicated().sum()

0

Merging text of columns to summary

In [13]:
merge_columns = ['tags', 'invest', 'network', 'status', 'description', 'strategy']
df['summary'] = df[merge_columns].apply(lambda row: ' '.join(row.dropna().astype(str).str.lower()), axis=1)

Cleaning from unneeded symbols

In [14]:
import re

to_replace = [',', '✅', '☑️', '🚨', '🔸', '\n']
def clean_text(row):
  for element in to_replace:
    row = row.replace(element, '')

  row = row[:-1] if row[-1] in ['.', ',', ':', '!', '?'] else row
  row = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', row) if 'https' in row or 'http' in row else row

  return row.replace('  ', ' ').strip()

In [15]:
df['summary'] = df['summary'].apply(clean_text)

In [16]:
for i in range(df.shape[0]):
  print(i + 1, df['summary'][i])

1 restaking 3.2m ethereum mainnet renzo is a liquid restaking token (lrt) and strategy manager for eigenlayer. it is the interface to the eigenlayer ecosystem securing actively validated services (avss) and offering a higher yield than eth staking. stake eth to obtain ezeth and earn eigenlayer and renzo points
2 dex mantle mainnet butter swap is the premier decentralized liquidity venue on mantle the ethereum layer-2 redefining scalability with its modular architecture and decentralized data availability component. make swaps to earn points & fishing attempts use the fishing attempts to earn mnt points will be converted into butter tokens
3 derivatives 11.6m solana mainnet parcl v3 is a perpetuals exchange designed for real estate synthetics. it supports cross margined perps trading on various real estate markets. lps add liquidity to a single lp pool per exchange where they take on trader pnl as well as earn trading fees. trade (1 point per $) provide liquidity (3 points per $) bind r

In [18]:
df.head()

Unnamed: 0,id,title,tags,invest,network,status,description,strategy,summary
0,1,Renzo,Restaking,3.2M,Ethereum,Mainnet,Renzo is a Liquid Restaking Token (LRT) and St...,✅ Stake ETH to obtain ezETH and earn Eigenlaye...,restaking 3.2m ethereum mainnet renzo is a liq...
1,2,Butter,Dex,,Mantle,Mainnet,Butter Swap is the premier decentralized liqui...,✅ Make swaps to earn points & fishing attempts...,dex mantle mainnet butter swap is the premier ...
2,3,Parcl,Derivatives,11.6M,Solana,Mainnet,Parcl v3 is a perpetuals exchange designed for...,✅ Trade (1 point per $)\n✅ Provide liquidity (...,derivatives 11.6m solana mainnet parcl v3 is a...
3,4,Derivio,"Derivatives, Dex",,zkSync,Mainnet,Derivio is an ecosystem of derivative protocol...,✅ Trade perps\n✅ Trade binary options (predict...,derivatives dex zksync mainnet derivio is an e...
4,5,Ambient,"Dex, Defi",6.5M,"Ethereum, Scroll",Mainnet,Ambient (formerly CrocSwap) is a decentralized...,✅ Trade\n✅ Provide Liquidity,dex defi 6.5m ethereum scroll mainnet ambient ...


In [25]:
new_df = df[['id', 'title', 'summary']]

## Model Building

Applying an algorithm for suffix stripping

In [26]:
ps = PorterStemmer()

In [27]:
def stem(text):
  lst = []
  for word in text.split():
    lst.append(ps.stem(word))

  return ' '.join(lst)

In [28]:
new_df['summary'][0]

'restaking 3.2m ethereum mainnet renzo is a liquid restaking token (lrt) and strategy manager for eigenlayer. it is the interface to the eigenlayer ecosystem securing actively validated services (avss) and offering a higher yield than eth staking. stake eth to obtain ezeth and earn eigenlayer and renzo points'

In [29]:
stem(new_df['summary'][0])

'restak 3.2m ethereum mainnet renzo is a liquid restak token (lrt) and strategi manag for eigenlayer. it is the interfac to the eigenlay ecosystem secur activ valid servic (avss) and offer a higher yield than eth staking. stake eth to obtain ezeth and earn eigenlay and renzo point'

In [None]:
new_df['summary'] = new_df['summary'].apply(stem)

Formating datasets to vectors rows are samples and columns are common words

In [31]:
cv = CountVectorizer(max_features=1000, stop_words='english')
vectors = cv.fit_transform(new_df['summary']).toarray()
vectors.shape

(138, 1000)

In [33]:
cv.get_feature_names_out()[:100]

array(['0m', '10', '1014306106', '10m', '11', '119m', '13m', '16', '173',
       '17m', '18', '18m', '19', '1m', '2024', '20m', '21', '215m',
       '225m', '22m', '23', '25m', '273m', '29m', '2m', '3000', '30m',
       '33m', '3m', '458m', '4m', '5000', '5m', '64', '6m', '726', '726m',
       '72m', '7m', '80m', '8m', '95', '9m', 'abstract', 'access',
       'accomplish', 'account', 'achiev', 'activ', 'add', 'addit',
       'address', 'adopt', 'advanc', 'advantag', 'aggreg', 'ai',
       'airdrop', 'algorithm', 'allbridg', 'allow', 'alpha', 'alphadrop',
       'ambassador', 'ambient', 'amitej', 'amm', 'analysi', 'analyt',
       'ancient8', 'android', 'ani', 'announc', 'anoth', 'app', 'applic',
       'application', 'applications', 'apps', 'apr', 'apto', 'arbitrari',
       'arbitrum', 'architectur', 'argent', 'arraki', 'articl', 'asset',
       'assets', 'assets2', 'atlendi', 'attempt', 'attention', 'auction',
       'aurora', 'autom', 'autonom', 'avail', 'avalanch', 'averag'],
     

Getting similarity of each project to others

In [34]:
similarity = cosine_similarity(vectors)
similarity.shape

(138, 138)

Evaluating model

In [41]:
SIMILAR_PROJECT_COUNT = 10

In [42]:
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x: x[1])[1:SIMILAR_PROJECT_COUNT+1]

[(32, 0.5146502354656655),
 (30, 0.3674234614174767),
 (8, 0.3423265984407289),
 (14, 0.27386127875258304),
 (19, 0.22206996305928162),
 (122, 0.21081851067789195),
 (46, 0.1863389981249825),
 (23, 0.18195633571851272),
 (1, 0.17712297710801905),
 (104, 0.16903085094570333)]

In [43]:
for i in range(similarity.shape[0]):
  project_title = df['title'][i]
  similar_projects_ids = [id_prob[0] for id_prob in sorted(list(enumerate(similarity[i])), reverse=True, key=lambda x: x[1])[1:SIMILAR_PROJECT_COUNT+1]]
  try:
    similar_projects = [df.query('id == @id + 1')['title'].iloc[0] for id in similar_projects_ids]

    print(project_title, similar_projects)
  except Exception as ex:
    print(ex, similar_projects_ids)

Renzo ['Eigen Layer', 'Ether.fi', 'Kelp DAO', 'Swell', 'Helio Protocol', 'Charm', 'Rage Trade', 'zkSync', 'Butter', 'Volmex Finance']
Butter ['Hyperliquid', 'Charm', 'Ekubo', 'Volmex Finance', 'ChainHop', 'Metamask', 'Tevaera', 'StarkDefi', 'Primex Finance', 'DefiLlama']
Parcl ['Hyperliquid', 'KiloEx', 'Synquote', 'Drift Protocol', 'Derivio', 'Ekubo', 'Arrakis Finance', 'Scroll', 'Rage Trade', 'Ambient']
Derivio ['KiloEx', 'Synquote', 'Drift Protocol', 'Parcl', 'Rage Trade', 'Vest Exchange', 'Scroll', 'Polynomial', 'Hyperliquid', 'ZKX Protocol']
Ambient ['Ekubo', 'Swaap', 'Charm', 'Maverick', 'Volmex Finance', 'Primitive Finance', 'Primex Finance', 'Arrakis Finance', 'Bebop', 'Shell Protocol']
BracketX ['Shell Protocol', 'Rage Trade', 'DeFi Saver', 'Synquote', 'KiloEx', 'Voltz Protocol', 'Diamond Protocol', 'Dolomite', 'Odos', 'Vest Exchange']
Nostra Finance ['Sentiment', 'Timeswap', 'Dolomite', 'Drift Protocol', 'Marginfi', 'Volmex Finance', 'Helio Protocol', 'Davos Protocol', 'Ekubo'

In [None]:
similarity_df = pd.DataFrame(columns=['id', 'similar_projects_ids'])

for i in range(similarity.shape[0]):
    id_value = df['id'][i]
    similar_projects_ids = ', '.join([str(id_prob[0]) for id_prob in sorted(enumerate(similarity[i]), reverse=True, key=lambda x: x[1])[1:SIMILAR_PROJECT_COUNT+1]])

    similarity_df = similarity_df.append({'id': id_value, 'similar_projects_ids': similar_projects_ids}, ignore_index=True)

In [46]:
similarity_df.head()

Unnamed: 0,id,similar_projects_ids
0,1,"32, 30, 8, 14, 19, 122, 46, 23, 1, 104"
1,2,"12, 122, 10, 104, 100, 53, 131, 106, 136, 135"
2,3,"12, 21, 65, 18, 3, 10, 84, 16, 46, 4"
3,4,"21, 65, 18, 2, 46, 113, 16, 78, 12, 98"
4,5,"10, 72, 122, 37, 104, 97, 136, 84, 63, 124"


In [47]:
similarity_df.to_csv('similarity.csv', index=False)