In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from gensim.models import Word2Vec
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
df1 = pd.read_csv("Cleaned Data/Cleaned_Funding_Details.csv")
df1.head()

Unnamed: 0,Name,Funding Round,First Funding Date,Raised Amount,Valuation Amount,Investors
0,Raiinmaker,Seed,3 Apr 2024,$ 7.50M,,"Jump Capital, Cypher Capital, MEXC, Gate.io"
1,Satoshisync,Undisclosed,3 Apr 2024,,,"ZBS CAPITAL, Maven Capital, MH Ventures, Chain..."
2,Legitimate,Seed,3 Apr 2024,$ 4.30M,,"Lemniscap, Sfermion, Draper Associates, Verbal"
3,Mangata,Undisclosed,3 Apr 2024,$ 5.00M,$ 80M,"Polychain Capital, CMS Holdings, Master Ventur..."
4,Bonus-Block,Undisclosed,2 Apr 2024,$ 2.30M,,"ZBS CAPITAL, Rarestone Capital, Faculty Group,..."


In [3]:
df2 = pd.read_csv("Cleaned Data/Cleaned_Overview_Details.csv")
df2.head()

Unnamed: 0,Crypto Name,Service,Total Raised,Description,Links
0,Raiinmaker,Blockchain Service,7500000.0,Raiinmaker is a project focused on acceleratin...,[[[['https://cryptorank.io/categories/blockcha...
1,Satoshisync,DeFi,4300000.0,SatoshiSync is The Permissionless Chain Agnost...,[[[['https://cryptorank.io/categories/blockcha...
2,Legitimate,Blockchain Infrastructure,10600000.0,Legitimate presents an on-chain framework meti...,[[[['https://cryptorank.io/categories/blockcha...
3,Mangata,Blockchain Infrastructure,300000.0,Gasp offers native cross-chain swaps without r...,"[[[['https://cryptorank.io/categories/chain'],..."
4,Bonus-Block,Blockchain Service,4000000.0,BonusBlock is a Web3 marketing platform that i...,"[[[['https://www.agora.finance/', 'https://twi..."


In [4]:
data = pd.merge(df1,df2, left_on = ["Name"], right_on = ["Crypto Name"])
data.head()

Unnamed: 0,Name,Funding Round,First Funding Date,Raised Amount,Valuation Amount,Investors,Crypto Name,Service,Total Raised,Description,Links
0,Raiinmaker,Seed,3 Apr 2024,$ 7.50M,,"Jump Capital, Cypher Capital, MEXC, Gate.io",Raiinmaker,Blockchain Service,7500000.0,Raiinmaker is a project focused on acceleratin...,[[[['https://cryptorank.io/categories/blockcha...
1,Satoshisync,Undisclosed,3 Apr 2024,,,"ZBS CAPITAL, Maven Capital, MH Ventures, Chain...",Satoshisync,DeFi,4300000.0,SatoshiSync is The Permissionless Chain Agnost...,[[[['https://cryptorank.io/categories/blockcha...
2,Legitimate,Seed,3 Apr 2024,$ 4.30M,,"Lemniscap, Sfermion, Draper Associates, Verbal",Legitimate,Blockchain Infrastructure,10600000.0,Legitimate presents an on-chain framework meti...,[[[['https://cryptorank.io/categories/blockcha...
3,Mangata,Undisclosed,3 Apr 2024,$ 5.00M,$ 80M,"Polychain Capital, CMS Holdings, Master Ventur...",Mangata,Blockchain Infrastructure,300000.0,Gasp offers native cross-chain swaps without r...,"[[[['https://cryptorank.io/categories/chain'],..."
4,Bonus-Block,Undisclosed,2 Apr 2024,$ 2.30M,,"ZBS CAPITAL, Rarestone Capital, Faculty Group,...",Bonus-Block,Blockchain Service,4000000.0,BonusBlock is a Web3 marketing platform that i...,"[[[['https://www.agora.finance/', 'https://twi..."


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                40 non-null     object 
 1   Funding Round       40 non-null     object 
 2   First Funding Date  40 non-null     object 
 3   Raised Amount       35 non-null     object 
 4   Valuation Amount    3 non-null      object 
 5   Investors           39 non-null     object 
 6   Crypto Name         40 non-null     object 
 7   Service             40 non-null     object 
 8   Total Raised        40 non-null     float64
 9   Description         40 non-null     object 
 10  Links               38 non-null     object 
dtypes: float64(1), object(10)
memory usage: 3.6+ KB


In [6]:
data['First Funding Date'] = pd.to_datetime(data['First Funding Date'])

# Extract year, month, and day components
data['First Funding Year'] = data['First Funding Date'].dt.year
data['First Funding Month'] = data['First Funding Date'].dt.month
data['First Funding Day'] = data['First Funding Date'].dt.day

In [7]:
data['Valuation Amount'] = data['Valuation Amount'].str.extract(r'\$ (\d+\.?\d*)M').astype(float) * 1e6

In [8]:
data.head()

Unnamed: 0,Name,Funding Round,First Funding Date,Raised Amount,Valuation Amount,Investors,Crypto Name,Service,Total Raised,Description,Links,First Funding Year,First Funding Month,First Funding Day
0,Raiinmaker,Seed,2024-04-03,$ 7.50M,,"Jump Capital, Cypher Capital, MEXC, Gate.io",Raiinmaker,Blockchain Service,7500000.0,Raiinmaker is a project focused on acceleratin...,[[[['https://cryptorank.io/categories/blockcha...,2024,4,3
1,Satoshisync,Undisclosed,2024-04-03,,,"ZBS CAPITAL, Maven Capital, MH Ventures, Chain...",Satoshisync,DeFi,4300000.0,SatoshiSync is The Permissionless Chain Agnost...,[[[['https://cryptorank.io/categories/blockcha...,2024,4,3
2,Legitimate,Seed,2024-04-03,$ 4.30M,,"Lemniscap, Sfermion, Draper Associates, Verbal",Legitimate,Blockchain Infrastructure,10600000.0,Legitimate presents an on-chain framework meti...,[[[['https://cryptorank.io/categories/blockcha...,2024,4,3
3,Mangata,Undisclosed,2024-04-03,$ 5.00M,80000000.0,"Polychain Capital, CMS Holdings, Master Ventur...",Mangata,Blockchain Infrastructure,300000.0,Gasp offers native cross-chain swaps without r...,"[[[['https://cryptorank.io/categories/chain'],...",2024,4,3
4,Bonus-Block,Undisclosed,2024-04-02,$ 2.30M,,"ZBS CAPITAL, Rarestone Capital, Faculty Group,...",Bonus-Block,Blockchain Service,4000000.0,BonusBlock is a Web3 marketing platform that i...,"[[[['https://www.agora.finance/', 'https://twi...",2024,4,2


In [37]:
processed_data = data.drop(columns=["Crypto Name", "Name", "Raised Amount", "First Funding Date", "Valuation Amount", "Links"])

In [38]:
processed_data.fillna("not available", inplace=True)
encoder = OneHotEncoder()
processed_data = pd.get_dummies(processed_data, columns= ["Service", "Funding Round"], dtype = int)
processed_data["Inverstors_and_desc"] = processed_data["Investors"]+" "+processed_data["Description"]
processed_data = processed_data.drop(columns=["Investors","Description"])
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# List of numeric column names
numeric_columns = ['Total Raised', 'First Funding Year', 'First Funding Month',
       'First Funding Day', 'Service_Blockchain Infrastructure',
       'Service_Blockchain Service', 'Service_CeFi', 'Service_Chain',
       'Service_DeFi', 'Service_GameFi', 'Service_Social',
       'Service_Stablecoin', 'Funding Round_Angel', 'Funding Round_Pre-Seed',
       'Funding Round_Pre-Series A', 'Funding Round_Seed',
       'Funding Round_Series A', 'Funding Round_Strategic',
       'Funding Round_Undisclosed']

# Apply min-max normalization to each numeric column
processed_data[numeric_columns] = scaler.fit_transform(processed_data[numeric_columns])

index_value = 2
test_data = processed_data[processed_data.index == index_value]
processed_data = processed_data[processed_data.index != index_value]
processed_data.head()

Unnamed: 0,Total Raised,First Funding Year,First Funding Month,First Funding Day,Service_Blockchain Infrastructure,Service_Blockchain Service,Service_CeFi,Service_Chain,Service_DeFi,Service_GameFi,Service_Social,Service_Stablecoin,Funding Round_Angel,Funding Round_Pre-Seed,Funding Round_Pre-Series A,Funding Round_Seed,Funding Round_Series A,Funding Round_Strategic,Funding Round_Undisclosed,Inverstors_and_desc
0,0.08743,0.0,1.0,0.068966,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,"Jump Capital, Cypher Capital, MEXC, Gate.io Ra..."
1,0.04975,0.0,1.0,0.068966,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"ZBS CAPITAL, Maven Capital, MH Ventures, Chain..."
3,0.002649,0.0,1.0,0.068966,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"Polychain Capital, CMS Holdings, Master Ventur..."
4,0.046217,0.0,1.0,0.034483,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"ZBS CAPITAL, Rarestone Capital, Faculty Group,..."
5,0.140418,0.0,1.0,0.034483,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"Amber Group, Borderless Capital, SNZ Holding, ..."


In [27]:
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 1 to 39
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Total Raised                       39 non-null     float64
 1   First Funding Year                 39 non-null     float64
 2   First Funding Month                39 non-null     float64
 3   First Funding Day                  39 non-null     float64
 4   Service_Blockchain Infrastructure  39 non-null     float64
 5   Service_Blockchain Service         39 non-null     float64
 6   Service_CeFi                       39 non-null     float64
 7   Service_Chain                      39 non-null     float64
 8   Service_DeFi                       39 non-null     float64
 9   Service_GameFi                     39 non-null     float64
 10  Service_Social                     39 non-null     float64
 11  Service_Stablecoin                 39 non-null     float64
 

In [28]:
processed_data.columns

Index(['Total Raised', 'First Funding Year', 'First Funding Month',
       'First Funding Day', 'Service_Blockchain Infrastructure',
       'Service_Blockchain Service', 'Service_CeFi', 'Service_Chain',
       'Service_DeFi', 'Service_GameFi', 'Service_Social',
       'Service_Stablecoin', 'Funding Round_Angel', 'Funding Round_Pre-Seed',
       'Funding Round_Pre-Series A', 'Funding Round_Seed',
       'Funding Round_Series A', 'Funding Round_Strategic',
       'Funding Round_Undisclosed', 'Inverstors_and_desc'],
      dtype='object')

### RECOMMENDAR SYSTEM

In [22]:
# Train Word2Vec model on textual descriptions
texts = [text.split() for text in processed_data['Inverstors_and_desc']]
embedding_dim = 100  # Adjust as needed
word2vec_model = Word2Vec(texts, vector_size=embedding_dim, window=5, min_count=1, workers=4)

# Create embeddings for textual descriptions
def create_embeddings(description):
    words = description.split()
    embeddings = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embedding_dim)

# Create feature matrix with embeddings and numerical features
feature_matrix = []
for index, entry in processed_data.iterrows():
    embeddings = create_embeddings(entry['Inverstors_and_desc'])
    numerical_features = [
        entry['Total Raised'],
        entry['First Funding Year'],
        entry['First Funding Month'],
        entry['First Funding Day'],
        entry['Service_Blockchain Infrastructure'],
        entry['Service_Blockchain Service'],
        entry['Service_CeFi'],
        entry['Service_Chain'],
        entry['Service_DeFi'],
        entry['Service_GameFi'],
        entry['Service_Social'],
        entry['Service_Stablecoin'],
        entry['Funding Round_Angel'],
        entry['Funding Round_Pre-Seed'],
        entry['Funding Round_Pre-Series A'],
        entry['Funding Round_Seed'],
        entry['Funding Round_Series A'],
        entry['Funding Round_Strategic'],
        entry['Funding Round_Undisclosed']
    ]
    feature_vector = np.concatenate([embeddings, numerical_features])
    feature_matrix.append(feature_vector)

feature_matrix = np.array(feature_matrix)

# Perform SVD on the feature matrix
U, sigma, Vt = np.linalg.svd(feature_matrix)

# Determine the value of k based on explained variance ratio
explained_variance_ratio = np.cumsum(sigma ** 2) / np.sum(sigma ** 2)
threshold = 0.95  # Adjust as needed
k = np.argmax(explained_variance_ratio >= threshold) + 1

# Reconstruction of feature matrix
reconstructed_feature_matrix = np.dot(U[:, :k], np.dot(np.diag(sigma[:k]), Vt[:k, :]))

def match(test_data, processed_data, top_n=5):
    all_top_indices = []
    all_top_similarities = []
    important_columns_per_match = []
    for index, entry in test_data.iterrows():
        embeddings = create_embeddings(entry['Inverstors_and_desc'])
        numerical_features = [
            entry['Total Raised'],
            entry['First Funding Year'],
            entry['First Funding Month'],
            entry['First Funding Day'],
            entry['Service_Blockchain Infrastructure'],
            entry['Service_Blockchain Service'],
            entry['Service_CeFi'],
            entry['Service_Chain'],
            entry['Service_DeFi'],
            entry['Service_GameFi'],
            entry['Service_Social'],
            entry['Service_Stablecoin'],
            entry['Funding Round_Angel'],
            entry['Funding Round_Pre-Seed'],
            entry['Funding Round_Pre-Series A'],
            entry['Funding Round_Seed'],
            entry['Funding Round_Series A'],
            entry['Funding Round_Strategic'],
            entry['Funding Round_Undisclosed']
        ]
        feature_vector = np.concatenate([embeddings, numerical_features])
        
        # Calculate similarities between the new data and existing data
        similarities = np.dot(reconstructed_feature_matrix, feature_vector.T)
        
        # Find top n similar services
        top_indices = np.argsort(similarities)[::-1][:top_n]
        # print(top_indices)
        top_similarities = similarities[top_indices]
        # print(top_similarities)
        
        all_top_indices.extend(top_indices)
        all_top_similarities.extend(top_similarities)
        # print(all_top_similarities)

        

    # Find top 5 unique indices based on the top 5 similarity scores
    sorted_indices = np.argsort(all_top_similarities)[::-1]
    unique_top_indices = [all_top_indices[idx] for idx in sorted_indices][:top_n]
    # print(unique_top_indices)
    
    return unique_top_indices

top_indices = match(test_data, processed_data, top_n=5)
print("Top 5 Matches for the new data (indices):", top_indices)



Top 5 Matches for the new data (indices): [13, 3, 7, 11, 16]


In [15]:
recommended_df = data.iloc[top_indices]
recommended_df

Unnamed: 0,Name,Funding Round,First Funding Date,Raised Amount,Valuation Amount,Investors,Crypto Name,Service,Total Raised,Description,Links,First Funding Year,First Funding Month,First Funding Day
11,Deeplink,Undisclosed,2024-04-02,$ 8.00M,,"Gobi Partners, Bigger Than Race, DeepBrainChain",Deeplink,Blockchain Infrastructure,4600000.0,DeepLink emerges as a pioneering Web3 Cloud Ga...,[[[['https://cryptorank.io/categories/blockcha...,2024,4,2
15,Bsx,Undisclosed,2024-04-01,,,"CMS Holdings, Kyros Ventures, Saison Capital, ...",Bsx,DeFi,10000000.0,BSX is building a decentralized limit order bo...,"[[[['https://cryptorank.io/categories/defi'], ...",2024,4,1
13,Metados,Undisclosed,2024-04-02,$ 2.45M,,"Avalon Wealth Club, Kangaroo Capital, Brainbox",Metados,GameFi,2100000.0,MetaDos is Free-to-Ply-to-Own Esports Battle R...,"[[[['https://cryptorank.io/categories/defi'], ...",2024,4,2
7,Camp-Network,Seed,2024-04-02,$ 4.00M,,"Maven 11 Capital, OKX Ventures, HTX Ventures, ...",Camp-Network,Chain,5000000.0,Camp Network is a modular Layer 2 (L2) solutio...,[[[['https://cryptorank.io/categories/blockcha...,2024,4,2
1,Satoshisync,Undisclosed,2024-04-03,,,"ZBS CAPITAL, Maven Capital, MH Ventures, Chain...",Satoshisync,DeFi,4300000.0,SatoshiSync is The Permissionless Chain Agnost...,[[[['https://cryptorank.io/categories/blockcha...,2024,4,3


In [75]:
recommended_df = data.iloc[top_indices]
recommended_df

Unnamed: 0,Name,Funding Round,First Funding Date,Raised Amount,Valuation Amount,Investors,Crypto Name,Service,Total Raised,Description,Links,First Funding Year,First Funding Month,First Funding Day
20,Sophon-Xyz,Undisclosed,2024-03-29,$ 10.00M,,"The Spartan Group, OKX Ventures, Maven 11 Capi...",Sophon-Xyz,Chain,1100000.0,Sophon is a modular blockchain.,[[[['https://cryptorank.io/categories/blockcha...,2024,3,29
24,Taproot-Labs,Undisclosed,2024-03-28,,,"Negentropy Capital, BitValue Capital, Hussein ...",Taproot-Labs,Chain,10000000.0,Taproot Labs is innovating in the Bitcoin ecos...,"[[[['https://elixir.games/', 'https://twitter....",2024,3,28
19,Arrow-Markets,Series A,2024-03-30,$ 4.00M,,"Delphi Digital, Framework Ventures",Arrow-Markets,DeFi,85000000.0,Arrow Markets represents the next generation o...,[[[['https://cryptorank.io/categories/gamefi']...,2024,3,30
21,Biconomy,Strategic,2024-03-28,,,"Balaji Srinivasan, ConsenSys, Jump Capital, Bo...",Biconomy,Blockchain Service,20500000.0,Biconomy provides plug-n-play APIs to make web...,"[[[['https://cryptorank.io/categories/chain'],...",2024,3,28
5,Iotex,Undisclosed,2024-04-02,$ 50.00M,,"Amber Group, Borderless Capital, SNZ Holding, ...",Iotex,Chain,12000000.0,IoTeX was founded in 2017 and operates as an o...,[[[['https://cryptorank.io/categories/gamefi']...,2024,4,2


In [76]:
input_df = data.iloc[list(test_data.index)]
input_df

Unnamed: 0,Name,Funding Round,First Funding Date,Raised Amount,Valuation Amount,Investors,Crypto Name,Service,Total Raised,Description,Links,First Funding Year,First Funding Month,First Funding Day
37,Peaq,Undisclosed,2024-03-27,$ 15.00M,,"Borderless Capital, Generative Ventures, The S...",Peaq,Chain,75000.0,Kima disrupts traditional interoperability sol...,"[[[['https://cryptorank.io/categories/defi'], ...",2024,3,27
38,Metablox,Strategic,2024-03-26,$ 5.00M,,"Volt Capital, Anagram Crypto, IoTeX, SNZ Holding",Metablox,Blockchain Service,3000000.0,MyPrize is a web3 gaming startup dedicated to ...,,2024,3,26
39,Kimafinance,Strategic,2024-03-26,,,,Kimafinance,DeFi,17000000.0,0G is a Modular AI Chain featuring a scalable ...,,2024,3,26


### Weighted Similarity

In [44]:
# Train Word2Vec model on textual descriptions
texts = [text.split() for text in processed_data['Inverstors_and_desc']]
embedding_dim = 100  # Adjust as needed
word2vec_model = Word2Vec(texts, vector_size=embedding_dim, window=5, min_count=1, workers=4)

# Create embeddings for textual descriptions
def create_embeddings(description):
    words = description.split()
    embeddings = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embedding_dim)

# Create feature matrix with embeddings and numerical features
feature_matrix = []
for index, entry in processed_data.iterrows():
    embeddings = create_embeddings(entry['Inverstors_and_desc'])
    numerical_features = [
        entry['Total Raised'],
        entry['First Funding Year'],
        entry['First Funding Month'],
        entry['First Funding Day'],
        entry['Service_Blockchain Infrastructure'],
        entry['Service_Blockchain Service'],
        entry['Service_CeFi'],
        entry['Service_Chain'],
        entry['Service_DeFi'],
        entry['Service_GameFi'],
        entry['Service_Social'],
        entry['Service_Stablecoin'],
        entry['Funding Round_Angel'],
        entry['Funding Round_Pre-Seed'],
        entry['Funding Round_Pre-Series A'],
        entry['Funding Round_Seed'],
        entry['Funding Round_Series A'],
        entry['Funding Round_Strategic'],
        entry['Funding Round_Undisclosed']
    ]
    feature_vector = np.concatenate([embeddings, numerical_features])
    feature_matrix.append(feature_vector)

feature_matrix = np.array(feature_matrix)
print(len(feature_matrix[0]))

# Perform SVD on the feature matrix
U, sigma, Vt = np.linalg.svd(feature_matrix)

# Determine the value of k based on explained variance ratio
explained_variance_ratio = np.cumsum(sigma ** 2) / np.sum(sigma ** 2)
threshold = 0.95  # Adjust as needed
k = np.argmax(explained_variance_ratio >= threshold) + 1

# Reconstruction of feature matrix
reconstructed_feature_matrix = np.dot(U[:, :k], np.dot(np.diag(sigma[:k]), Vt[:k, :]))


def match(test_data, processed_data, column_weights, top_n=5):
    all_top_indices = []
    all_top_similarities = []

    desc_shrey = "MetaBlox is delivering enterprise-grade WiFi roaming to the public worldwide, ensuring uninterrupted, low-cost access to Web3 and the Metaverse compared to cellular services, as well as enhanced network security and an exceptional overall experience."
    important_columns_per_match = []
    for index, entry in test_data.iterrows():
        embeddings = create_embeddings(desc_shrey)
        numerical_features = [
            entry['Total Raised'] * column_weights['Total Raised'],
            entry['First Funding Year'] * column_weights['First Funding Year'],
            entry['First Funding Month'] * column_weights['First Funding Month'],
            entry['First Funding Day'] * column_weights['First Funding Day'],
            entry['Service_Blockchain Infrastructure'] * column_weights['Service_Blockchain Infrastructure'],
            entry['Service_Blockchain Service'] * column_weights['Service_Blockchain Service'],
            entry['Service_CeFi'] * column_weights['Service_CeFi'],
            entry['Service_Chain'] * column_weights['Service_Chain'],
            entry['Service_DeFi'] * column_weights['Service_DeFi'],
            entry['Service_GameFi'] * column_weights['Service_GameFi'],
            entry['Service_Social'] * column_weights['Service_Social'],
            entry['Service_Stablecoin'] * column_weights['Service_Stablecoin'],
            entry['Funding Round_Angel'] * column_weights['Funding Round_Angel'],
            entry['Funding Round_Pre-Seed'] * column_weights['Funding Round_Pre-Seed'],
            entry['Funding Round_Pre-Series A'] * column_weights['Funding Round_Pre-Series A'],
            entry['Funding Round_Seed'] * column_weights['Funding Round_Seed'],
            entry['Funding Round_Series A'] * column_weights['Funding Round_Series A'],
            entry['Funding Round_Strategic'] * column_weights['Funding Round_Strategic'],
            entry['Funding Round_Undisclosed'] * column_weights['Funding Round_Undisclosed']
        ]
        feature_vector = np.concatenate([embeddings, numerical_features])
        
        # Calculate similarities between the new data and existing data
        similarities = np.dot(reconstructed_feature_matrix, feature_vector.T)
        
        # Find top n similar services
        top_indices = np.argsort(similarities)[::-1][:top_n]
        top_similarities = similarities[top_indices]
        
        all_top_indices.extend(top_indices)
        all_top_similarities.extend(top_similarities)

    # Find top n unique indices based on the top n similarity scores
    sorted_indices = np.argsort(all_top_similarities)[::-1]
    unique_top_indices = [all_top_indices[idx] for idx in sorted_indices][:top_n]
    
    return unique_top_indices

# Define column weights
column_weights = {
    'Total Raised': 0.8,
    'First Funding Year': 0.6,
    'First Funding Month': 0.6,
    'First Funding Day': 0.6,
    'Service_Blockchain Infrastructure': 0.6,
    'Service_Blockchain Service': 0.6,
    'Service_CeFi': 1,
    'Service_Chain': 0.6,
    'Service_DeFi': 0.6,
    'Service_GameFi': 0.6,
    'Service_Social': 0.6,
    'Service_Stablecoin': 0.6,
    'Funding Round_Angel': 0.6,
    'Funding Round_Pre-Seed': 1,
    'Funding Round_Pre-Series A': 0.6,
    'Funding Round_Seed': 0.6,
    'Funding Round_Series A': 0.6,
    'Funding Round_Strategic': 0.6,
    'Funding Round_Undisclosed': 0.6
}

# Call the match function with column weights
top_indices = match(test_data, processed_data, column_weights, top_n=5)
print("Top 5 Matches for the new data (indices):", top_indices)


119
Top 5 Matches for the new data (indices): [7, 22, 0, 13, 15]


In [24]:
recommended_df = data.iloc[top_indices]
recommended_df

Unnamed: 0,Name,Funding Round,First Funding Date,Raised Amount,Valuation Amount,Investors,Crypto Name,Service,Total Raised,Description,Links,First Funding Year,First Funding Month,First Funding Day
13,Metados,Undisclosed,2024-04-02,$ 2.45M,,"Avalon Wealth Club, Kangaroo Capital, Brainbox",Metados,GameFi,2100000.0,MetaDos is Free-to-Ply-to-Own Esports Battle R...,"[[[['https://cryptorank.io/categories/defi'], ...",2024,4,2
3,Mangata,Undisclosed,2024-04-03,$ 5.00M,80000000.0,"Polychain Capital, CMS Holdings, Master Ventur...",Mangata,Blockchain Infrastructure,300000.0,Gasp offers native cross-chain swaps without r...,"[[[['https://cryptorank.io/categories/chain'],...",2024,4,3
7,Camp-Network,Seed,2024-04-02,$ 4.00M,,"Maven 11 Capital, OKX Ventures, HTX Ventures, ...",Camp-Network,Chain,5000000.0,Camp Network is a modular Layer 2 (L2) solutio...,[[[['https://cryptorank.io/categories/blockcha...,2024,4,2
11,Deeplink,Undisclosed,2024-04-02,$ 8.00M,,"Gobi Partners, Bigger Than Race, DeepBrainChain",Deeplink,Blockchain Infrastructure,4600000.0,DeepLink emerges as a pioneering Web3 Cloud Ga...,[[[['https://cryptorank.io/categories/blockcha...,2024,4,2
16,Lockchain-Ai,Seed,2024-04-01,$ 4.60M,,"Lerer Hippeau, Arrington XRP Capital, White St...",Lockchain-Ai,Blockchain Service,8750000.0,Lockchain.ai is a pioneering risk management p...,[[[['https://cryptorank.io/categories/blockcha...,2024,4,1


In [84]:
input_df = data.iloc[list(test_data.index)]
input_df

Unnamed: 0,Name,Funding Round,First Funding Date,Raised Amount,Valuation Amount,Investors,Crypto Name,Service,Total Raised,Description,Links,First Funding Year,First Funding Month,First Funding Day
37,Peaq,Undisclosed,2024-03-27,$ 15.00M,,"Borderless Capital, Generative Ventures, The S...",Peaq,Chain,75000.0,Kima disrupts traditional interoperability sol...,"[[[['https://cryptorank.io/categories/defi'], ...",2024,3,27
38,Metablox,Strategic,2024-03-26,$ 5.00M,,"Volt Capital, Anagram Crypto, IoTeX, SNZ Holding",Metablox,Blockchain Service,3000000.0,MyPrize is a web3 gaming startup dedicated to ...,,2024,3,26
39,Kimafinance,Strategic,2024-03-26,,,,Kimafinance,DeFi,17000000.0,0G is a Modular AI Chain featuring a scalable ...,,2024,3,26
