In [13]:
# Given a text, this script extracts every sequence of words enclosed between
# two quotation marks or between two asterisks.

import re
import sys
import os
import pandas as pd
import numpy as np

def extract_quotes(text):
    """Return every substring of ``text`` enclosed in double quotation marks.

    Matching is non-greedy, requires at least one character between the
    marks, and never spans a line break. The surrounding quotation marks are
    not part of the returned strings.
    """
    quoted_span = re.compile(r'\"(.+?)\"')
    return [match.group(1) for match in quoted_span.finditer(text)]

def extract_asterisks(text):
    """Return every substring of ``text`` enclosed between two asterisks.

    Matching is non-greedy, requires at least one character between the
    asterisks, and never spans a line break. Note that doubled asterisks are
    not treated specially: for '**bold**' the result is ['*bold'].
    """
    starred_span = re.compile(r'\*(.+?)\*')
    found = []
    for match in starred_span.finditer(text):
        found.append(match.group(1))
    return found

# Closed set of liquidity-model labels an answer can be mapped to.
# extract_feature scans a list like this in order and returns the first label
# it finds, so the order of the entries is significant.
allowed_lm = [
    'Constant Product Market Maker (CPMM)',
    'Concentrated Liquidity (CL)',
    'synthetic Proactive Market Making (sPMM)',
    'Constant Sum Market Maker (CSMM)',
    'Constant Mean Market Maker (CMMM)',
    'Hybrid Constant Function Market Makers (CFMMs)',
    'Dynamic Automated Market Maker (DAMM)',
    'Proactive Market Maker (PMM)',
    'Virtual Automated Market Makers (vAMM)',
    'Other',
    'Cannot determine',
]

# Closed set of labels for the license question.
allowed_license = [
    'Yes',
    'No',
    'Cannot determine',
]

In [5]:
# Display the permitted liquidity-model labels.
allowed_lm

['Constant Product Market Maker (CPMM)',
 'Concentrated Liquidity (CL)',
 'synthetic Proactive Market Making (sPMM)',
 'Constant Sum Market Maker (CSMM)',
 'Constant Mean Market Maker (CMMM)',
 'Hybrid Constant Function Market Makers (CFMMs)',
 'Dynamic Automated Market Maker (DAMM)',
 'Proactive Market Maker (PMM)',
 'Virtual Automated Market Makers (vAMM)',
 'Other',
 'Cannot determine']

In [3]:
# Quick check of extract_quotes on a sample answer: the quoted model name
# should come back as the single extracted element.
txt = (
    '  Based on the provided context, the liquidity model employed by the DEX '
    'is a "Constant Sum Market Maker (CSMM)" model.'
)
extract_quotes(txt)

['Constant Sum Market Maker (CSMM)']

In [10]:
# Map a free-text answer onto one of the allowed feature labels.
# Labels are tried in list order and the first one present in the text wins.
# When no label is present the function returns None; no quote/asterisk
# fallback is applied here.

def extract_feature(text, allowed):
    """Return the first label of ``allowed`` that occurs in ``text``.

    A label only counts when it appears as a whole token, i.e. it is neither
    preceded nor followed by a word character. This stops short labels such
    as 'No' from matching inside words like 'Note', 'Nothing' or 'None'.
    Matching is case-sensitive.

    Parameters
    ----------
    text : str
        The answer text to inspect. An empty or ``None`` text yields ``None``.
    allowed : iterable of str
        Candidate labels, in priority order.

    Returns
    -------
    str or None
        The first matching label, or ``None`` when no label is found.
    """
    if not text:
        return None
    for feature in allowed:
        # Lookarounds are used instead of \b because labels may begin or end
        # with a non-word character such as ")", where \b would not hold.
        if re.search(r'(?<!\w)' + re.escape(feature) + r'(?!\w)', text):
            return feature
    return None

In [88]:
# For every DEX folder under answer_path and every model sub-folder, map each
# answer file of the 'liquidity_model' and 'license' questions onto an allowed
# label and upsert it into one Excel sheet per (model, answer file) pair.
answer_path = 'answers'
answers = os.listdir(answer_path)
save_path = 'extracted_features'

FEATURE_COLUMNS = ['dex_name', 'liquidity_model', 'license']


def _upsert_feature(excel_path, dex_name, column, value):
    """Write ``value`` into ``column`` for ``dex_name`` in the sheet at ``excel_path``.

    The sheet is created when it does not exist. An existing row for
    ``dex_name`` is updated in place; otherwise a new row is appended with
    the other feature column left empty.
    """
    new_row = {'dex_name': dex_name, 'liquidity_model': '', 'license': ''}
    new_row[column] = value

    if os.path.exists(excel_path):
        # dtype=object stops all-empty columns from being inferred as float64,
        # which would clash with the string labels assigned below.
        df = pd.read_excel(excel_path, dtype=object)
        if dex_name in df['dex_name'].values:
            df.loc[df['dex_name'] == dex_name, column] = value
        else:
            df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    else:
        df = pd.DataFrame([new_row], columns=FEATURE_COLUMNS)

    df.to_excel(excel_path, index=False)


def _extract_answers(dex_name, feature_dir, column, allowed, label):
    """Extract one feature for ``dex_name`` from every model's answer files.

    ``feature_dir`` holds one sub-folder per model, each containing answer
    text files. ``label`` is only used in the progress line that is printed.
    A missing ``feature_dir`` and stray non-directory entries are skipped.
    """
    if not os.path.isdir(feature_dir):
        return
    for model in sorted(os.listdir(feature_dir)):
        model_path = os.path.join(feature_dir, model)
        if not os.path.isdir(model_path):
            continue
        for txt_file in sorted(os.listdir(model_path)):
            with open(os.path.join(model_path, txt_file), 'r') as f:
                txt = f.read()
            feature = extract_feature(txt, allowed)
            print(f'{dex_name} - {model} - {txt_file} - {label}: {feature}')
            # splitext drops whatever extension the file has instead of
            # assuming it is exactly four characters long.
            stem = os.path.splitext(txt_file)[0]
            _upsert_feature(f'{save_path}/{model}_{stem}.xlsx', dex_name, column, feature)


# to_excel does not create missing folders.
os.makedirs(save_path, exist_ok=True)

# sorted() makes the processing order (and the printed log) reproducible.
for dex_name in sorted(answers):
    dex_path = os.path.join(answer_path, dex_name)
    if not os.path.isdir(dex_path):
        # Ignore stray files sitting next to the DEX folders.
        continue
    _extract_answers(dex_name, os.path.join(dex_path, 'liquidity_model'),
                     'liquidity_model', allowed_lm, 'liquidity model')
    _extract_answers(dex_name, os.path.join(dex_path, 'license'),
                     'license', allowed_license, 'license')

ApolloX - Llama-2-13B-chat-GPTQ - k_5_cs_500_co_100.txt - liquidity model: Constant Product Market Maker (CPMM)
ApolloX - Llama-2-70B-chat-GGUF - k_5_cs_500_co_100.txt - liquidity model: Constant Product Market Maker (CPMM)
ApolloX - Llama-2-7b-Chat-GGUF - k_5_cs_500_co_100.txt - liquidity model: Proactive Market Maker (PMM)
ApolloX - Mistral-7B-Instruct-v0.1-GGUF - k_5_cs_500_co_100.txt - liquidity model: None
ApolloX - Wizard-Vicuna-13B-Uncensored-GPTQ - k_5_cs_500_co_100.txt - liquidity model: None
ApolloX - Llama-2-13B-chat-GPTQ - k_5_cs_500_co_100.txt - license: Yes
ApolloX - Llama-2-70B-chat-GGUF - k_5_cs_500_co_100.txt - license: Yes
ApolloX - Llama-2-7b-Chat-GGUF - k_5_cs_500_co_100.txt - license: None
ApolloX - Mistral-7B-Instruct-v0.1-GGUF - k_5_cs_500_co_100.txt - license: Yes
ApolloX - Wizard-Vicuna-13B-Uncensored-GPTQ - k_5_cs_500_co_100.txt - license: Yes
Compound - Llama-2-13B-chat-GPTQ - k_5_cs_500_co_100.txt - liquidity model: Cannot determine
Compound - Llama-2-70B-ch

In [89]:
# Load the ground-truth labels from 'ground truth.xlsx' and display them.
ground_truth = pd.read_excel('ground truth.xlsx')
ground_truth

Unnamed: 0,dex_name,liquidity_model,license
0,Uniswap v3,Concentrated Liquidity (CL),No
1,PancakeSwap v3,Concentrated Liquidity (CL),Yes
2,Apollox,Other,Yes
3,Quickswap v3,Concentrated Liquidity (CL),Cannot determine
4,SpaceFi,Cannot determine,Cannot determine
5,YokaiSwap,Cannot determine,Yes
6,WOOFi,synthetic Proactive Market Making (sPMM),Yes
7,Orca,Concentrated Liquidity (CL),Yes
8,Jupiter,Cannot determine,Yes
9,dYdX,Other,Yes


In [90]:
import re
# Normalization pattern: matches runs of non-word characters (anything other
# than letters, digits and underscore) so they can be stripped from a string.
# Lower-casing is not done by this pattern; it is applied where the pattern is used.
pattern = re.compile(r'\W+')

In [93]:
# For each sheet in extracted_features, compare the extracted labels with the
# ground truth and report the accuracy per feature.

def _normalize_frame(frame):
    """Return ``frame`` with every cell turned into a lower-cased string stripped of non-word characters."""
    # Series.map is applied column by column because DataFrame.applymap is
    # deprecated in recent pandas releases. \W+ already removes spaces.
    return frame.apply(
        lambda column: column.map(lambda cell: pattern.sub('', str(cell).lower()))
    )


extracted_features_path = 'extracted_features'
extracted_features = os.listdir(extracted_features_path)

# Normalize every ground-truth column so that labels compare equal regardless
# of case, spacing or punctuation.
ground_truth = _normalize_frame(pd.read_excel('ground truth.xlsx'))
# Index by DEX name; the first row wins if a DEX is listed more than once.
gt_by_dex = ground_truth.drop_duplicates('dex_name').set_index('dex_name')

for file in sorted(extracted_features):
    # Only Excel sheets are evaluated; '~$' files are Excel lock files.
    if not file.endswith('.xlsx') or file.startswith('~$'):
        continue

    # Expected sheet layout:
    # dex_name, liquidity_model, license
    # 0x, CSMM, Yes
    df = _normalize_frame(pd.read_excel(os.path.join(extracted_features_path, file)))
    extracted_by_dex = df.drop_duplicates('dex_name').set_index('dex_name')

    liquidity_model_accuracy = []
    license_accuracy = []

    for dex_name in df['dex_name']:
        if dex_name not in gt_by_dex.index:
            # Without a reference label the answer cannot be scored.
            print(f'warning: {dex_name!r} is not in the ground truth; skipped')
            continue
        liquidity_model_accuracy.append(int(
            gt_by_dex.at[dex_name, 'liquidity_model']
            == extracted_by_dex.at[dex_name, 'liquidity_model']
        ))
        license_accuracy.append(int(
            gt_by_dex.at[dex_name, 'license']
            == extracted_by_dex.at[dex_name, 'license']
        ))

    # splitext keeps dots that belong to the name (e.g. "...-v0.1-GGUF_...").
    print(f'=========={os.path.splitext(file)[0]}==========')
    # Report nan explicitly instead of letting np.mean warn on an empty list.
    lm_score = np.mean(liquidity_model_accuracy) if liquidity_model_accuracy else float('nan')
    license_score = np.mean(license_accuracy) if license_accuracy else float('nan')
    print(f'liquidity model accuracy: {lm_score}')
    print(f'license accuracy: {license_score}')

liquidity model accuracy: 0.4
license accuracy: 0.6
liquidity model accuracy: 0.26666666666666666
license accuracy: 0.8666666666666667
liquidity model accuracy: 0.2
license accuracy: 0.6666666666666666
liquidity model accuracy: 0.2
license accuracy: 0.6
liquidity model accuracy: 0.0
license accuracy: 0.5333333333333333
