## Section 5.4: Information Used for Website Analysis

In [1]:
# Loading python modules
import glob
import json

In [2]:
# Action names that are tallied for the tool-usage table.
# Order is significant: calculate_actions() uses each name's position in this
# list as the index of its counter, and the table is printed in this order.
key_list = [
    'AccessURL', 'ExtractText', 'ExtractHyperlink',
    'GetSearchResult', 'SearchX/Twitter', 'SearchReddit',
    'RetrieveWHOIS', 'RetrieveDNSRecord', 'RetrieveCertificate',
]

In [3]:
 # Calculate actions from the analysis results
 def calculate_actions(path_list, keys=None):
     """Tally how often each action appears in a set of result files.

     Every file in ``path_list`` is parsed as JSON and iterated as a sequence
     of entries; the ``'action'`` value of each entry is collected.

     Args:
         path_list: Iterable of paths to JSON result files.
         keys: Action names to tally, in output order. When omitted, the
             notebook-level ``key_list`` is used.

     Returns:
         A ``(used_list, selected_list)`` pair of lists aligned with ``keys``:
         ``used_list[i]`` is the number of files in which action ``i`` occurs
         at least once, and ``selected_list[i]`` is the total number of times
         it occurs over all files. Actions that are not in ``keys`` are
         ignored.

     Raises:
         KeyError: If a dict entry in a file has no ``'action'`` key.
     """
     if keys is None:
         keys = key_list

     # First-occurrence index of every key, matching what list.index() would
     # return, but without rescanning the list for each action.
     slot_of = {}
     for position, name in enumerate(keys):
         slot_of.setdefault(name, position)

     # Size the counters from the key list rather than hard-coding nine zeros,
     # so adding or removing a key cannot cause an IndexError or a stale slot.
     used_list = [0] * len(keys)
     selected_list = [0] * len(keys)

     for path in path_list:
         # JSON interchange files are UTF-8; do not depend on the locale default.
         with open(path, 'r', encoding='utf-8') as file:
             data = json.load(file)
         actions = [dic['action'] for dic in data]

         # Per-file usage: each distinct action counts once per file.
         for action in set(actions):
             if action in slot_of:
                 used_list[slot_of[action]] += 1

         # Total selections: every occurrence counts.
         for action in actions:
             if action in slot_of:
                 selected_list[slot_of[action]] += 1
     return used_list, selected_list

In [4]:
# Count, per information type, how many reason strings mention at least one of its keywords
def count_keywords_separately(input_strings, keywords):
    """Count keyword hits per category and per individual keyword.

    Matching is plain substring containment (``keyword in input_string``).

    Args:
        input_strings: Iterable of texts to scan.
        keywords: List of keyword lists, one inner list per category.

    Returns:
        A ``(total_counts, counts)`` pair:
        ``total_counts[c]`` is the number of input strings in which category
        ``c`` was credited with at least one keyword;
        ``counts[c][k]`` is the number of input strings in which keyword ``k``
        of category ``c`` was credited.

    Notes:
        Empty keywords are skipped. Within one input string a given keyword
        text is credited only once, to the first position where it appears in
        ``keywords``; later duplicates of the same text are ignored for that
        string.
    """
    per_keyword = [[0 for _ in group] for group in keywords]
    per_category = [0 for _ in keywords]

    for text in input_strings:
        credited = set()  # keyword texts already counted for this string
        for cat, group in enumerate(keywords):
            category_hit = False
            for pos, word in enumerate(group):
                if not word or word not in text or word in credited:
                    continue
                credited.add(word)
                per_keyword[cat][pos] += 1
                category_hit = True
            # A category contributes at most one to its total per string.
            if category_hit:
                per_category[cat] += 1

    return per_category, per_keyword

In [5]:
# Table 9: Selected Information Types and Keywords
# Each information type is kept next to its keyword list so the two cannot
# drift out of step; the parallel lists used by the rest of the notebook are
# derived from this mapping (dicts preserve insertion order).
# All keywords are lower-case; they are matched by substring containment.
# NOTE(review): ' contact information' carries a leading space, so it only
# matches when preceded by a space - confirm whether that is intentional
# before changing it, since it affects the reported counts.
_keywords_by_information_type = {
    'Certificate Information': ['tls', 'certificate', 'https', 'ssl'],
    'Company Information': [
        'company information', 'non-existent companies',
        'non-existent company', 'physical address',
    ],
    'Contact Information': [
        'email', 'phone number', ' contact information', 'toll-free number',
    ],
    'Domain Name': ['whois', 'registrant', 'privacy service', 'domain', 'dns'],
    'Payment Method': ['payment', 'bitcoin', 'cryptocurrency'],
    'Privacy Information': [
        'privacy policy', 'privacy notation',
        'privacy policies', 'privacy protection',
    ],
    'Social Engineering': [
        'psychological', 'lure', 'urgency', 'unrealistic',
        'phishing tactic', 'scam tactic', 'short timeframe',
    ],
    'Unusual Price': [
        'low price', 'discounts', 'free items', 'high return',
        'guaranteed returns', 'free delivery', 'free shipping',
    ],
    'User Review': [
        'social media', 'feedback', 'review', 'twitter', 'reddit',
        'complaint', 'report', 'discussion', 'forum', 'low trust score',
        'negative', 'indicators', 'social platforms',
    ],
    'Website Status': ['update', 'copyright', 'outdated', 'up-to-date'],
}
information_type = list(_keywords_by_information_type.keys())
keywords = list(_keywords_by_information_type.values())

In [6]:
# Table 7: Number of Tools Selected and Usage per LLM

# (display label, result directory) for each evaluated model, in print order.
model_results = [
    ('GPT-4', './gpt-4_results'),
    ('GPT-3.5', './gpt-3.5_results'),
    ('Gemini Pro', './geminipro_results'),
]

for model_name, result_dir in model_results:
    path_list = glob.glob(result_dir + '/classification_accuracy/*/*/*.json')
    used_list, selected_list = calculate_actions(path_list)
    print('----------' + model_name + '----------')
    if not path_list:
        # An empty match (e.g. notebook started from the wrong directory)
        # would otherwise surface as a ZeroDivisionError below.
        print('No result files found under', result_dir)
        continue
    # Columns: tool name, total number of selections, percentage of result
    # files in which the tool appears at least once.
    for key, selected, used in zip(key_list, selected_list, used_list):
        print(key, selected, used/len(path_list)*100)

----------GPT-4----------
AccessURL 2724 99.95833333333334
ExtractText 2723 99.0
ExtractHyperlink 1018 40.458333333333336
GetSearchResult 1797 68.04166666666667
SearchX/Twitter 1060 43.583333333333336
SearchReddit 1545 63.541666666666664
RetrieveWHOIS 2479 99.5
RetrieveDNSRecord 617 25.458333333333332
RetrieveCertificate 1276 52.87500000000001
----------GPT-3.5----------
AccessURL 2417 99.0
ExtractText 2398 95.33333333333334
ExtractHyperlink 281 11.25
GetSearchResult 51 2.125
SearchX/Twitter 22 0.8750000000000001
SearchReddit 12 0.5
RetrieveWHOIS 1128 45.83333333333333
RetrieveDNSRecord 51 2.125
RetrieveCertificate 85 3.5416666666666665
----------Gemini Pro----------
AccessURL 2471 80.95833333333333
ExtractText 3406 90.25
ExtractHyperlink 1088 34.458333333333336
GetSearchResult 552 17.708333333333336
SearchX/Twitter 270 9.833333333333332
SearchReddit 196 7.166666666666667
RetrieveWHOIS 419 16.416666666666664
RetrieveDNSRecord 129 5.0
RetrieveCertificate 83 3.3333333333333335


In [7]:
# Table 8: Information in Reasons for Website Decision

# (display label, result directory) for each evaluated model, in print order.
model_results = [
    ('GPT-4', './gpt-4_results'),
    ('GPT-3.5', './gpt-3.5_results'),
    ('Gemini Pro', './geminipro_results'),
]

for model_name, result_dir in model_results:
    path_list = glob.glob(result_dir + '/classification_accuracy/*/*/*.json')
    reason_list = list()
    for path in path_list:
        # JSON interchange files are UTF-8; do not depend on the locale default.
        with open(path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        # Take the reason text from the last entry of the file and lower-case
        # it, because the keyword lists are written in lower case.
        reason_list.append(data[-1]['observation']['reason'].lower())
    total_counts, detailed_counts = count_keywords_separately(reason_list, keywords)

    print('----------' + model_name + '----------')
    if not reason_list:
        # An empty match (e.g. notebook started from the wrong directory)
        # would otherwise surface as a ZeroDivisionError below.
        print('No result files found under', result_dir)
        continue
    # Columns: information type, number of reasons mentioning it, percentage
    # of all reasons for this model.
    for information, count in zip(information_type, total_counts):
        print(information, count, count/len(reason_list)*100)

----------GPT-4----------
Certificate Information 770 32.083333333333336
Company Information 307 12.791666666666668
Contact Information 621 25.874999999999996
Domain Name 1866 77.75
Payment Method 339 14.124999999999998
Privacy Information 379 15.791666666666668
Social Engineering 796 33.166666666666664
Unusual Price 1104 46.0
User Review 1544 64.33333333333333
Website Status 294 12.25
----------GPT-3.5----------
Certificate Information 43 1.7916666666666667
Company Information 405 16.875
Contact Information 227 9.458333333333334
Domain Name 952 39.666666666666664
Payment Method 315 13.125
Privacy Information 229 9.541666666666666
Social Engineering 279 11.625
Unusual Price 686 28.583333333333332
User Review 52 2.166666666666667
Website Status 165 6.875000000000001
----------Gemini Pro----------
Certificate Information 28 1.1666666666666667
Company Information 87 3.6249999999999996
Contact Information 233 9.708333333333332
Domain Name 157 6.541666666666666
Payment Method 81 3.375
Priva