Author: William Murphy

Date Updated: 01/10/2023

Analysis Task 3 

Identify instances where the tagged areas overlap between other phishy findings and any of the other labels, which tags are most commonly overlapping with OPF, relative frequencies of each (OPF vs label).


In [19]:
# Use this to download any necessary modules
#import sys
#!{sys.executable} -m pip install psycopg2
#!{sys.executable} -m pip install pandas
#!{sys.executable} -m pip install sqlalchemy
#!{sys.executable} -m pip install numpy
#!{sys.executable} -m pip install matplotlib
#!{sys.executable} -m pip install scipy
#!{sys.executable} -m pip install scikit_posthocs
#!{sys.executable} -m pip install termcolor
#!{sys.executable} -m pip install warnings
#!{sys.executable} -m pip install plotnine

In [20]:
import psycopg2 #for database connection
import pandas as pd 
import sqlalchemy
import numpy as np
import matplotlib.pyplot as plt # for plotting
import json

In [21]:
# Define database
hostname = 'localhost'
username = 'postgres'
password = 'postgres'
database = 'phishdatabase'
port = 5432

In [22]:
# Create Connection
try:
    connection = psycopg2.connect( host=hostname, user=username, password=password, dbname=database, port=port )
    
    
except:
    print("I am unable to connect to the database")

In [23]:
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [24]:
# Step 1
# Getting gold standard images
gold_std = pd.read_sql_query("select subject_id, filename, malicious, gold_std from cybertrust_zooniverse_datamatch where gold_std is true", connection)
gold_std

  gold_std = pd.read_sql_query("select subject_id, filename, malicious, gold_std from cybertrust_zooniverse_datamatch where gold_std is true", connection)


Unnamed: 0,subject_id,filename,malicious,gold_std
0,43857918,BoA2(1).jpg,True,True
1,43861756,GoogleDoc.jpg,True,True
2,43857092,UPRailRoad.png,False,True
3,43857086,TaxInformationEmail.png,True,True
4,43857079,Paypal.png,True,True
...,...,...,...,...
27,43857066,ebay.png,True,True
28,43857064,Dropbox.png,True,True
29,43857063,Discover.png,True,True
30,43857061,craigslist.png,True,True


In [25]:
# Step 2
# Getting subject_id and raw json strings that hold the user's bounding boxes
# The code technically gets all the entries where a user select that an image looked fishy.
# It does this by getting the subject id and the bounding boxes drawn by a user for that subject id.

# If you see an entry in the table below that has "[]" in the user_answer column,
# that would be an instance where the user answered that the image was phishy, but did not draw any bounding boxes

sql = """select c.subject_ids as subject_id, a.annotations->>'value' as user_answers 
       from zooniverse_phish_classifications as c, 
    jsonb_array_elements(c.annotations) as a(annotations)
    where a.annotations->>'task' = 'T0'
"""

zooniverseclassification = pd.read_sql_query(sql, connection)
zooniverseclassification

  zooniverseclassification = pd.read_sql_query(sql, connection)


Unnamed: 0,subject_id,user_answers
0,43857074,"[{""x"": 655.9531860351562, ""y"": 339.36654663085..."
1,43857087,"[{""x"": 184.76377868652344, ""y"": 28.77332878112..."
2,43856989,"[{""x"": 120.6994857788086, ""y"": 24.617567062377..."
3,43857081,"[{""x"": 737.415771484375, ""y"": 352.486724853515..."
4,43857096,"[{""x"": 1083.43896484375, ""y"": 514.370239257812..."
...,...,...
12738,49251702,"[{""x"": 823.7265014648438, ""y"": 354.96710205078..."
12739,49251698,"[{""x"": 746.512451171875, ""y"": 352.313629150390..."
12740,52846951,"[{""x"": 947.9085693359375, ""y"": 80.652969360351..."
12741,52844558,"[{""x"": 247.52639770507812, ""y"": 497.6387939453..."


In [26]:
#Creating a new dataframe 
userboundingboxpos = pd.DataFrame(columns=["user_labeled_image", "subject_id","x_pos", "y_pos", "width", "height", "cue"])

# Sorting the data store in zooniverse classifications into the userboundingboxpos dataframe
for user_labeled_image in range(0, len(zooniverseclassification["user_answers"])):
    current_string = zooniverseclassification["user_answers"][user_labeled_image]
    json_object = json.loads(current_string)
    
    # Holds the cues that a person put on a single image during their session
    # temp_list holds a tuple
    # the tuple are structured (cue type, x_pos, y_pos, width, height)
    temp_list = []
    cue_is_unique = True

    for item in json_object:
        

        for cue in range(0, len(temp_list)):
                
            # This compares the current cue's type being added with cues' that have already been added to the dataframe from the user
            # Checking to see if the cue's are of the same type
            if item["tool_label"] == temp_list[cue][0]:
                # if the cues are of the same type, check that they are intersecting by Separating axis test
                if (
                    (item['x'] > (temp_list[cue][1] + temp_list[cue][3])) and 
                    (temp_list[cue][1] > (item['x'] + item['width'])) and
                    (item['y'] > (temp_list[cue][2] + temp_list[cue][4])) and
                    (temp_list[cue][2] > (item['y'] + item['height']))
                ):
                    cue_is_unique = False
        
        # adds the cue to the data frame if it is unique        
        if cue_is_unique:
            temp_list.append((item["tool_label"], item["x"], item["y"], item["width"],item["height"]))
            
            data_input = {"user_labeled_image": user_labeled_image, 
                            "subject_id": zooniverseclassification["subject_id"][user_labeled_image], 
                            "x_pos": item["x"],
                            "y_pos": item["y"],
                            "width": item["width"],
                            "height": item["height"],
                            "cue": item["tool_label"]}   
        
            userboundingboxpos.loc[len(userboundingboxpos.index)] = data_input

userboundingboxpos

Unnamed: 0,user_labeled_image,subject_id,x_pos,y_pos,width,height,cue
0,0,43857074,655.953186,339.366547,642.218933,66.584595,Invalid Domain or Sender
1,0,43857074,686.023621,878.486816,650.810608,68.732483,Potentially Malicious Link
2,0,43857074,683.875732,1146.973022,2117.819092,88.063477,Appeal to Action-Authority
3,1,43857087,184.763779,28.773329,264.872269,53.605089,Invalid Domain or Sender
4,1,43857087,1349.886353,1075.649292,171.851685,69.371338,Poor Spelling or Grammar
...,...,...,...,...,...,...,...
23572,12740,52846951,211.278152,421.187714,390.718796,120.083344,Invalid Domain or Sender
23573,12740,52846951,336.738312,562.778503,688.238739,86.029846,Invalid Domain or Sender
23574,12740,52846951,852.917297,727.669006,189.982605,60.937805,Invalid Domain or Sender
23575,12740,52846951,198.732132,750.968750,146.967636,43.014954,Invalid Domain or Sender
