Author: William Murphy

Date: 1/13/2024

Urgent Task given by Vinod:

Find all information regarding 2 images:
- 49251693 Snapchat.png
- 49251694 Messenger.png

For each image, find how many users trusted it, how many did not trust it, and which phishing cues users marked on it.

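Findings: 
All users who saw these images trusted them (15 of 15 for 49251693 and 16 of 16 for 49251694), and no user marked any phishing cue on either image.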

In [14]:
import psycopg2 #for database connection
import pandas as pd 
import sqlalchemy
import numpy as np
import matplotlib.pyplot as plt # for plotting
import json

# Kruskal-Wallis analysis of variance
import scipy.stats as ss # For Kruskal-Wallis test
import scikit_posthocs as sp #For post hoc tests. 
from termcolor import colored # for coloring the print text
import warnings # to ignore plot warnings

# For ggplot
from plotnine import *

# For bolding the printed text
from termcolor import colored

In [15]:
# Define database
# Every setting can be overridden through an environment variable so real
# credentials never need to be written into the notebook. The fallbacks are
# the local development values this notebook previously hard-coded.
import os

hostname = os.environ.get("PHISH_DB_HOST", "localhost")
database = os.environ.get("PHISH_DB_NAME", "phishdatabase")
username = os.environ.get("PHISH_DB_USER", "postgres")
password = os.environ.get("PHISH_DB_PASSWORD", "postgres")
port = os.environ.get("PHISH_DB_PORT", "5432")

In [16]:
# Create Connection
# Only database errors are caught: a bare `except:` would also swallow
# KeyboardInterrupt and programming mistakes. The underlying reason is printed
# and the error is re-raised so a failed connection stops the run here instead
# of surfacing in a later cell as a NameError on `connection`.
try:
    connection = psycopg2.connect(
        host=hostname, user=username, password=password, dbname=database, port=port
    )
except psycopg2.Error as error:
    print(f"I am unable to connect to the database: {error}")
    raise

In [17]:
# Notebook display settings: show every column, but at most 10 rows per frame.
display_settings = {
    'display.max_columns': None,
    'display.max_rows': 10,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [18]:
# Fetch every annotation recorded for task T3: one row per annotation, holding
# the subject it belongs to and the answer value stored for it.
sql = """select c.subject_ids as subject_id, a.annotations->>'value' as zoo_trust 
       from zooniverse_phish_classifications as c, 
    jsonb_array_elements(c.annotations) as a(annotations)
    where a.annotations->>'task' = 'T3'"""

# Load the rows and cast subject_id to int64 in one step.
classificationsession = pd.read_sql_query(sql, connection).astype({'subject_id': 'int64'})

# True only when the answer is exactly "Nothing Phishy Here"; every other
# value, including a missing answer, becomes False.
classificationsession['zoo_trust_b'] = classificationsession['zoo_trust'].eq("Nothing Phishy Here")

classificationsession



Unnamed: 0,subject_id,zoo_trust,zoo_trust_b
0,43857001,Nothing Phishy Here,True
1,43857012,Nothing Phishy Here,True
2,43857072,Nothing Phishy Here,True
3,43857007,Nothing Phishy Here,True
4,43857095,,False
...,...,...,...
31364,52844553,,False
31365,52846983,,False
31366,52846998,,False
31367,52846937,,False


In [19]:
# Count, per subject, how many T3 answers were flagged as trust
# (zoo_trust_b is True) and how many were not (zoo_trust_b is False).
# The two indicator columns are summed with a vectorised groupby rather than a
# per-group Python lambda through groupby.apply, which is slow and warns about
# operating on the grouping column in recent pandas releases.
z_trust_df = (
    classificationsession
    .assign(
        z_trust=lambda frame: frame['zoo_trust_b'],
        z_no_trust=lambda frame: ~frame['zoo_trust_b'],
    )
    .groupby('subject_id')[['z_trust', 'z_no_trust']]
    .sum()
    .astype('int64')
    .reset_index()
)

z_trust_df

Unnamed: 0,subject_id,z_trust,z_no_trust
0,43856939,9,6
1,43856944,5,11
2,43856946,9,7
3,43856947,4,12
4,43856950,15,2
...,...,...,...
2022,52847471,8,7
2023,52847473,8,7
2024,52847474,15,0
2025,52847476,11,4


Trust and No Trust on the Images

In [20]:
# Everyone that saw the images trusted them
# NOTE: this rebinds z_trust_df to just the two images of interest, so every
# later cell only sees these rows.
# comment out this cell for bottom cells to show
image_ids = [49251693, 49251694]
is_target_image = z_trust_df['subject_id'].isin(image_ids)
z_trust_df = z_trust_df[is_target_image]
z_trust_df

Unnamed: 0,subject_id,z_trust,z_no_trust
118,49251693,15,0
119,49251694,16,0


In [21]:
# Fetch every annotation recorded for task T0: one row per annotation, holding
# the subject it belongs to and the raw answer value. The next cell parses
# that value as a JSON list of marks.
sql = """select c.subject_ids as subject_id, a.annotations->>'value' as user_answers 
       from zooniverse_phish_classifications as c, 
    jsonb_array_elements(c.annotations) as a(annotations)
    where a.annotations->>'task' = 'T0'
"""

zooniverseclassification = pd.read_sql_query(sql=sql, con=connection)
zooniverseclassification



Unnamed: 0,subject_id,user_answers
0,43857074,"[{""x"": 655.9531860351562, ""y"": 339.36654663085..."
1,43857087,"[{""x"": 184.76377868652344, ""y"": 28.77332878112..."
2,43856989,"[{""x"": 120.6994857788086, ""y"": 24.617567062377..."
3,43857081,"[{""x"": 737.415771484375, ""y"": 352.486724853515..."
4,43857096,"[{""x"": 1083.43896484375, ""y"": 514.370239257812..."
...,...,...
12738,49251702,"[{""x"": 823.7265014648438, ""y"": 354.96710205078..."
12739,49251698,"[{""x"": 746.512451171875, ""y"": 352.313629150390..."
12740,52846951,"[{""x"": 947.9085693359375, ""y"": 80.652969360351..."
12741,52844558,"[{""x"": 247.52639770507812, ""y"": 497.6387939453..."


In [22]:
# Flatten the T0 answers into one row per marked cue.
# Each user_answers value is a JSON list of marks, and every mark carries a
# "tool_label" naming the cue. The rows are collected in a plain list and the
# DataFrame is built once at the end: appending with `.loc[len(...)]` inside
# the loop re-allocates the frame on every row, which is quadratic in the
# number of marks.
cue_records = []
answer_pairs = zip(
    zooniverseclassification["subject_id"],
    zooniverseclassification["user_answers"],
)

# user_labeled_image is the position of the classification row the mark came from.
for user_labeled_image, (subject_id, current_string) in enumerate(answer_pairs):
    for item in json.loads(current_string):
        cue_records.append({
            "user_labeled_image": user_labeled_image,
            "subject_id": subject_id,
            "cue": item["tool_label"],
        })

# Passing `columns` keeps the expected schema even when no marks were found.
usercues = pd.DataFrame(cue_records, columns=["user_labeled_image", "subject_id", "cue"])

usercues

Unnamed: 0,user_labeled_image,subject_id,cue
0,0,43857074,Invalid Domain or Sender
1,0,43857074,Potentially Malicious Link
2,0,43857074,Appeal to Action-Authority
3,1,43857087,Invalid Domain or Sender
4,1,43857087,Poor Spelling or Grammar
...,...,...,...
23572,12740,52846951,Invalid Domain or Sender
23573,12740,52846951,Invalid Domain or Sender
23574,12740,52846951,Invalid Domain or Sender
23575,12740,52846951,Invalid Domain or Sender


In [23]:
# Count, per subject, how many times each cue label was marked.
# The mapping is output column name -> cue label as it appears in usercues.
# One boolean indicator column is built per label and summed with a vectorised
# groupby. A label that never occurs still gets a column of zeros, so the
# output schema is fixed. This replaces a per-group Python lambda through
# groupby.apply, which is slow and warns about operating on the grouping
# column in recent pandas releases.
cue_labels = {
    "Invalid_Domain_or_Sender": "Invalid Domain or Sender",
    "Potent_Mal_Links": "Potentially Malicious Link",
    "Spelling_or_Grammar": "Poor Spelling or Grammar",
    "Appeal_to_Greed": "Appeal to Action-Greed",
    "Appeal_to_Urgency": "Appeal to Action-Urgency",
    "Appeal_to_Authority": "Appeal to Action-Authority",
    "Other_Phishy_Findings": "Other Phishy Findings",
}

cue_flags = pd.DataFrame(
    {column: usercues["cue"].eq(label) for column, label in cue_labels.items()}
)

sortedcues = (
    cue_flags
    .groupby(usercues["subject_id"])
    .sum()
    .astype("int64")
    .reset_index()
)

sortedcues

Unnamed: 0,subject_id,Invalid_Domain_or_Sender,Potent_Mal_Links,Spelling_or_Grammar,Appeal_to_Greed,Appeal_to_Urgency,Appeal_to_Authority,Other_Phishy_Findings
0,43856939,1,0,0,5,0,0,1
1,43856944,9,1,0,5,0,0,6
2,43856946,1,0,0,4,0,0,4
3,43856947,8,1,2,0,0,0,0
4,43856950,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
1693,52847470,0,0,6,0,0,0,5
1694,52847471,0,0,5,0,0,0,7
1695,52847473,3,0,6,0,0,1,2
1696,52847476,0,0,0,0,0,0,3


In [24]:
# Cast subject_id to int64, then display the resulting column dtypes to confirm the cast.
sortedcues = sortedcues.astype({"subject_id": "int64"})
sortedcues.dtypes

subject_id                  int64
Invalid_Domain_or_Sender    int64
Potent_Mal_Links            int64
Spelling_or_Grammar         int64
Appeal_to_Greed             int64
Appeal_to_Urgency           int64
Appeal_to_Authority         int64
Other_Phishy_Findings       int64
dtype: object

In [25]:
# Inner join on subject_id: only subjects present in both frames are kept.
# The two images were not in the sortedcues df, so the merge is empty
data_df = z_trust_df.merge(sortedcues, on="subject_id")
data_df

Unnamed: 0,subject_id,z_trust,z_no_trust,Invalid_Domain_or_Sender,Potent_Mal_Links,Spelling_or_Grammar,Appeal_to_Greed,Appeal_to_Urgency,Appeal_to_Authority,Other_Phishy_Findings


Cue Check

In [26]:
# Restrict the merged trust/cue table to the two images of interest.
# Table is empty because no one put any cues
image_ids = [49251693, 49251694]
is_target_image = data_df['subject_id'].isin(image_ids)
data_df = data_df[is_target_image]
data_df

Unnamed: 0,subject_id,z_trust,z_no_trust,Invalid_Domain_or_Sender,Potent_Mal_Links,Spelling_or_Grammar,Appeal_to_Greed,Appeal_to_Urgency,Appeal_to_Authority,Other_Phishy_Findings
