In [11]:
import psycopg2 #for database connection
import pandas as pd 
import sqlalchemy
import numpy as np
import matplotlib.pyplot as plt # for plotting
import json

# Kruskal-Wallis analysis of variance
import scipy.stats as ss # For Kruskal-Wallis test
import scikit_posthocs as sp #For post hoc tests. 
from termcolor import colored # for coloring the print text
import warnings # to ignore plot warnings

# For ggplot
from plotnine import *

# For bolding the printed text
from termcolor import colored

In [12]:
# Define database
hostname = 'localhost'
# f = open("Username.txt", "r")
# username = f.read()
# password = f.read()
database = 'phishdatabase'
username = 'postgres'
password = 'postgres'
port = "5432"

In [13]:
# Create Connection
try:
    connection = psycopg2.connect( host=hostname, user=username, password=password, dbname=database, port=port )
    
    
except:
    print("I am unable to connect to the database")

In [14]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', 10)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)

In [15]:
# Step 1
# Getting gold standard images
gold_std = pd.read_sql_query("select subject_id, task_id, filename, malicious, gold_std from cybertrust_zooniverse_datamatch where gold_std is true", connection)
gold_std



Unnamed: 0,subject_id,task_id,filename,malicious,gold_std
0,43857918,72,BoA2(1).jpg,True,True
1,43861756,27,GoogleDoc.jpg,True,True
2,43857092,40,UPRailRoad.png,False,True
3,43857086,9,TaxInformationEmail.png,True,True
4,43857079,54,Paypal.png,True,True
...,...,...,...,...,...
27,43857066,56,ebay.png,True,True
28,43857064,26,Dropbox.png,True,True
29,43857063,68,Discover.png,True,True
30,43857061,55,craigslist.png,True,True


In [16]:
# Step 2
# Getting Professionals Cues
Professionals_cues_email = pd.read_sql_query("select * from Professional_Cue_Mapping_email", connection)
Professionals_cues_email

Professionals_cues_webitem= pd.read_sql_query("select * from Professional_Cue_Mapping_webitem", connection)
Professionals_cues_webitem




Unnamed: 0,task_id,content_id,cue_id,cue_name,description
0,19,46,2,Malicious Links,M - Reviews - DigitalTrends
1,19,46,3,Invalid Domain or Sender,M - Reviews - DigitalTrends
2,19,46,2,Malicious Links,M - Reviews - DigitalTrends
3,19,46,2,Malicious Links,M - Reviews - DigitalTrends
4,19,46,2,Malicious Links,M - Reviews - DigitalTrends
...,...,...,...,...,...
604,78,75,3,Invalid Domain or Sender,M - First National
605,78,75,2,Malicious Links,M - First National
606,78,75,2,Malicious Links,M - First National
607,78,75,2,Malicious Links,M - First National


In [17]:
email_Professional_cues = Professionals_cues_email.groupby('task_id').apply(lambda x: pd.Series(
               dict(Spelling_Grammer=(x.cue_id ==  1).sum(),
                     Malicious_Links=(x.cue_id == 2).sum(),
                     Domain_Sender = (x.cue_id == 3).sum(),
                     Authority = (x.cue_id == 4).sum(),
                     Greed = (x.cue_id == 6).sum(),
                     Urgency = (x.cue_id == 7).sum()
                     ))).reset_index('task_id')
email_Professional_cues = pd.DataFrame(email_Professional_cues)

webitem_Professional_cues = Professionals_cues_webitem.groupby('task_id').apply(lambda x: pd.Series(
               dict(Spelling_Grammer=(x.cue_id ==  1).sum(),
                     Malicious_Links=(x.cue_id == 2).sum(),
                     Domain_Sender = (x.cue_id == 3).sum(),
                     Authority = (x.cue_id == 4).sum(),
                     Greed = (x.cue_id == 6).sum(),
                     Urgency = (x.cue_id == 7).sum()
                     ))).reset_index('task_id')
webitem_Professional_cues = pd.DataFrame(webitem_Professional_cues)



In [18]:
webitem_Professional_cues
email_Professional_cues

Unnamed: 0,task_id,Spelling_Grammer,Malicious_Links,Domain_Sender,Authority,Greed,Urgency
0,9,3,3,3,0,0,0
1,10,3,0,3,0,0,3
2,11,19,0,0,0,3,6
3,12,0,6,3,0,0,0
4,13,3,3,3,0,0,0
...,...,...,...,...,...,...,...
22,65,0,17,9,0,2,0
23,66,9,3,3,0,0,3
24,67,0,15,3,0,0,3
25,68,0,21,3,0,0,0


In [21]:
pd.set_option('display.max_rows', 20)
Email_gold_std = pd.merge(gold_std, email_Professional_cues, on='task_id')
Webitem_gold_std = pd.merge(gold_std, webitem_Professional_cues, on='task_id')
Email_gold_std



Unnamed: 0,subject_id,task_id,filename,malicious,gold_std,Spelling_Grammer,Malicious_Links,Domain_Sender,Authority,Greed,Urgency
0,43861756,27,GoogleDoc.jpg,True,True,5,6,4,2,0,3
1,43857086,9,TaxInformationEmail.png,True,True,3,3,3,0,0,0
2,43857103,67,VirusSpyware.png,True,True,0,15,3,0,0,3
3,43857096,30,USPSGmail.png,True,True,0,3,3,0,0,3
4,43857095,31,USPS.png,True,True,3,9,3,0,0,3
5,43857089,24,Tumblr.png,True,True,0,6,3,0,0,4
6,43857081,23,Sprint.png,True,True,0,27,3,0,3,0
7,43857074,13,IRS.png,True,True,3,3,3,0,0,0
8,43857069,66,fedex.png,True,True,9,3,3,0,0,3
9,43857064,26,Dropbox.png,True,True,5,6,3,2,0,2


In [20]:
Webitem_gold_std

Unnamed: 0,subject_id,task_id,filename,malicious,gold_std,Spelling_Grammer,Malicious_Links,Domain_Sender,Authority,Greed,Urgency
0,43857918,72,BoA2(1).jpg,True,True,0,74,3,0,1,2
1,43857079,54,Paypal.png,True,True,0,5,3,0,0,0
2,43857104,49,whatsup.png,True,True,45,27,3,0,3,0
3,43857087,21,TheWallStreetJournal.png,True,True,9,20,3,0,0,0
4,43857066,56,ebay.png,True,True,0,16,3,0,0,0
5,43857061,55,craigslist.png,True,True,9,39,3,0,0,0
6,43857060,57,Box.png,True,True,0,14,3,0,0,3
