In [2]:
#!/usr/bin/env python
import os.path, math, string, sys

import scurve
from scurve import progress, utils, draw
from PIL import Image, ImageDraw

In [3]:
class _Color:
    """Base colour source: turns an offset into ``data`` into a colour.

    Subclasses provide ``getPoint(x)``. ``block`` is either ``None`` or a
    ``(start, end, colour)`` triple; offsets with ``start <= x < end`` get
    ``colour`` instead of the subclass colour.
    """

    def __init__(self, data, block):
        self.data = data
        self.block = block
        # Rank each distinct symbol of the data by its sorted position.
        self.symbol_map = {}
        for rank, symbol in enumerate(sorted(set(data))):
            self.symbol_map[symbol] = rank

    def __len__(self):
        """Number of symbols in the underlying data."""
        return len(self.data)

    def point(self, x):
        """Colour for offset ``x``: the block colour inside the marked block,
        otherwise whatever ``getPoint`` returns."""
        marked = self.block
        if marked and (marked[0] <= x < marked[1]):
            return marked[2]
        return self.getPoint(x)

In [4]:
class ColorGradient(_Color):
    """Greyscale colour source driven by each symbol's numeric value."""

    def getPoint(self, x):
        """Return a grey ``[v, v, v]`` triple for the symbol at offset ``x``.

        The symbol value is scaled by 1/255 and back to an integer channel.
        Works for both ``str`` data (one-character strings, converted with
        ``ord``) and ``bytes`` data (whose elements are already integers in
        Python 3, where ``ord`` would raise TypeError).
        """
        symbol = self.data[x]
        value = symbol if isinstance(symbol, int) else ord(symbol)
        c = value/255.0
        return [
            int(255*c),
            int(255*c),
            int(255*c)
        ]

In [5]:
class ColorHilbert(_Color):
    """Colour source that assigns each distinct symbol a point on a curve
    obtained from ``scurve.fromSize("hilbert", 3, 256**3)``.

    NOTE(review): presumably the curve walks the 256x256x256 RGB cube, so the
    returned point is used directly as a colour - confirm against scurve.
    """

    def __init__(self, data, block):
        super().__init__(data, block)
        curve = scurve.fromSize("hilbert", 3, 256**3)
        self.csource = curve
        # Distance along the curve between two consecutive symbol ranks.
        self.step = len(curve)/float(len(self.symbol_map))

    def getPoint(self, x):
        """Return the curve point for the rank of the symbol at offset ``x``."""
        rank = self.symbol_map[self.data[x]]
        position = int(rank*self.step)
        return self.csource.point(position)

In [6]:
class ColorClass(_Color):
    """Colour source that paints each symbol according to its byte class."""

    def getPoint(self, x):
        """Return the class colour for the symbol at offset ``x``.

        0 -> [0, 0, 0]; 255 -> [255, 255, 255]; characters found in
        ``string.printable`` -> [55, 126, 184]; anything else -> [228, 26, 28].

        Accepts ``str`` data (converted with ``ord``) as well as ``bytes``
        data, whose elements are already integers in Python 3 and would make
        ``ord`` raise TypeError.
        """
        symbol = self.data[x]
        c = symbol if isinstance(symbol, int) else ord(symbol)
        if c == 0:
            return [0, 0, 0]
        elif c == 255:
            return [255, 255, 255]
        elif chr(c) in string.printable:
            return [55, 126, 184]
        return [228, 26, 28]

In [7]:
class ColorEntropy(_Color):
    """Colour source driven by the value of ``utils.entropy`` around each offset."""

    def getPoint(self, x):
        """Return ``[red, 0, blue]`` for offset ``x``.

        NOTE(review): ``utils.entropy(data, 32, x, n_symbols)`` is presumably
        a local entropy in the range 0..1 - confirm against scurve.utils.
        Blue grows with the square of that value; red is zero up to 0.5 and
        follows the curve plotted at the URL below above it.
        """
        e = utils.entropy(self.data, 32, x, len(self.symbol_map))
        # http://www.wolframalpha.com/input/?i=plot+%284%28x-0.5%29-4%28x-0.5%29**2%29**4+from+0.5+to+1
        if e > 0.5:
            v = e-0.5
            r = max((4*v - 4*v**2)**4, 0)
        else:
            r = 0
        b = e**2
        return [int(255*r), 0, int(255*b)]

In [8]:
def drawmap_unrolled(map, size, csource, name, prog):
    """Render ``csource`` as a ``size`` x ``4*size`` image made of four stacked
    squares, each laid out along the curve named by ``map``, and save it.

    map     -- curve name handed to ``scurve.fromSize``
    size    -- image width in pixels; each of the four squares is size x size
    csource -- colour source providing ``len()`` and ``point(offset)``
    name    -- destination passed to the image's ``save``
    prog    -- progress reporter providing ``set_target`` and ``tick``
    """
    quad_pixels = size**2
    prog.set_target(quad_pixels*4)
    curve = scurve.fromSize(map, 2, quad_pixels)
    image = Image.new("RGB", (size, size*4))
    canvas = ImageDraw.Draw(image)
    # Data symbols per pixel, so the whole input is spread over all four squares.
    step = len(csource)/float(len(curve)*4)

    drawn = 0
    for quad in range(4):
        first_offset = quad * quad_pixels
        y_shift = size * quad
        for i, p in enumerate(curve):
            rgb = csource.point(int((i + first_offset) * step))
            x, y = tuple(p)
            canvas.point((x, y + y_shift), fill=tuple(rgb))
            # Report progress once every 100 pixels.
            if drawn % 100 == 0:
                prog.tick(drawn)
            drawn += 1
    image.save(name)

In [9]:
def drawmap_square(map, size, csource, name, prog):
    """Render ``csource`` as a single square laid out along the curve named by
    ``map`` and save it to ``name``.

    map     -- curve name handed to ``scurve.fromSize``
    size    -- side length used to request a curve of ``size**2`` points
    csource -- colour source providing ``len()`` and ``point(offset)``
    name    -- destination passed to the image's ``save``
    prog    -- progress reporter providing ``set_target`` and ``tick``
    """
    prog.set_target((size**2))
    curve = scurve.fromSize(map, 2, size**2)
    # The image takes its size from the curve itself rather than from `size`.
    image = Image.new("RGB", curve.dimensions())
    canvas = ImageDraw.Draw(image)
    # Data symbols per pixel.
    step = len(csource)/float(len(curve))
    for i, p in enumerate(curve):
        rgb = csource.point(int(i*step))
        canvas.point(tuple(p), fill=tuple(rgb))
        # Report progress once every 100 pixels.
        if i % 100 == 0:
            prog.tick(i)
    image.save(name)

In [10]:
def main():
    """Command-line entry point: render a file as a space-filling-curve image.

    Parses the options, reads the input file, builds the colour source chosen
    with ``--color`` and draws the image with ``drawmap_unrolled`` or
    ``drawmap_square`` depending on ``--type``.

    Exits with status 1 when no output name was given and the derived name
    already exists. Raises ValueError for a malformed ``--block`` value.
    """
    from optparse import OptionParser
    parser = OptionParser(
                usage = "%prog [options] infile [output]",
                version="%prog 0.1",
            )
    parser.add_option(
        "-b", "--block", action="store",
        dest="block", default=None,
        help="Mark a block of data with a specified color. Format: hexstartaddr:hexendaddr[:hexcolor]"
    )
    parser.add_option(
        "-c", "--color", action="store",
        type="choice", dest="color", default="class",
        choices=["class", "hilbert", "entropy", "gradient"],
        help="Color map."
    )
    parser.add_option(
        "-m", "--map", action="store",
        type="choice", dest="map", default="hilbert",
        choices=sorted(scurve.curveMap.keys()),
        help="Pixel layout map. Can be any supported curve."
    )
    parser.add_option(
        "-n", "--namesuffix", action="store",
        type="str", dest="suffix", default="",
        help="Suffix for generated file names. Ignored if destination is specified."
    )
    parser.add_option(
        "-p", "--progress", action="store_true", default=False,
        dest="progress",
        help="Don't show progress bar - print the destination file name."
    )
    parser.add_option(
        "-s", "--size", action="store",
        type="int", dest="size", default=256,
        help="Image width in pixels."
    )
    parser.add_option(
        "-t", "--type", type="choice",
        dest="type", default="unrolled",
        choices=["unrolled", "square"],
        help="Image aspect ratio - square (1x1) or unrolled (1x4)"
    )
    parser.add_option(
        "-q", "--quiet", action="store_true",
        dest="quiet", default=False
    )
    options, args = parser.parse_args()
    if len(args) not in [1, 2]:
        parser.error("Please specify input and output file.")

    # Read the raw bytes and map every byte to the character with the same
    # code point (latin-1). The colour classes call ord() on each symbol, so
    # this keeps every value in 0..255, cannot raise UnicodeDecodeError on
    # non-UTF-8 input, and the `with` block closes the handle.
    with open(args[0], 'rb') as infile:
        d = infile.read().decode('latin-1')

    if len(args) == 2:
        # An explicit output path was given: use it as is.
        dst = args[1]
    else:
        # Derive "<input basename without extension><suffix>.png".
        base = os.path.basename(args[0])
        if "." in base:
            base, _ = base.rsplit(".", 1)
        dst = base + options.suffix + ".png"

    # Only a derived name is protected; an explicit destination is overwritten.
    if os.path.exists(dst) and len(args) < 2:
        print("Refusing to over-write '%s'. Specify explicitly if you really want to do this."%dst, file=sys.stderr)
        sys.exit(1)

    block = None
    if options.block:
        # "start:end[:color]" with start and end given in hexadecimal.
        parts = options.block.split(":")
        if len(parts) not in [2, 3]:
            raise ValueError("Invalid block specification.")
        s, e = int(parts[0], 16), int(parts[1], 16)
        if len(parts) == 3:
            c = draw.parseColor(parts[2])
        else:
            c = [255, 0, 0]
        block = (s, e, c)

    if options.color == "class":
        csource = ColorClass(d, block)
    elif options.color == "hilbert":
        csource = ColorHilbert(d, block)
    elif options.color == "gradient":
        csource = ColorGradient(d, block)
    else:
        csource = ColorEntropy(d, block)

    if options.progress:
        print(dst)

    # Both --quiet and --progress replace the progress bar with a dummy.
    if options.quiet or options.progress:
        prog = progress.Dummy()
    else:
        prog = progress.Progress(None)

    if options.type == "unrolled":
        drawmap_unrolled(options.map, options.size, csource, dst, prog)
    elif options.type == "square":
        drawmap_square(options.map, options.size, csource, dst, prog)
    prog.clear()

### Find bug / Debugging

In [233]:
#Set arguments required
map1 = "hilbert"
size = 128
d = '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/NotPhish/0_17track_net.html'
block = None
# The colour source must receive the file's contents, not its path: passing
# `d` itself would colour the characters of the path string. Bytes are mapped
# one-to-one to characters (latin-1) so ord() stays within 0..255.
with open(d, 'rb') as fh:
    data = fh.read().decode('latin-1')
csource = ColorHilbert(data, block)
base = os.path.basename(d)
if "." in base:
    base, _ = base.rsplit(".", 1)
dst = base + "" + ".png"
prog = progress.Progress(None)

In [232]:
drawmap_unrolled(map1, size, csource, dst, prog)

TypeError: unsupported operand type(s) for >>: 'float' and 'int'

In [241]:
#Test function
prog.set_target((size**2)*4)
map = scurve.fromSize(map1, 2, size**2)
c = Image.new("RGB", (size, size*4))
cd = ImageDraw.Draw(c)
step = len(csource)/float(len(map)*4)
sofar = 0

In [184]:
#map.point(1)

#hilbert_point(2, 7, 1)

2

In [252]:
h = 1
order = map.order
dimension = map.dimension

hwidth = order*dimension

e, d = 0, 0
p = [0]*dimension

def hilbert_point(dimension, order, h):
    """
        Convert an index on the Hilbert curve of the specified dimension and
        order to a set of point coordinates.

        dimension -- number of coordinates in the returned point
        order     -- number of iterations of the main loop; per the bit-width
                     table below, also the bit width of each coordinate
        h         -- index along the curve, treated as an order*dimension
                     bit value

        Returns a list of `dimension` integers.
    """
    #    The bit widths in this function are:
    #        p[*]  - order
    #        h     - order*dimension
    #        l     - dimension
    #        e     - dimension
    hwidth = order*dimension
    # e and d are running state carried from one iteration to the next; they
    # are updated from entry(w) and direction(w, dimension) at the end of each pass.
    e, d = 0, 0
    p = [0]*dimension
    for i in range(order):
        # Take the i-th group of `dimension` bits out of h.
        # NOTE(review): presumably groups are counted from the most
        # significant end - confirm against utils.bitrange.
        w = utils.bitrange(h, hwidth, i*dimension, i*dimension+dimension)
        l = utils.graycode(w)
        l = itransform(e, d, dimension, l)
        # Spread the bits of l over the coordinates: bit j of l becomes
        # bit i of coordinate j.
        for j in range(dimension):
            b = utils.bitrange(l, dimension, j, j+1)
            p[j] = utils.setbit(p[j], order, i, b)
        # Both updates use the old value of d, so e must be updated first.
        e = e ^ utils.lrot(entry(w), d+1, dimension)
        d = (d + direction(w, dimension) + 1)%dimension
    return p

def itransform(entry, direction, width, x):
    """
        Inverse transform: undo the operations of transform by rotating x
        left by direction+1 within `width` bits and XOR-ing with entry.
        Both x and entry must fit in `width` bits.
    """
    limit = 2**width
    assert x < limit
    assert entry < limit
    rotated = utils.lrot(x, direction+1, width)
    return rotated ^ entry
    # The Hamilton paper's formulation of the inverse transform in Lemma 2.12
    # contains an error. Restated correctly as a transform it reads:
    #return transform(rrot(entry, direction+1, width), width-direction-2, width, x)
    
def entry(x):
    """Return the Gray code of 2*floor((x-1)/2), or 0 when x is 0.

    Floor division (`//`) is required. The library line this was copied from,
    `2*((x-1)/2)`, relied on Python 2 integer division. Under Python 3 `/`
    yields a float, which made graycode fail with "unsupported operand
    type(s) for >>: 'float' and 'int'". Wrapping the float in int() hides the
    error but returns x-1 instead of x-2 for even x.
    """
    if x == 0:
        return 0
    return utils.graycode(2*((x-1)//2))
    
def direction(x, n):
    """Return 0 for x == 0, otherwise utils.tsb(k, n) % n, where k is x for
    odd x and x-1 for even x. x must fit in n bits."""
    assert x < 2**n
    if x == 0:
        return 0
    # An even index gives the same result as the odd index just below it.
    probe = x if x % 2 else x - 1
    return utils.tsb(probe, n) % n

In [254]:
for i in range(order):
    w = utils.bitrange(h, hwidth, i*dimension, i*dimension+dimension)
    l = utils.graycode(w)
    l = itransform(e, d, dimension, l)
    for j in range(dimension):
        b = utils.bitrange(l, dimension, j, j+1)
        p[j] = utils.setbit(p[j], order, i, b)
    e = e ^ utils.lrot(entry(w), d+1, dimension)
    d = (d + direction(w, dimension) + 1)%dimension
    
print(p)

[1, 0]


In [222]:

int(2*((1-1)/2))

0

1

In [257]:
for i, p in enumerate(scurve.fromSize('hilbert', 2, size**2)):
    print(p)

[0, 0]


TypeError: unsupported operand type(s) for >>: 'float' and 'int'

In [157]:
map

<scurve.hilbert.Hilbert at 0x7fc302bef610>

In [129]:
for quad in range(4):
    for i, p in enumerate(map):
        off = (i + (quad * size**2))
        color = csource.point(
                    int(off * step)
                )
        x, y = tuple(p)
        cd.point(
            (x, y + (size * quad)),
            fill=tuple(color)
        )
        if not sofar%100:
            prog.tick(sofar)
        sofar += 1
c.save(name)

TypeError: unsupported operand type(s) for >>: 'float' and 'int'

In [105]:
#drawmap_unrolled(map, size, csource, dst, prog)

In [103]:
csource.point

<__main__.ColorHilbert at 0x7fc302e5cd30>

In [50]:
l = '0000000:0005a70[:byteclass]' 
parts = l.split(":")
print(parts)

['0000000', '0005a70[', 'byteclass]']


In [None]:
# `file(...)` was the Python 2 built-in alias for `open(...)`; it no longer
# exists in Python 3. Read the raw bytes and map each byte to the character
# with the same code point (latin-1) so ord() on a symbol stays within 0..255.
with open(args[0], 'rb') as fh:
    d = fh.read().decode('latin-1')

dst = 'hilbert'

csource = ColorHilbert(d, block)

def drawmap_unrolled(map, size, csource, name, prog):
    """Render ``csource`` as a ``size`` x ``4*size`` image made of four stacked
    squares, each laid out along the curve named by ``map``, and save it.

    This repeats the definition given near the top of the notebook; whichever
    cell ran last is the one in effect.

    map     -- curve name handed to ``scurve.fromSize``
    size    -- image width in pixels; each of the four squares is size x size
    csource -- colour source providing ``len()`` and ``point(offset)``
    name    -- destination passed to the image's ``save``
    prog    -- progress reporter providing ``set_target`` and ``tick``
    """
    quad_pixels = size**2
    prog.set_target(quad_pixels*4)
    curve = scurve.fromSize(map, 2, quad_pixels)
    image = Image.new("RGB", (size, size*4))
    canvas = ImageDraw.Draw(image)
    # Data symbols per pixel, so the whole input is spread over all four squares.
    step = len(csource)/float(len(curve)*4)

    drawn = 0
    for quad in range(4):
        first_offset = quad * quad_pixels
        y_shift = size * quad
        for i, p in enumerate(curve):
            rgb = csource.point(int((i + first_offset) * step))
            x, y = tuple(p)
            canvas.point((x, y + y_shift), fill=tuple(rgb))
            # Report progress once every 100 pixels.
            if drawn % 100 == 0:
                prog.tick(drawn)
            drawn += 1
    image.save(name)


In [53]:
path = '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/NotPhish/0_17track_net.html'  

In [57]:
# The `with` block closes the handle as soon as the contents are read.
with open(path, 'r') as d:
    content = d.read()

In [59]:
content

In [66]:
argg = '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/NotPhish/0_17track_net.html'

base = os.path.basename(argg)

base.rsplit(".", 1)

#if "." in base:
#    base, _ = base.rsplit(".", 1)
#    dst = base + options.suffix + ".png"

['0_17track_net', 'html']

In [71]:
if 1: # Stand-in for the "output file already exists" check
    # Python 3: print is a function. The Python 2 form `print >> sys.stderr, ...`
    # raises TypeError here, so the target stream is passed with `file=`.
    print("Refusing to over-write '%s'. Specify explicitly if you really want to do this."%dst, file=sys.stderr)
    sys.exit(1)

In [76]:
dst = '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/NotPhish/0_17track_net.html'
print("Refusing to over-write '%s'. Specify explicitly if you really want to do this."%dst, file=sys.stderr)

Refusing to over-write '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/NotPhish/0_17track_net.html'. Specify explicitly if you really want to do this.


In [78]:
scurve.curveMap.keys()

dict_keys(['hcurve', 'hilbert', 'zigzag', 'zorder', 'natural', 'gray'])

In [91]:
x = 10

y = x>>5

In [92]:
y

0

In [None]:
# Call chain behind "TypeError: unsupported operand type(s) for >>: 'float' and 'int'",
# innermost frame first, each line annotated with its file and line number.
# This cell is a note, not runnable code: `return` appears outside any function.
# The float comes from `(x-1)/2` in entry: under Python 3 `/` is true division,
# so graycode receives a float and `x>>1` fails.
return x^(x>>1) #/scurve/utils.py : line 8, in graycode

return utils.graycode(2*((x-1)/2)) #/scurve/hilbert.py : line 37, in entry

e = e ^ utils.lrot(entry(w), d+1, dimension) #/scurve/hilbert.py : line 60, in hilbert_point

return hilbert_point(self.dimension, self.order, idx) #/scurve/hilbert.py : line 112, in point

return self.csource.point(int(c*self.step)) #/binvis.py : line 43, in getPoint

return self.getPoint(x) #/binvis.py : line 22, in point

color = csource.point( #/binvis.py : line 86, in drawmap_unrolled
    
drawmap_unrolled(options.map, options.size, csource, dst, prog) #/binvis.py : line 211, in main







In [None]:
x = 10 #(0000 1010)

x >> (width-end) & ((2**(end-start))-1)

In [258]:
base = os.path.basename('/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/NotPhish/0_17track_net.html')
if "." in base:
    base, _ = base.rsplit(".", 1)
dst = base + ""+ ".png"
print(dst)

0_17track_net.png


# Finally: a working run

In [16]:
#Set arguments required
map = "hilbert"
size = 256

d = '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/NotPhish/0_afip_gob_ar.html'
block = None
# The colour source must receive the file's contents, not its path: passing
# `d` itself would colour the characters of the path string. Bytes are mapped
# one-to-one to characters (latin-1) so ord() stays within 0..255.
with open(d, 'rb') as fh:
    data = fh.read().decode('latin-1')
csource = ColorClass(data, block)
base = os.path.basename(d)
if "." in base:
    base, _ = base.rsplit(".", 1)
dst = base + "" + ".png"
prog = progress.Progress(None)

In [17]:
drawmap_unrolled(map, size, csource, dst, prog)

|---------------------------------------> | 0:00:00

In [None]:
#run on terminal
#python binvis.py -c class -m hilbert -s 128 '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/NotPhish/0_17track_net.html'

# Pipeline

In [245]:
import pandas as pd
import time

#For training

training_phishing = '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/Phish/' #Dir with the phishing websites
training_legitimate = '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/NotPhish/' #Dir with the legitimate websites

binary_images_path = '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/Binary_images/' #Dir to save images generated for phishing websites

n_samples = 250 #Number of samples per category(250 for phishing, 250 for legitimate)

def listdir_nohidden(path):
    """Yield the names in ``path`` that do not start with a dot."""
    for entry in os.listdir(path):
        if entry.startswith('.'):
            continue
        yield entry

def innit_df(n_samples, random_state=None):
    """Build a shuffled, class-balanced sample of the training files.

    n_samples    -- number of files to draw from each class directory
    random_state -- optional seed (or random state object) forwarded to every
                    ``DataFrame.sample`` call so the selection and the shuffle
                    can be reproduced; ``None`` (the default) keeps the
                    original non-deterministic behaviour

    Returns a DataFrame of ``2 * n_samples`` rows with the columns ``File``
    (file name), ``BinaryImage`` (all ``None``, filled in later) and
    ``Phishing`` (category dtype; 1 for files listed in ``training_phishing``,
    0 for files listed in ``training_legitimate``).

    ``DataFrame.sample`` raises if a directory holds fewer than ``n_samples``
    visible files.
    """
    # One frame per class, holding every visible file name of that directory.
    isPhish = pd.DataFrame(data = {'File': list(listdir_nohidden(training_phishing)), 'BinaryImage': None, 'Phishing': 1 })
    notPhish = pd.DataFrame(data = {'File': list(listdir_nohidden(training_legitimate)), 'BinaryImage': None, 'Phishing': 0})

    # Draw the same number of rows from each class.
    sample_phish = isPhish.sample(n_samples, random_state = random_state)
    sample_legitimate = notPhish.sample(n_samples, random_state = random_state)

    # Concatenate, then shuffle the rows so the classes are interleaved.
    combined = pd.concat([sample_phish, sample_legitimate], ignore_index = True)
    final_df = combined.sample(frac = 1, random_state = random_state).reset_index(drop = True)
    final_df['Phishing'] = final_df['Phishing'].astype('category')

    return final_df

args = {'mapp': "hilbert", #Sets the static arguments
        'size': 128,
        'block': None,
        'prog': progress.Progress(None)
        }

def generate_images(df, mapp, size, block, prog):
    """Draw one image per row of ``df`` and record the image file names.

    df    -- frame whose rows are (File, BinaryImage, Phishing); it is modified
             in place: ``BinaryImage`` receives the generated file names
    mapp  -- curve name passed to ``drawmap_unrolled``
    size  -- image width passed to ``drawmap_unrolled``
    block -- optional marked block passed to ``ColorClass``
    prog  -- progress reporter passed to ``drawmap_unrolled``

    Images are written to ``binary_images_path`` as "<file name without
    extension>.png".
    """
    image_names = []
    for row in df.values:
        # Phishing rows live in one directory, legitimate rows in the other.
        source_dir = training_phishing if row[2] == 1 else training_legitimate
        # Colour the file's contents, not the characters of its path. Bytes
        # are mapped one-to-one to characters (latin-1) so ord() in ColorClass
        # stays within 0..255 and undecodable files cannot raise.
        with open(source_dir + row[0], 'rb') as fh:
            data = fh.read().decode('latin-1')
        csource = ColorClass(data, block)
        # splitext drops whatever extension the file has, not only ".html".
        image_name = os.path.splitext(row[0])[0] + ".png"
        drawmap_unrolled(mapp, size, csource, binary_images_path + image_name, prog)
        image_names.append(image_name)
    df['BinaryImage'] = image_names

In [241]:
df = innit_df(n_samples)

In [243]:
start_time = time.time()

generate_images(df, args['mapp'], args['size'], args['block'], args['prog'])

timee = (time.time() - start_time)
print("--- %s seconds ---" %timee)

|---------------------------------------> | 0:00:00

KeyboardInterrupt: 

Unnamed: 0,File,BinaryImage,Phishing
0,4_cryptositeslist_com_zx_spectrocoin.html,,0
1,6_xero_com.html,,0
2,7470384.html,,1
3,7459915.html,,1
4,3_forbes_com.html,,0
...,...,...,...
495,6_ushistory_org.html,,0
496,1_astrology_com.html,,0
497,7459972.html,,1
498,7468701.html,,1


In [239]:
aux_l = []
for file in df.values:
    print(file[1])
        
    break

None


Unnamed: 0,File,BinaryImage,Phishing
0,7458203.html,,1
1,7466595.html,,1
2,7468798.html,,1
3,6_google_com_my.html,,0
4,6_whowhatwear_com.html,,0
...,...,...,...
495,6_bungie_net.html,,0
496,7454556.html,,1
497,7454535.html,,1
498,5_att_net.html,,0


In [None]:
#Set arguments required
map = "hilbert"
size = 128
block = None
d = '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/NotPhish/0_550909_com.html'

# The colour source must receive the file's contents, not its path. Bytes are
# mapped one-to-one to characters (latin-1) so ord() stays within 0..255.
with open(d, 'rb') as fh:
    data = fh.read().decode('latin-1')
csource = ColorClass(data, block)

base = os.path.basename(d)
if "." in base:
    base, _ = base.rsplit(".", 1)
dst = base + "" + ".png"
prog = progress.Progress(None)

#----------------------
# The same settings gathered in one dict. The keys are strings and the entries
# are comma-separated: the earlier draft used bare names without commas, which
# is a syntax error, and referred to `d` and `block` before they were defined.
args = {'map': map,
        'size': size,
        'csource': csource,
        'dst': dst
        }

In [188]:
base = os.path.basename(d)

In [189]:
base

'0_afip_gob_ar.html'

In [257]:
def try_utf8(data):
    """Decode ``data`` as UTF-8; return ``None`` when it is not valid UTF-8."""
    try:
        decoded = data.decode('utf-8')
    except UnicodeDecodeError:
        decoded = None
    return decoded

In [273]:
# The `with` block closes the handle as soon as the bytes are read.
with open('/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/Phish/7453961.html', 'rb') as file:
    data = file.read()
udata = try_utf8(data)

In [275]:
print(udata)

None


In [11]:
args = {'mapp': "hilbert", #Sets the static arguments
        'size': 128,
        'block': None,
        'prog': progress.Progress(None)
        }

In [40]:
d = '/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/NotPhish/0_550909_com.html'
prog = args['prog']
size = args['size']
map = args['mapp']

csource = ColorHilbert(open(d).read(), args['block'])
prog.set_target((size**2)*4)
map = scurve.fromSize(map, 2, size**2)
c = Image.new("RGB", (size, size*4))
cd = ImageDraw.Draw(c)
step = len(csource)/float(len(map)*4)

sofar = 0
for quad in range(4):
    for i, p in enumerate(map):
        off = (i + (quad * size**2))
        color = csource.point(
                    int(off * step)
                )
        x, y = tuple(p)
        cd.point(
            (x, y + (size * quad)),
            fill=tuple(color)
        )
        if not sofar%100:
            prog.tick(sofar)
        sofar += 1


#drawmap_unrolled(mapp, size, csource, dst, prog)

In [None]:
im = imageio.imread(c)

In [None]:
prog.set_target((size**2)*4)
map = scurve.fromSize(map, 2, size**2)
c = Image.new("RGB", (size, size*4))
cd = ImageDraw.Draw(c)
step = len(csource)/float(len(map)*4)

sofar = 0
for quad in range(4):
    for i, p in enumerate(map):
        off = (i + (quad * size**2))
        color = csource.point(
                    int(off * step)
                )
        x, y = tuple(p)
        cd.point(
            (x, y + (size * quad)),
            fill=tuple(color)
        )
        if not sofar%100:
            prog.tick(sofar)
        sofar += 1
c.save(name)


In [43]:
import imageio.v3 as imageio

im = imageio.imread('/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/Binary_images/0_canlidoviz_com.png')


In [44]:
# Reads all images in a given path
import imageio.v3 as iio
from pathlib import Path

# Load every regular file found in the image directory; anything that is not
# a file (for example a sub-directory) is skipped.
images = []
for file in Path("/Users/joao.ferreira/Documents/Anaconda-jupyter/archive/training/Binary_images").iterdir():
    if file.is_file():
        images.append(iio.imread(file))

array([[[120, 180,  15],
        [ 30,  68,  51],
        [196, 236, 177],
        ...,
        [ 30,  68,  51],
        [207, 236, 241],
        [202,  28, 161]],

       [[ 32,  34,  13],
        [ 32,  34,  13],
        [189, 107, 217],
        ...,
        [189, 107, 217],
        [239,  17, 241],
        [ 76, 196, 195]],

       [[188,  43,  61],
        [150,  30, 195],
        [ 24,  46, 125],
        ...,
        [150, 240, 195],
        [210,  90, 221],
        [150, 136,  75]],

       ...,

       [[189,  46, 228],
        [189,  46, 228],
        [133,  92, 200],
        ...,
        [ 52, 163,  74],
        [196, 236, 177],
        [239,  17, 241]],

       [[ 30, 180,  60],
        [135, 240, 136],
        [170,  51, 170],
        ...,
        [ 32,  34,  13],
        [227, 197,  81],
        [125,  34,  40]],

       [[153, 240,  34],
        [135, 240, 136],
        [125,  34,  40],
        ...,
        [213, 109,  89],
        [244, 185, 144],
        [207, 236, 241]]