In [2]:
pip install yarl



In [3]:
# Mount Google Drive at /content/gdrive; the dataset is later read from a path under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
from yarl import URL
# Build a yarl URL object from an example address so its components can be inspected.
url = URL("https://github.com/search?q=data+science")

In [5]:
# Show the scheme component of the example URL.
url.scheme

'https'

In [6]:
import pandas as pd

# Load the URL classification dataset from the mounted Drive folder.
# The first CSV column becomes the index; the remaining two are named explicitly.
data = pd.read_csv(
    "/content/gdrive/MyDrive/Datasets/URL Classification.csv",
    names=["url", "Type"],
    index_col=0,
)

In [7]:
# Draw a fixed-seed random subset of 10,000 rows so later cells work on a repeatable sample.
sample = data.sample(n=10000, random_state=1)
sample.head(n=10)

Unnamed: 0,url,Type
1308349,http://yorkrite.com/ne/gcram/,Society
276825,http://www.alliedartistsofamerica.org/,Arts
1315267,http://www.msstate.edu/org/farmhouse/index.html,Society
392387,http://www.sunnytec.com.tw/,Business
1267730,http://www.kabissa.org,Society
1392788,http://www.newadvent.org/cathen/12134b.htm,Society
672881,http://www.gamespot.com/ps2/driving/arcticthun...,Games
1526125,http://www.studsquad.net/,Sports
1009901,http://www.spacedog.biz,Reference
330964,http://www.meditrans-japan.com/,Business


In [8]:
# Parse every raw URL string exactly once. The parsed objects are kept in their
# own Series instead of being written back into `sample`, so `sample` keeps the
# original strings shown by the previous cell and this cell is safe to re-run.
parsed_urls = sample["url"].apply(URL)

# `processed` keeps the original column order: the `url` column is replaced by
# the parsed objects and one column is appended per extracted URL component.
processed = sample.assign(
    url=parsed_urls,
    host=parsed_urls.apply(lambda parsed: parsed.host),
    path=parsed_urls.apply(lambda parsed: parsed.path),
    name=parsed_urls.apply(lambda parsed: parsed.name),
    scheme=parsed_urls.apply(lambda parsed: parsed.scheme),
    query=parsed_urls.apply(lambda parsed: parsed.query_string),
)
processed

Unnamed: 0,url,Type,host,path,name,scheme,query
1308349,http://yorkrite.com/ne/gcram/,Society,yorkrite.com,/ne/gcram/,,http,
276825,http://www.alliedartistsofamerica.org/,Arts,www.alliedartistsofamerica.org,/,,http,
1315267,http://www.msstate.edu/org/farmhouse/index.html,Society,www.msstate.edu,/org/farmhouse/index.html,index.html,http,
392387,http://www.sunnytec.com.tw/,Business,www.sunnytec.com.tw,/,,http,
1267730,http://www.kabissa.org,Society,www.kabissa.org,/,,http,
...,...,...,...,...,...,...,...
169463,http://www.naxos.com/composerinfo/3303.htm,Arts,www.naxos.com,/composerinfo/3303.htm,3303.htm,http,
1094461,http://members.tripod.com/arroweb1/,Science,members.tripod.com,/arroweb1/,,http,
526843,http://www.petroskills.com/,Business,www.petroskills.com,/,,http,
171473,http://www.gregbartholomew.com,Arts,www.gregbartholomew.com,/,,http,


In [9]:
# Count how many sampled URLs fall under each (Type, host) pair.
group = processed.groupby(["Type", "host"]).agg(count=("url", "count"))

# Rank all pairs by count (highest first), keep the first five rows per Type,
# then order the result by Type so each category's hosts sit together.
sorted_group = group.sort_values("count", ascending=False).reset_index()
largest = (
    sorted_group
    .groupby("Type")
    .head(5)
    .sort_values("Type")
)

# Display the last 10 rows of the result.
largest.tail(10)

Unnamed: 0,Type,host,count
17,Society,members.tripod.com,12
25,Society,en.wikipedia.org,9
7,Society,www.angelfire.com,25
3,Society,www.geocities.com,56
1,Society,www.newadvent.org,91
35,Sports,sportsillustrated.cnn.com,7
11,Sports,www.geocities.com,18
45,Sports,www.clubwebsite.co.uk,6
50,Sports,www.angelfire.com,6
47,Sports,www.freewebs.com,6


In [10]:
pip install graphistry

Collecting graphistry
  Downloading graphistry-0.29.6-py3-none-any.whl (228 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.6/228.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting palettable>=3.0 (from graphistry)
  Downloading palettable-3.3.3-py2.py3-none-any.whl (332 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m332.3/332.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting squarify (from graphistry)
  Downloading squarify-0.4.3-py3-none-any.whl (4.3 kB)
Installing collected packages: squarify, palettable, graphistry
Successfully installed graphistry-0.29.6 palettable-3.3.3 squarify-0.4.3


In [11]:
import getpass
import os

import graphistry

# Never keep credentials in the notebook source: read them from the environment
# (GRAPHISTRY_USERNAME / GRAPHISTRY_PASSWORD) and fall back to an interactive
# prompt. getpass keeps the password out of the cell output.
graphistry.register(
    api=3,
    username=os.environ.get("GRAPHISTRY_USERNAME") or input("Graphistry username: "),
    password=os.environ.get("GRAPHISTRY_PASSWORD") or getpass.getpass("Graphistry password: "),
)

In [12]:
# Edge table for the graph: the "Type" and "host" columns of `largest`, one pair per row.
edges = largest[["Type", "host"]]

In [13]:
def create_node_df(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Build a de-duplicated node table from one column of ``df``.

    Args:
        df: Frame containing the column to turn into nodes.
        col_name: Name of the column whose values become node identifiers.

    Returns:
        A frame with two columns: ``node`` (the values of ``df[col_name]``)
        and ``type`` (the constant string ``col_name``). Duplicate rows are
        dropped and the surviving rows keep their original index labels.
    """
    nodes = (
        df[[col_name]]
        # Rename before tagging: if the source column were itself called
        # "type", assigning the tag first would overwrite the node values.
        .rename(columns={col_name: "node"})
        .assign(type=col_name)
        .drop_duplicates()
    )
    return nodes


# One node table per edge endpoint column, stacked into a single frame
# (category nodes first, then host nodes).
type_nodes, url_nodes = (
    create_node_df(largest, endpoint) for endpoint in ("Type", "host")
)
nodes = pd.concat([type_nodes, url_nodes])
nodes

Unnamed: 0,node,type
115,Adult,Type
12,Arts,Type
114,Business,Type
9,Computers,Type
41,Games,Type
127,Health,Type
18,Home,Type
111,Kids,Type
1343,News,Type
4,Recreation,Type


In [14]:

# Bind the edge table (using its "Type" and "host" columns) and the node table
# (identified by its "node" column), then render the graph.
g = graphistry.edges(edges, "Type", "host").nodes(nodes, "node")

g.plot()

In [15]:
pip install faker

Collecting faker
  Downloading Faker-19.10.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-19.10.0


In [16]:
from faker import Faker

# Distinct category names that appear in the edge table.
types = list(edges.Type.unique())

# Seed Faker's shared random generator so the palette is identical on every
# run; unseeded, each execution of this cell yields different colors.
Faker.seed(0)
fake = Faker()

# One random color per category, paired up in the same order as `types`.
colors = [fake.color() for _ in types]
node_color_mapping = dict(zip(types, colors))
node_color_mapping

{'Adult': '#9023c6',
 'Arts': '#eda097',
 'Business': '#ea7344',
 'Computers': '#ea9570',
 'Games': '#f0ffb2',
 'Health': '#078450',
 'Home': '#6639ba',
 'Kids': '#238189',
 'News': '#ffd0c4',
 'Recreation': '#2ea0a0',
 'Reference': '#fcf294',
 'Science': '#473899',
 'Shopping': '#93f9b0',
 'Society': '#78b9e8',
 'Sports': '#14d166'}

In [17]:
# Icon name to use for each value of the node table's "type" column ("host" or "Type").
# NOTE(review): the icon names look like Font Awesome identifiers — confirm against the Graphistry docs.
node_icon_mapping = {"host": "link", "Type": "newspaper-o"}

In [18]:
# Same graph as before, now with visual encodings:
#   - point color is looked up from the "node" column via node_color_mapping,
#     with "silver" for any node not in the mapping;
#   - point icon is looked up from the "type" column via node_icon_mapping.
g = (
    graphistry
    .edges(edges, "Type", "host")
    .nodes(nodes, "node")
    .encode_point_color(
        "node",
        categorical_mapping=node_color_mapping,
        default_mapping="silver",
    )
    .encode_point_icon("type", categorical_mapping=node_icon_mapping)
)

g.plot()