I want to create the adjacency list for the "valid_redirects". 

1. Get the `ads.txt` files
2. Figure out how to clean up each file
    - The aim is to get every row of the file into a pandas dataframe
    - I would have to deal with comments
    - Throwaway certain rows
    - among other stuff...
3. Output the adjacency list with the name of the publisher against the name of the platform
4. Make a networkx object

In [1]:
import pandas as pd
import requests as req
from urllib.parse import urlparse
import os
from itertools import islice
from bs4 import BeautifulSoup
import numpy as np
import re
import networkx as nx

In [9]:
req.get('https://w123.com')

ConnectTimeout: HTTPSConnectionPool(host='w123.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001D50FE9B760>, 'Connection to w123.com timed out. (connect timeout=None)'))

## 1. Get the `ads.txt` files

Since this is parallelized, it is in a separate script - See `getting_adstxt.py`

## 2. Figure out how to clean up each file

- The aim is to get every row of the file into a pandas dataframe
    - I would have to deal with comments
    - Throwaway certain rows
    - among other stuff...

In [2]:
#A function to print n elements from a dict just for convenience
#From here https://stackoverflow.com/a/7971655/10098211
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

In [3]:
#Getting everything read into a dictionary
adstxt_dict = dict()
directory = "./ads_files"
adstxt_files = os.listdir(directory)

for filename in adstxt_files:
    base_url = filename.rsplit('.', 1)[0]
    with open(f"{directory}/{filename}", 'rb') as adstxt_file:
        adstxt_dict[base_url] = adstxt_file.readlines()

#Printing a couple of items just to eyeball it
# n_items = take(2, adstxt_dict.items()) 
# print(n_items)

In [4]:
def ads_txt_uncommenter(adstxt_line):
    adstxt_line = adstxt_line.decode('utf-8', errors='ignore')
    adstxt_line_uncomment = adstxt_line.split('#', 1)[0]
    return adstxt_line_uncomment.strip()

In [5]:
adstxt_decoded_dict = dict()
for key, val in adstxt_dict.items():
    try:
        adstxt_decoded_dict[key] = [ads_txt_uncommenter(x) for x in val]
    except Exception as e:
        print(key, str(e))

In [6]:
#Throwaway certain lines
adstxt_decoded_weeded_dict = dict()
for key, val in adstxt_decoded_dict.items():
    adstxt_decoded_weeded_dict[key] = list()
    for line in val:
        if line.strip() == '':
            continue
        elif bool(BeautifulSoup(line, "html.parser").find()): #https://stackoverflow.com/a/24856208/10098211
            continue 
        else:
            adstxt_decoded_weeded_dict[key].append(line)



In [7]:
#separating the variables and legitimate lines
adstxt_decoded_weeded_formatted_dict = dict()
for key, value in adstxt_decoded_weeded_dict.items():
    variables = []
    lines = []
    for line in value:
        if line.__contains__('='):
            variables.append(line)
        else:
            lines.append(line)
    adstxt_decoded_weeded_formatted_dict[key] = [lines, variables]

In [8]:
#Creating dataframe
adstxt_df = pd.DataFrame.from_dict(adstxt_decoded_weeded_formatted_dict, orient='index', columns = None)#, columns = ['base_url', 'adstxt_lines'])
adstxt_df.columns = ['adstxt_lines', 'variables']
adstxt_df.head(5)

Unnamed: 0,adstxt_lines,variables
397news.com,[],[]
abcnews.go.com,"[themediagrid.com, CGBR8T, DIRECT, 35d5010d778...","[OWNERDOMAIN=disney.com, MANAGERDOMAIN=theglob..."
aberdeennews.com,"[media.net, 8CUDQDX7N, DIRECT, aniview.com, 62...","[ownerdomain=gannett.com, subdomain=obits.news..."
abilene-rc.com,"[townnews.com, 22666575351, DIRECT, google.com...",[﻿MANAGERDOMAIN=townnews.com]
abqjournal.com,"[google.com, pub-1192071988574450, DIRECT, f08...","[subdomain=v7.comicskingdom.net, subdomain=gam..."


In [9]:
#Throwiung away urls with empty files
adstxt_df['file_len'] = adstxt_df['adstxt_lines'].apply(lambda x: len(x))
adstxt_filled_df = adstxt_df.loc[(adstxt_df.file_len != 0) & ~(adstxt_df.file_len.isna())]
print(len(adstxt_df))
print(len(adstxt_filled_df))

2527
2467


In [10]:
adstxt_filled_df = adstxt_filled_df.reset_index(names= 'base_url')
adstxt_filled_df.head(4)

Unnamed: 0,base_url,adstxt_lines,variables,file_len
0,abcnews.go.com,"[themediagrid.com, CGBR8T, DIRECT, 35d5010d778...","[OWNERDOMAIN=disney.com, MANAGERDOMAIN=theglob...",210
1,aberdeennews.com,"[media.net, 8CUDQDX7N, DIRECT, aniview.com, 62...","[ownerdomain=gannett.com, subdomain=obits.news...",726
2,abilene-rc.com,"[townnews.com, 22666575351, DIRECT, google.com...",[﻿MANAGERDOMAIN=townnews.com],1668
3,abqjournal.com,"[google.com, pub-1192071988574450, DIRECT, f08...","[subdomain=v7.comicskingdom.net, subdomain=gam...",1201


## 3. Output the adjacency list with the name of the publisher against the name of the platform


In [11]:
# with open('./data/adjacency_list_df.csv', 'w') as outfile:
#     outfile.write("publisher,platform,acc_id,type,tag_id\n")

# for row in adstxt_filled_df.values:
#     for adstxt_line in row[1]:
#         cols = adstxt_line.split(",")
#         if len(cols) > 4: #I observed that some lines weren't properly broken. So I am going to drop things after column 4
#             adstxt_line = ','.join(cols[:4])
#         with open('./data/adjacency_list_df.csv', 'a') as outfile:
#             try:
#                 outfile.write(f"{row[0]},{adstxt_line}\n")
#             except: # If there is an encoding issue with a line, I am not writing it out
#                 continue

In [10]:
adj_df = pd.read_csv('./data/adjacency_list_df.csv', sep=',', encoding_errors = 'ignore')
adj_df.head(5)

Unnamed: 0,publisher,platform,acc_id,type,tag_id
0,abcnews.go.com,themediagrid.com,CGBR8T,DIRECT,35d5010d7789b49d
1,abcnews.go.com,taboola.com,1184469,DIRECT,c228e6794e811952
2,abcnews.go.com,telaria.com,i79zj-650ov,DIRECT,1a4e959a1b50034a
3,abcnews.go.com,tremorhub.com,i79zj-650ov,DIRECT,1a4e959a1b50034a
4,abcnews.go.com,theglobeandmail.com,pub-7154879651992621,DIRECT,


In [12]:
adj_df.loc[adj_df.platform.str.contains('placeholder') & ~(adj_df.platform.isna())]

Unnamed: 0,publisher,platform,acc_id,type,tag_id
318367,darnews.com,placeholder.example.com,placeholder,DIRECT,placeholder
1001623,osceolatimes.com,placeholder.example.com,placeholder,DIRECT,placeholder
1205970,tcm.com,placeholder.example.com,placeholder,DIRECT,placeholder


In [3]:
for col in adj_df.columns:
    adj_df[col] = adj_df[col].str.strip()
adj_df['type'] = adj_df['type'].str.replace('\W+', '').str.upper()
adj_df.head(5)

  adj_df['type'] = adj_df['type'].str.replace('\W+', '').str.upper()


Unnamed: 0,publisher,platform,acc_id,type,tag_id
0,abcnews.go.com,themediagrid.com,CGBR8T,DIRECT,35d5010d7789b49d
1,abcnews.go.com,taboola.com,1184469,DIRECT,c228e6794e811952
2,abcnews.go.com,telaria.com,i79zj-650ov,DIRECT,1a4e959a1b50034a
3,abcnews.go.com,tremorhub.com,i79zj-650ov,DIRECT,1a4e959a1b50034a
4,abcnews.go.com,theglobeandmail.com,pub-7154879651992621,DIRECT,


In [115]:
#Get rid of bad rows
tidy_adj_df = adj_df.loc[~(adj_df.platform.isna()), :]
tidy_adj_df = tidy_adj_df.loc[(tidy_adj_df.type.isin(['DIRECT', 'RESELLER', 'BOTH'])) & 
                              ~(tidy_adj_df.platform.str.contains('placeholder')),:]
tidy_adj_df.platform = tidy_adj_df.platform.apply(lambda x: 'google.com' if x.__contains__('google.com') else x)
tidy_adj_df.platform = tidy_adj_df.platform.apply(lambda x: 'indexexchange.com' if x == 'indexexchange.comindexexchange.com' else x)
tidy_adj_df.head(5)

Unnamed: 0,publisher,platform,acc_id,type,tag_id
0,abcnews.go.com,themediagrid.com,CGBR8T,DIRECT,35d5010d7789b49d
1,abcnews.go.com,taboola.com,1184469,DIRECT,c228e6794e811952
2,abcnews.go.com,telaria.com,i79zj-650ov,DIRECT,1a4e959a1b50034a
3,abcnews.go.com,tremorhub.com,i79zj-650ov,DIRECT,1a4e959a1b50034a
4,abcnews.go.com,theglobeandmail.com,pub-7154879651992621,DIRECT,


In [91]:
def remove_www(x):
    if x.startswith('www') or x.startswith('ww1'):
        return x[4:]
    else:
        return x

In [101]:
def platform_name_cleaner(platform_name):
    split_name = platform_name.split(' ')
    for name in split_name:
        if name.__contains__('.'):
            cur_name = name.lower().strip()
            if cur_name.__contains__('http'):
                return remove_www(urlparse(cur_name).netloc)
            else:
                return remove_www(cur_name)
        else: 
            continue

In [116]:
tidy_adj_df.platform = tidy_adj_df.platform.apply(lambda x: platform_name_cleaner(x))

In [117]:
100*(len(adj_df) - len(tidy_adj_df))/len(tidy_adj_df) #% of rows lost
#Small enough I guess

0.8464118225831754

In [118]:
tidy_adj_df.tail(5)

Unnamed: 0,publisher,platform,acc_id,type,tag_id
1394954,zanesvilletimesrecorder.com,taboola.com,1111283,DIRECT,c228e6794e811952
1394955,zanesvilletimesrecorder.com,appnexus.com,1356,DIRECT,f5ab79cb980f11d1
1394956,zanesvilletimesrecorder.com,openx.com,538929384,DIRECT,6a698e2ec38604c6
1394957,zanesvilletimesrecorder.com,telaria.com,d3kkw-751vw,DIRECT,1a4e959a1b50034a
1394958,zanesvilletimesrecorder.com,liveintent.com,20716,DIRECT,


In [139]:
edgelist_df = tidy_adj_df.groupby(['publisher', 'platform', 'type']).count()['acc_id'].reset_index()
#edgelist_df['acc_id'] = edgelist_df['acc_id'].apply(lambda x: {"acc_id_cnt": x})
edgelist_df.head(5)

Unnamed: 0,publisher,platform,type,acc_id
0,Idahostatesman.com,1rx.io,DIRECT,2
1,Idahostatesman.com,33across.com,DIRECT,1
2,Idahostatesman.com,33across.com,RESELLER,2
3,Idahostatesman.com,ad-generation.jp,RESELLER,2
4,Idahostatesman.com,adcolony.com,RESELLER,1


In [141]:
print(len(edgelist_df))
edgelist_df = edgelist_df.loc[~(edgelist_df.publisher == edgelist_df.platform)]
print(len(edgelist_df))

182964
182961


In [142]:
#Writing it out
edgelist_df.to_csv('./data/edge_list.txt', sep=' ', index = False, header = False)

## 4. Make a networkx object

In [143]:
with open('./data/edge_list.txt', 'r') as infile:
    nx_edge_list = infile.readlines()
for ind in range(len(nx_edge_list)):
    nx_edge_list[ind] = nx_edge_list[ind].strip()
nx_edge_list[:5]

['Idahostatesman.com 1rx.io DIRECT 2',
 'Idahostatesman.com 33across.com DIRECT 1',
 'Idahostatesman.com 33across.com RESELLER 2',
 'Idahostatesman.com ad-generation.jp RESELLER 2',
 'Idahostatesman.com adcolony.com RESELLER 1']

In [122]:
graph_obj = nx.algorithms.bipartite.parse_edgelist(nx_edge_list, data=(("type", str),("weight", str),))

In [123]:
len(graph_obj.nodes)

2679

In [124]:
len(graph_obj.edges)

137041

In [125]:
len(tidy_adj_df.platform.drop_duplicates())

632

In [126]:
len(tidy_adj_df.publisher.drop_duplicates())

2051

## 5. Make the files for neo4j

I need a publishers file (with unique Ids), a platform file (with unique Ids) and a uses file with edges.

Let us ignore publisher attributes and platform attributes for now

In [144]:
edgelist_df.columns = ['pubName', 'platName', 'relType', 'idCnt']
edgelist_df.head(5)

Unnamed: 0,pubName,platName,relType,idCnt
0,Idahostatesman.com,1rx.io,DIRECT,2
1,Idahostatesman.com,33across.com,DIRECT,1
2,Idahostatesman.com,33across.com,RESELLER,2
3,Idahostatesman.com,ad-generation.jp,RESELLER,2
4,Idahostatesman.com,adcolony.com,RESELLER,1


In [145]:
publishers = edgelist_df['pubName'].drop_duplicates().reset_index()
publishers.columns = ['pubKey', 'pubName']
publishers.head(5)

Unnamed: 0,pubKey,pubName
0,0,Idahostatesman.com
1,110,PetroleumNews.com
2,111,abcnews.go.com
3,174,aberdeennews.com
4,261,abilene-rc.com


In [146]:
#Writing out publishers file
path = r"C:\Users\venki\OneDrive - The University of Texas at Dallas\Documents\Personal files\Academics\MSSDAR\Sem1\data_methods_epps_6302\Team Project\neo4j_assets"
publishers.to_csv(f"{path}/publishers.csv", sep =',', index = False)

In [147]:
platforms = edgelist_df['platName'].drop_duplicates().reset_index()
platforms.columns = ['platkey', 'platName']
platforms.head(5)

Unnamed: 0,platkey,platName
0,0,1rx.io
1,1,33across.com
2,3,ad-generation.jp
3,4,adcolony.com
4,5,admanmedia.com


In [148]:
#Writing out platforms file
path = r"C:\Users\venki\OneDrive - The University of Texas at Dallas\Documents\Personal files\Academics\MSSDAR\Sem1\data_methods_epps_6302\Team Project\neo4j_assets"
platforms.to_csv(f"{path}/platforms.csv", sep =',', index = False)

In [149]:
edgelist_df = edgelist_df.merge(publishers, left_on = 'pubName', right_on = 'pubName')
edgelist_df = edgelist_df.merge(platforms, left_on = 'platName', right_on = 'platName')

In [150]:
edgelist_df.head(5)

Unnamed: 0,pubName,platName,relType,idCnt,pubKey,platkey
0,Idahostatesman.com,1rx.io,DIRECT,2,0,0
1,amsnow.com,1rx.io,RESELLER,1,4004,0
2,bellinghamherald.com,1rx.io,DIRECT,2,11643,0
3,biography.com,1rx.io,DIRECT,2,13738,0
4,bnd.com,1rx.io,DIRECT,2,14815,0


In [151]:
edgelist_df[['pubKey', 'platkey', 'relType', 'idCnt']]

Unnamed: 0,pubKey,platkey,relType,idCnt
0,0,0,DIRECT,2
1,4004,0,RESELLER,1
2,11643,0,DIRECT,2
3,13738,0,DIRECT,2
4,14815,0,DIRECT,2
...,...,...,...,...
182956,180318,180404,DIRECT,1
182957,180318,180435,DIRECT,1
182958,181525,181526,RESELLER,1
182959,181525,181529,RESELLER,1


In [152]:
#Writing out uses file
path = r"C:\Users\venki\OneDrive - The University of Texas at Dallas\Documents\Personal files\Academics\MSSDAR\Sem1\data_methods_epps_6302\Team Project\neo4j_assets"
edgelist_df[['pubKey', 'platkey', 'relType', 'idCnt']].to_csv(f"{path}/uses.csv", sep =',', index = False)

In [153]:
publishers.loc[publishers.pubName == 'nytimes.com']

Unnamed: 0,pubKey,pubName
1442,127708,nytimes.com


In [154]:
edgelist_df.relType.unique()

array(['DIRECT', 'RESELLER'], dtype=object)