In [1]:
import pandas as pd
from dataprep.load_annotated_data import load_corpus

# Load the annotated corpus, then report how many rows it has and show the
# first row so the record layout is visible.
data = load_corpus('modified_corpus.csv')
n_rows = len(data)
print(n_rows)
first_row = data[0]
print(first_row)

1066
OrderedDict([('source_url', 'http://www.villagevoice.com/'), ('source_url_processed', 'villagevoice.com'), ('URL', 'http://mediabiasfactcheck.com/the-village-voice/'), ('fact', 'HIGH'), ('bias', 'left')])


In [2]:
from dataprep.alexa_scrapper import ScrapeAlexa
from tqdm import tqdm

# Collect the Alexa site info for every corpus row, keyed by the row's
# 'source_url_processed' value.
results = {}
for row in tqdm(data):
    domain = row['source_url_processed']
    # Some domains appear in more than one corpus row. Scraping them again would
    # only overwrite the same key, so fetch each distinct domain once and skip
    # repeats (the first fetched result is the one kept).
    if domain in results:
        continue
    results[domain] = ScrapeAlexa(domain).scrape_alexa_site_info()

100%|██████████| 1066/1066 [02:50<00:00,  6.25it/s]


In [3]:
# Spot-check the scraped record for one site to see the structure of a result.
sample_site = 'villagevoice.com'
results[sample_site]

{'site': 'villagevoice.com',
 'score': [{'url': 'nylon.com', 'overlap_score': 4.1, 'alexa_rank': 51694.0},
  {'url': 'slantmagazine.com', 'overlap_score': 2.5, 'alexa_rank': 92418.0},
  {'url': 'gothamist.com', 'overlap_score': 2.4, 'alexa_rank': 16813.0},
  {'url': 'screendaily.com', 'overlap_score': 2.1, 'alexa_rank': 110772.0},
  {'url': 'amny.com', 'overlap_score': 2.1, 'alexa_rank': 69192.0}],
 'audience_geography': [{'country': 'United States', 'percent': 77.2}]}

In [4]:
# Tabulate the scrape results: each key of `results` becomes a row label and
# each field of the per-site record becomes a column.
res_df = pd.DataFrame.from_dict(data=results, orient='index')

# Preview the first five rows.
res_df.head(5)

Unnamed: 0,site,score,audience_geography
villagevoice.com,villagevoice.com,"[{'url': 'nylon.com', 'overlap_score': 4.1, 'a...","[{'country': 'United States', 'percent': 77.2}]"
insideclimatenews.org,insideclimatenews.org,"[{'url': 'wri.org', 'overlap_score': 12.8, 'al...","[{'country': 'United States', 'percent': 69.0}..."
fury.news,fury.news,"[{'url': 'settingrecordstraight.blogspot.com',...",[]
now8news.com,now8news.com,"[{'url': 'donatebits.com', 'overlap_score': 11...",[]
constitution.com,constitution.com,"[{'url': 'themilreview.com', 'overlap_score': ...",[]


In [5]:
# Number of records whose 'score' value is null, i.e. missing entirely.
# NOTE: isnull() does not flag empty lists, so records that came back with an
# empty 'score' list are NOT counted here.

res_df['score'].isnull().sum()

0

In [6]:
# Show the rows whose 'score' value is null (missing entirely).
score_is_null = res_df['score'].isnull()
res_df.loc[score_is_null]

Unnamed: 0,site,score,audience_geography


In [7]:
# Distribution of the number of entries in each record's 'score' list;
# a falsy value (e.g. an empty list) is counted as 0 entries.
entries_per_record = res_df['score'].apply(lambda entries: len(entries) if entries else 0)
entries_per_record.value_counts()

5    939
0     63
1     18
2     16
4     14
3     12
Name: score, dtype: int64

In [8]:
# Records whose 'score' list holds exactly one entry (falsy values count as 0).
score_len = res_df['score'].apply(lambda entries: len(entries) if entries else 0)
res_df[score_len == 1]


Unnamed: 0,site,score,audience_geography
loser.com,loser.com,"[{'url': 'theamericanscoop.com', 'overlap_scor...",[]
gawken.com,gawken.com,"[{'url': 'canipunchnazis.com', 'overlap_score'...",[]
brokenworldnews.com,brokenworldnews.com,"[{'url': 'roman-catholic.com', 'overlap_score'...",[]
harvardpolitics.com,harvardpolitics.com,"[{'url': 'thecrimson.com', 'overlap_score': 1....",[]
viralcords.com,viralcords.com,"[{'url': 'fakenewscodex.com', 'overlap_score':...",[]
progressivestoday.com,progressivestoday.com,"[{'url': 'libertynewsdaily.com', 'overlap_scor...",[]
deepstatenation.com,deepstatenation.com,"[{'url': 'deepstatenation.news', 'overlap_scor...",[]
conservativetoday.com,conservativetoday.com,"[{'url': 'according2hiphop.com', 'overlap_scor...",[]
londonwebnews.com,londonwebnews.com,"[{'url': 'americablog.com', 'overlap_score': 1...",[]
modernliberals.com,modernliberals.com,"[{'url': 'politicalgarbagechute.com', 'overlap...",[]


In [9]:
# Records whose 'score' list holds exactly two entries (falsy values count as 0).
score_len = res_df['score'].apply(lambda entries: len(entries) if entries else 0)
res_df[score_len == 2]


Unnamed: 0,site,score,audience_geography
liberaldarkness.com,liberaldarkness.com,"[{'url': 'grubwear.com', 'overlap_score': 4.9,...",[]
deepleftfield.info,deepleftfield.info,"[{'url': 'politicalflare.com', 'overlap_score'...","[{'country': 'United States', 'percent': 96.8}]"
newsuptoday.com,newsuptoday.com,"[{'url': 'theconservativeteam.com', 'overlap_s...",[]
dailydems.com,dailydems.com,"[{'url': 'resistancegenealogy.com', 'overlap_s...",[]
addictinginfo.org,addictinginfo.org,"[{'url': 'liberaltruthnow.blogspot.com', 'over...",[]
surenews.com,surenews.com,"[{'url': 'bustedlocals.com', 'overlap_score': ...",[]
mbganews.com,mbganews.com,"[{'url': 'peoplescharter.org', 'overlap_score'...",[]
globetoday.com,globetoday.com,"[{'url': 'petsite.net', 'overlap_score': 4.4, ...",[]
syriana-analysis.com,syriana-analysis.com,"[{'url': 'ingaza.wordpress.com', 'overlap_scor...",[]
clickhole.com,clickhole.com,"[{'url': 'theonion.com', 'overlap_score': 2.7,...","[{'country': 'United States', 'percent': 67.9}]"


In [10]:
# Records whose 'score' list holds exactly three entries (falsy values count as 0).
score_len = res_df['score'].apply(lambda entries: len(entries) if entries else 0)
res_df[score_len == 3]

Unnamed: 0,site,score,audience_geography
aceflashman.wordpress.com,aceflashman.wordpress.com,"[{'url': 'alisonangel.com', 'overlap_score': 3...",[]
newsbreakshere.com,newsbreakshere.com,"[{'url': 'moroccanews.com', 'overlap_score': 7...",[]
pacificresearch.org,pacificresearch.org,"[{'url': 'notmytollroad.com', 'overlap_score':...","[{'country': 'United States', 'percent': 48.0}..."
trumpservativenews.info,trumpservativenews.info,"[{'url': 'patriotsforamerica.org', 'overlap_sc...",[]
fairobserver.com,fairobserver.com,"[{'url': 'fpif.org', 'overlap_score': 1.4, 'al...","[{'country': 'India', 'percent': 41.5}, {'coun..."
newromantimes.com,newromantimes.com,"[{'url': 'physicsofsex.blogspot.com', 'overlap...",[]
extranewsfeed.com,extranewsfeed.com,"[{'url': 'rlslawyers.com', 'overlap_score': 1....","[{'country': 'United States', 'percent': 86.4}]"
ibleedredwhiteblue.com,ibleedredwhiteblue.com,"[{'url': 'republicmainstreet.wordpress.com', '...",[]
americantruthseekers.com,americantruthseekers.com,"[{'url': 'irumormill.com', 'overlap_score': 1....",[]
newcenturytimes.com,newcenturytimes.com,"[{'url': 'nct.news', 'overlap_score': 9.6, 'al...",[]


# Save results from scraping level 0

In [11]:
import json
import os
from pathlib import Path

# Directory that receives the level-0 scrape dump. It defaults to the original
# location; set the SCRAPPING_RESULTS_DIR environment variable to write
# elsewhere (e.g. when running on another machine).
results_dir = Path(os.environ.get(
    "SCRAPPING_RESULTS_DIR",
    "/home/paco/Documents/site_similarity/data/scrapping_results",
))
# Create the directory if it is missing so the write below cannot fail on a
# fresh checkout.
results_dir.mkdir(parents=True, exist_ok=True)

# Persist the raw per-site results as pretty-printed JSON.
with open(results_dir / "level 0 results.json", 'w') as f:
    json.dump(results, f, indent=4)
