In [30]:
import pandas as pd
import newspaper
from googlesearch import search
import pdb
import tldextract
from datetime import timedelta


In [131]:
import pandas as pd
import newspaper
from googlesearch import search
import pdb
import argparse
import tldextract
from datetime import timedelta




def get_url_bias(url, corpus):
    """Look up the bias label of the news source behind ``url``.

    The URL's domain (``tldextract.extract(url).domain``) is matched as a
    literal substring against ``corpus['source_url_processed']`` and the
    ``bias`` value of the first matching row is returned.

    Args:
        url: URL of the article whose source should be looked up.
        corpus: DataFrame with at least the columns ``source_url_processed``
            and ``bias``.

    Returns:
        The ``bias`` value of the first matching corpus row, or ``'center'``
        when the URL has no extractable domain or no corpus row matches.
    """
    domain = tldextract.extract(url).domain

    # An empty needle is a substring of every string, so without this guard
    # an empty/malformed URL would receive the label of the first corpus row.
    if not domain:
        return 'center'

    # regex=False: the domain is a literal, not a pattern.
    # na=False: rows with a missing source URL count as "no match" instead of
    # producing a NaN-containing mask that cannot be used for selection.
    is_match = corpus['source_url_processed'].str.contains(domain, regex=False, na=False)
    matching_bias = corpus.loc[is_match, 'bias']

    if matching_bias.empty:
        return 'center'
    return matching_bias.values[0]
    
def get_url_fact(url, corpus):
    """Look up the factual-reporting label of the news source behind ``url``.

    The URL's domain (``tldextract.extract(url).domain``) is matched as a
    literal substring against ``corpus['source_url_processed']`` and the
    ``fact`` value of the first matching row is returned.

    Args:
        url: URL of the article whose source should be looked up.
        corpus: DataFrame with at least the columns ``source_url_processed``
            and ``fact``.

    Returns:
        The ``fact`` value of the first matching corpus row, or ``'MIXED'``
        when the URL has no extractable domain or no corpus row matches.
    """
    domain = tldextract.extract(url).domain

    # An empty needle is a substring of every string, so without this guard
    # an empty/malformed URL would receive the label of the first corpus row.
    if not domain:
        return 'MIXED'

    # regex=False: the domain is a literal, not a pattern.
    # na=False: rows with a missing source URL count as "no match" instead of
    # producing a NaN-containing mask that cannot be used for selection.
    is_match = corpus['source_url_processed'].str.contains(domain, regex=False, na=False)
    matching_fact = corpus.loc[is_match, 'fact']

    if matching_fact.empty:
        return 'MIXED'
    return matching_fact.values[0]
    
def get_search_query(url):
    """Build a date-restricted search query from an article's headline.

    The article at ``url`` is downloaded and parsed with ``newspaper``; its
    title becomes the query text. When a publication date is available the
    query is narrowed with ``before:``/``after:`` operators to a window of
    two days either side of that date.

    Args:
        url: URL of the article to build the query from.

    Returns:
        ``'<title> before:YYYY-MM-DD after:YYYY-MM-DD'`` when the article has
        a publication date, otherwise just ``'<title>'``.
    """
    article = newspaper.Article(url)
    article.download()
    article.parse()
    # The original also called article.nlp(); nothing it produces was read
    # here (only title and publish_date are used), so the call is dropped.

    query = article.title
    published = article.publish_date

    # Not every page exposes a publication date; fall back to an undated
    # query rather than failing on the timedelta arithmetic below.
    if not published:
        return query

    window = timedelta(days=2)
    # Zero-padded ISO dates (YYYY-MM-DD) for the before:/after: operators.
    latest = (published + window).strftime('%Y-%m-%d')
    earliest = (published - window).strftime('%Y-%m-%d')

    return query + ' before:' + latest + ' after:' + earliest
    

    
def get_query_results(search_query, num_results=10):
    """Run a web search and return the hits as a DataFrame.

    Args:
        search_query: Query string passed to ``googlesearch.search``.
        num_results: Maximum number of result links to retrieve (default 10).

    Returns:
        DataFrame with columns ``link``, ``domain``, ``title`` and
        ``content``. ``link`` holds the result URLs and ``domain`` the
        ``<domain>.<suffix>`` of each link; ``title`` and ``content`` are
        left unfilled (NaN).
    """
    def _registered_domain(link):
        # Parse each link once instead of once per component.
        parts = tldextract.extract(link)
        return parts.domain + '.' + parts.suffix

    # NOTE(review): in the googlesearch API that takes `num`, `num` is only the
    # page size; `stop` is what bounds the generator. Without it the list below
    # would keep paging until the search engine stops answering.
    links = list(search(search_query, num=num_results, stop=num_results))

    search_results_df = pd.DataFrame(columns=['link', 'domain', 'title', 'content'])
    search_results_df['link'] = links
    search_results_df['domain'] = search_results_df['link'].apply(_registered_domain)

    return search_results_df


def parse_params(argv=None):
    """Parse the command-line options of the source-reliability tool.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``argparse`` reads the process arguments, which is
            the original behaviour; passing a list makes the function usable
            from a notebook or a test.

    Returns:
        ``argparse.Namespace`` with a single attribute ``url`` (``str``,
        default ``''``).
    """
    parser = argparse.ArgumentParser(description='Source Reliability')
    parser.add_argument('--url', type=str, default='',
                        help='URL of the article to find alternative coverage for')
    return parser.parse_args(argv)




def main():
    """Command-line entry point: find alternative coverage for ``--url``.

    Reads the source corpus from ``data/corpus.csv``, builds a search query
    from the article at the given URL, searches the web, keeps only hits
    whose domain appears in the corpus and whose ``bias`` label does not
    contain ``'right'``, writes them to ``results/search_results.csv`` and
    returns them.

    Returns:
        dict mapping each remaining row's index to a dict of its column
        values (``DataFrame.to_dict(orient='index')``).

    Raises:
        SystemExit: if no ``--url`` was supplied.
    """
    # Local import: only this entry point needs to create the output folder.
    import os

    user_params = parse_params()
    url = user_params.url
    if not url:
        # Fail early with a clear message instead of deep inside the
        # article download/parse step.
        raise SystemExit('--url is required: pass the URL of the article to analyse')

    corpus = pd.read_csv('data/corpus.csv')

    # The bias/fact lookups of the input URL were computed here originally
    # but never used, so they are omitted.
    query = get_search_query(url)
    query_results_df = get_query_results(query).fillna('')

    # Inner join: keep only search hits whose domain is a rated corpus source.
    search_results_df = pd.merge(corpus, query_results_df,
                                 left_on='source_url_processed', right_on='domain')

    # Drop every label containing 'right' ('right', 'right-center',
    # 'extreme-right'). Replacing '-' with ' ' first (as the original did)
    # cannot change a substring test for 'right', so it is omitted.
    # na=False keeps rows with a missing label instead of breaking the mask.
    is_right_leaning = search_results_df['bias'].str.contains('right', na=False)
    search_results_df = search_results_df[~is_right_leaning]

    # to_csv does not create missing directories.
    os.makedirs('results', exist_ok=True)
    search_results_df.to_csv('results/search_results.csv')

    return search_results_df.to_dict(orient='index')


def get_alternative_links(url, corpus_path='data/corpus.csv'):
    """Find alternative coverage of the article at ``url``.

    Builds a search query from the article, searches the web, keeps only
    hits whose domain appears in the source corpus and whose ``bias`` label
    does not contain ``'right'``.

    Args:
        url: URL of the article to find alternative coverage for.
        corpus_path: Path of the corpus CSV; it must provide the columns
            ``source_url_processed`` and ``bias``. Defaults to
            ``'data/corpus.csv'``.

    Returns:
        dict mapping each remaining row's index to a dict of its column
        values (``DataFrame.to_dict(orient='index')``).
    """
    corpus = pd.read_csv(corpus_path)

    # The bias/fact lookups of the input URL were computed here originally
    # but never used, so they are omitted.
    query = get_search_query(url)
    query_results_df = get_query_results(query).fillna('')

    # Inner join: keep only search hits whose domain is a rated corpus source.
    search_results_df = pd.merge(corpus, query_results_df,
                                 left_on='source_url_processed', right_on='domain')

    # Drop every label containing 'right' ('right', 'right-center',
    # 'extreme-right'). Replacing '-' with ' ' first (as the original did)
    # cannot change a substring test for 'right', so it is omitted.
    # na=False keeps rows with a missing label instead of breaking the mask.
    is_right_leaning = search_results_df['bias'].str.contains('right', na=False)
    search_results_df = search_results_df[~is_right_leaning]

    return search_results_df.to_dict(orient='index')



In [132]:
# Run the pipeline on one sample article; the returned dict is displayed
# as the cell output. This downloads the article and performs a web search.
get_alternative_links('http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/')

{1: {'URL': 'https://mediabiasfactcheck.com/the-peoples-cube/',
  'bias': 'center',
  'content': '',
  'domain': 'thepeoplescube.com',
  'fact': 'LOW',
  'link': 'https://thepeoplescube.com/peoples-blog/america-duck-yeah-victory-merchandise-t12741.html',
  'source_url': 'http://thepeoplescube.com',
  'source_url_processed': 'thepeoplescube.com',
  'title': ''},
 4: {'URL': 'http://mediabiasfactcheck.com/cnn/',
  'bias': 'left',
  'content': '',
  'domain': 'cnn.com',
  'fact': 'MIXED',
  'link': 'http://www.cnn.com/sitemaps/sitemap-articles-2013-12.xml',
  'source_url': 'http://www.cnn.com/',
  'source_url_processed': 'cnn.com',
  'title': ''},
 5: {'URL': 'https://mediabiasfactcheck.com/ndtv/',
  'bias': 'left-center',
  'content': '',
  'domain': 'ndtv.com',
  'fact': 'HIGH',
  'link': 'http://archives.ndtv.com/articles/2014-01.html',
  'source_url': 'https://www.ndtv.com/',
  'source_url_processed': 'ndtv.com',
  'title': ''},
 6: {'URL': 'http://mediabiasfactcheck.com/world-news/',

In [102]:
# Show the corpus rows whose bias label contains 'right'.
# NOTE(review): `corpus` is not defined at notebook level in any visible cell
# (only as a local inside functions) — presumably loaded earlier with
# pd.read_csv('data/corpus.csv'); confirm this cell survives Restart & Run All.
corpus[corpus['bias'].str.replace('-', ' ').str.contains('right')]

Unnamed: 0,source_url,source_url_processed,URL,fact,bias
2,http://www.fury.news/,fury.news,http://mediabiasfactcheck.com/fury-news/,LOW,extreme-right
4,http://constitution.com/,constitution.com,http://mediabiasfactcheck.com/the-constitution/,LOW,extreme-right
5,http://freebeacon.com/,freebeacon.com,http://mediabiasfactcheck.com/washington-free-...,MIXED,right
6,http://brexitcentral.com,brexitcentral.com,https://mediabiasfactcheck.com/brexitcentral/,MIXED,right
8,https://patriotpost.us/,patriotpost.us,http://mediabiasfactcheck.com/the-patriot-post/,MIXED,right
...,...,...,...,...,...
1053,https://www.hermancain.com/,hermancain.com,https://mediabiasfactcheck.com/hermancain-com/,MIXED,right
1055,https://www.numbersusa.com/,numbersusa.com,http://mediabiasfactcheck.com/numbers-usa/,MIXED,right
1057,https://www.sbsun.com,sbsun.com,https://mediabiasfactcheck.com/san-bernardino-...,HIGH,right-center
1059,http://allenwestrepublic.com/,allenwestrepublic.com,http://mediabiasfactcheck.com/allen-west-repub...,LOW,extreme-right


In [107]:
# Show the merged search results whose bias label contains 'right',
# i.e. the rows that the pipeline filters out.
# NOTE(review): `search_results_df` is not defined at notebook level in any
# visible cell (only as a local inside functions) — presumably left over from
# an earlier interactive run; confirm this cell survives Restart & Run All.
search_results_df[search_results_df['bias'].str.replace('-', ' ').str.contains('right')]

Unnamed: 0,source_url,source_url_processed,URL,fact,bias,link,domain,title,content
0,http://triblive.com/,triblive.com,http://mediabiasfactcheck.com/pittsburgh-tribu...,HIGH,right-center,https://archive.triblive.com/wp-content/themes...,triblive.com,,
2,http://www.dailymail.co.uk/ushome/index.html,dailymail.co.uk,http://mediabiasfactcheck.com/daily-mail/,MIXED,right,https://www.dailymail.co.uk/sitemap-articles-d...,dailymail.co.uk,,
7,http://www.unz.com/,unz.com,https://mediabiasfactcheck.com/the-unz-report/,LOW,extreme-right,http://www.unz.com/sbpdl/portlandia-vs-wire-in...,unz.com,,
