In [164]:
import pandas as pd
import newspaper
from googlesearch import search
import pdb
import tldextract
from newspaper import Article
from datetime import timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [169]:
import pandas as pd
import newspaper
from googlesearch import search
import pdb
import argparse
import tldextract
from datetime import timedelta




def get_url_bias(url, corpus):
    
    try:
        return corpus.loc[corpus['source_url_processed'].str.contains(tldextract.extract(url).domain)]['bias'].values[0]
    except IndexError:
        return 'center'
    
def get_url_fact(url, corpus):
    try:
        return corpus.loc[corpus['source_url_processed'].str.contains(tldextract.extract(url).domain)]['fact'].values[0]
    except IndexError:
        return 'MIXED'
    
def get_search_query(url):

    article = newspaper.Article(url)

    article.download()
    article.parse()
    article.nlp()

    query             = article.title
    
    try:
        date_before       = article.publish_date + timedelta(days=2)
        date_after        = article.publish_date - timedelta(days=2)

        query_time_before = str(date_before.year) +\
                            '-' + str(date_before.month) +\
                            '-' + str(date_before.day)

        query_time_after = str(date_after.year) +\
                            '-' + str(date_after.month) +\
                            '-' + str(date_after.day)
        
        query = query + ' before:' + query_time_before + ' after:' + query_time_after
    
    except TypeError:
        
        print('Date for the article not available. Finding other articles across all times')
    
    
        
    return query
    

    
def get_query_results(search_query):
    
    alt_article_lists = [i for i in search(search_query, num = 10)]
    search_results_df = pd.DataFrame(columns = ['link', 'domain', 'title', 'content'])

    search_results_df['link'] = alt_article_lists
    search_results_df['domain'] = search_results_df['link'].apply(lambda x: tldextract.extract(x).domain\
                                                                             + '.' + tldextract.extract(x).suffix)
    
    return search_results_df


def parse_params():

    parser = argparse.ArgumentParser(description='Source Reliability')
    parser.add_argument('--url',             type=str, default='')
    params = parser.parse_args()
    return params




def main():

    
    user_params = parse_params()
    url         = user_params.url
    corpus      = pd.read_csv('data/corpus.csv')

    url_bias      = get_url_bias(url, corpus)
    url_fact      = get_url_fact(url, corpus)
    
    query         = get_search_query(url)
    query_results_df = get_query_results(query)
    query_results_df = query_results_df.fillna('')


    search_results_df = pd.merge(corpus, query_results_df, left_on = 'source_url_processed', right_on = 'domain')
    search_results_df = search_results_df[~search_results_df['bias'].str.replace('-', ' ').str.contains('right')]
    
    search_results_df.to_csv('results/' + query + '.csv')

    return search_results_df.to_dict(orient = 'index')


def get_alternative_links(url):

    corpus_url   = 'https://raw.githubusercontent.com/Omairss/BeyondBias/master/data/corpus.csv'
    s            = requests.get(corpus_url).content

    corpus       = pd.read_csv(io.StringIO(s.decode('utf-8')))

    url_bias      = get_url_bias(url, corpus)
    url_fact      = get_url_fact(url, corpus)
    
    query         = get_search_query(url)
    query_results_df = get_query_results(query)
    query_results_df = query_results_df.fillna('')


    search_results_df = pd.merge(corpus, query_results_df, left_on = 'source_url_processed', right_on = 'domain')
    search_results_df = search_results_df[~search_results_df['bias'].str.replace('-', ' ').str.contains('right')]
    

    return search_results_df




In [155]:
def getCosine(body_1, body_2, title_1, title_2):
    
    bodies = (body_1, body_2)
    titles = (title_1, title_2)
    
    tfidf_vectorizer = TfidfVectorizer()
    
    tfidf_matrix_bodies = tfidf_vectorizer.fit_transform(bodies)
    tfidf_matrix_titles = tfidf_vectorizer.fit_transform(titles)
    
    cosine_similarity_bodies=(cosine_similarity(tfidf_matrix_bodies[0:1], tfidf_matrix_bodies[1]))[0][0]
    cosine_similarity_titles=(cosine_similarity(tfidf_matrix_titles[0:1], tfidf_matrix_titles[1]))[0][0]
    
    return 0.5*(cosine_similarity_bodies+cosine_similarity_titles)


def getSimilarity(url1, url2):
    
    #url1 = 'https://www.newyorker.com/books/page-turner/how-jane-vonnegut-made-kurt-vonnegut-a-writer'
    #url2 = 'https://www.nytimes.com/2007/04/12/books/12vonnegut.html'
    
    article1 = Article(url1)
    article2 = Article(url2)
    
    article1.download()
    article2.download()
    
    article1.parse()
    article2.parse()
    
    body1=article1.text
    body2=article2.text
    
    title1=article1.title
    title2=article2.title
    
    return getCosine(body1, body2, title1, title2)

In [151]:
search_results_df = get_alternative_links('https://www.breitbart.com/news/turkey-denies-blocking-retreat-of-kurdish-forces-in-syria-official/')

Date for the article not available. Finding other articles across all times


In [156]:
getSimilarity('https://www.breitbart.com/news/turkey-denies-blocking-retreat-of-kurdish-forces-in-syria-official/',
              'https://www.nytimes.com/2007/04/12/books/12vonnegut.html')

0.10977297172269668

In [157]:
url = 'https://www.breitbart.com/news/turkey-denies-blocking-retreat-of-kurdish-forces-in-syria-official/'
search_results_df['cosine_similarity'] = search_results_df['link'].apply(lambda x: getSimilarity(x, url))

In [158]:
search_results_df

Unnamed: 0,source_url,source_url_processed,URL,fact,bias,link,domain,title,content,cosine_similarity
1,https://www.japantimes.co.jp/,japantimes.co.jp,https://mediabiasfactcheck.com/japan-times/,HIGH,center,https://www.japantimes.co.jp/news/2019/10/18/w...,japantimes.co.jp,,,0.216166
2,http://www.aljazeera.com/,aljazeera.com,http://mediabiasfactcheck.com/al-jazeera/,HIGH,left-center,https://www.aljazeera.com/news/2019/10/turkey-...,aljazeera.com,,,0.277109
3,http://www.timesofisrael.com/,timesofisrael.com,http://mediabiasfactcheck.com/times-of-israel/,HIGH,left-center,https://www.timesofisrael.com/turkey-starts-bo...,timesofisrael.com,,,0.202082
4,https://www.bloomberg.com/,bloomberg.com,http://mediabiasfactcheck.com/bloomberg/,HIGH,left-center,https://www.bloomberg.com/news/articles/2019-1...,bloomberg.com,,,0.016945
5,http://www.news24.com/,news24.com,http://mediabiasfactcheck.com/news24-south-afr...,HIGH,center,https://www.news24.com/World/News/trump-defend...,news24.com,,,0.233539
8,http://www.pri.org/,pri.org,http://mediabiasfactcheck.com/public-radio-int...,HIGH,left-center,https://www.pri.org/stories/2016-08-24/turkey-...,pri.org,,,0.214544
9,http://www.cnn.com/,cnn.com,http://mediabiasfactcheck.com/cnn/,MIXED,left,https://www.cnn.com/2019/10/12/middleeast/turk...,cnn.com,,,0.160726


In [161]:
list(search_results_df['link'])

['https://www.japantimes.co.jp/news/2019/10/18/world/trump-trumpets-turkish-cease-fire-kurd-ex-allies-must-vacate-syria-border-area-allowing-ankara-solidify-gains/',
 'https://www.aljazeera.com/news/2019/10/turkey-military-operation-syria-latest-updates-191017051518215.html',
 'https://www.timesofisrael.com/turkey-starts-bombing-kurds-in-syria-as-us-pulls-out-report/',
 'https://www.bloomberg.com/news/articles/2019-10-17/trump-s-haphazard-syria-deal-leaves-erdogan-with-long-sought-win',
 'https://www.news24.com/World/News/trump-defends-syria-pullout-denies-giving-turkey-green-light-for-invasion-20191016',
 'https://www.pri.org/stories/2016-08-24/turkey-fighting-isis-syria-and-blocking-us-backed-kurds',
 'https://www.cnn.com/2019/10/12/middleeast/turkey-syria-offensive-intl/index.html']

In [167]:
search_results_df['score'] = search_results_df['cosine_similarity']/np.log(np.array(search_results_df.index) + 1)

In [168]:
search_results_df['score']

1    0.311862
2    0.252235
3    0.145772
4    0.010529
5    0.130340
8    0.097643
9    0.069802
Name: score, dtype: float64