# Importing Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import pandas as pd
import numpy as np
import json
import time
from time import sleep

# Importing Source Reliability Data

In [2]:
# Load the source-reliability data. The encoding is pinned to UTF-8 so the
# file is decoded the same way regardless of the platform's default encoding.
with open('Dataset/MBFC/factuality.json', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
def reliability(dict, key):
    """Look up `key` in the mapping `dict`.

    Returns the stored value when the key is present, and the string
    'UNKNOWN' when it is not.
    """
    return dict.get(key, 'UNKNOWN')

In [4]:
# Index the reliability ratings by the first label of each source key (the
# text before its first '.'), which is the form the link lookup searches for.
# NOTE(review): the name `dict` shadows the builtin; it is kept because other
# cells read this global by that name.
dict = {}
for p in data:
    # A key without any '.' is stored whole. Cutting it at position 0 would
    # file every such entry under the empty string, each overwriting the last.
    dict[p.split('.', 1)[0]] = data[p]

In [5]:
# Domains that must be removed from the search results: each one is
# registered in the lookup table with the sentinel rating 'FALSE'.
rem = [
    'translate',
    'pinterest',
    'shutterstock',
    'linkedin',
    'merriam-webster',
    'amazon',
    'unsplash',
    'facebook',
    'myntra',
    'dictionary',
    'youtube',
    'flipkart',
    'developer',
    'twitter',
    'webcache',
    'reddit',
    'britannica',
]

dict.update((dom, 'FALSE') for dom in rem)

In [6]:
def reliability_from_link(link):
    """Return the reliability rating recorded for the site hosting `link`.

    The host part of the URL is isolated first; its first and then its second
    dot-separated label are looked up in the module-level `dict` table, so
    that 'nytimes.com' and 'www.nytimes.com' both resolve through 'nytimes'.

    Parameters
    ----------
    link : any
        URL to rate; it is converted with ``str`` before being examined.

    Returns
    -------
    The value stored in `dict` for the first label that is present, or the
    string 'UNKNOWN' when neither label is found.
    """
    url = str(link)

    # Split the scheme off at its '://' separator rather than by a fixed
    # width, so links with another scheme (or none) are not mis-sliced.
    scheme, sep, rest = url.partition('://')
    if sep and scheme.isalpha():
        url = rest

    # Keep only the host: stop at the path, query or fragment so that dots
    # appearing later in the URL are never mistaken for label separators.
    host = url
    for stop in '/?#':
        host = host.split(stop, 1)[0]
    host = host.rpartition('@')[2]   # drop any 'user:password@' prefix
    host = host.split(':', 1)[0]     # drop any ':port' suffix

    # Only the first two labels are consulted.
    for label in host.split('.')[:2]:
        if not label:
            # An empty label can never name a source.
            continue
        rating = reliability(dict, label)
        if rating != 'UNKNOWN':
            return rating

    return 'UNKNOWN'

# Importing Dataset

In [7]:
# Load the main article table from CSV and preview its first rows.
# NOTE(review): this rebinds `data`, which earlier held the parsed
# factuality JSON.
data = pd.read_csv('Dataset/main/all_data.csv')

data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,author,comments,country,crawled,domain_rank,id,language,likes,...,fear,joy,sadness,surprise,trust,negative,positive,first_all,second_all,third_all
0,1,,JEREMY W. PETERS,0.0,US,2017-03-14 08:25:04,0,3.0,english,0.0,...,6,20,5,14,30,14,52,409,150,45
1,2,,STEVE EDER,0.0,US,2017-03-14 08:25:36,0,4.0,english,0.0,...,4,4,4,5,9,8,15,145,38,28
2,3,,MAGGIE HABERMAN ASHLEY PARKER,0.0,US,2017-03-14 08:25:36,0,5.0,english,0.0,...,8,15,8,6,26,15,34,440,168,68
3,4,,NELSON D. SCHWARTZ SUI-LEE WEE,0.0,US,2017-03-14 08:25:36,0,6.0,english,0.0,...,10,10,10,6,32,24,43,592,136,71
4,5,,MAGGIE HABERMAN,0.0,US,2017-03-14 08:25:37,0,7.0,english,0.0,...,3,6,2,4,14,4,25,223,82,28


In [8]:
# Take the column at position 10 of the table as an array; the values printed
# in the next cell are image URLs.
# NOTE(review): this is a positional index - confirm it still points at the
# image-URL column if the CSV layout changes.
main_img_url = data.iloc[:,10].values

In [9]:
# Show the image URLs as a sanity check on the column selected above.
print(main_img_url)

['https://static01.nyt.com/images/2016/11/23/us/23spectacle1/23spectacle1-master768.jpg'
 'https://static01.nyt.com/images/2016/10/18/us/18fd-trumpfoundation/18fd-trumpfoundation-master675.jpg'
 'https://static01.nyt.com/images/2016/11/12/us/12tower1/12tower1-master768.jpg'
 ... 'https://t4.rbxcdn.com/c5695e5f087535e2066dc473e03b1819'
 'https://t4.rbxcdn.com/c5695e5f087535e2066dc473e03b1819'
 'https://t4.rbxcdn.com/c5695e5f087535e2066dc473e03b1819']


# Extraction

In [10]:
def url_encoding(img_url):
    """Percent-encode `img_url` so it can be embedded in another URL.

    Every reserved character except '/' is escaped (for example ':' becomes
    '%3A' and '&' becomes '%26').
    """
    encoded = urllib.parse.quote(img_url, safe='/')
    return encoded

In [11]:
def is_valid_link(link):
    """Decide whether a search-result link should be kept.

    A link is rejected when its text form is shorter than 13 characters,
    when it does not begin with 'http', or when `reliability_from_link`
    rates it 'FALSE'. Anything else is accepted.
    """
    candidate = str(link)

    # Cheap textual checks come first; the reliability lookup only runs for
    # links that pass them.
    if len(candidate) < 13 or not candidate.startswith('http'):
        return False

    return reliability_from_link(candidate) != 'FALSE'

In [12]:
def comp(val):
    """Sort key: return the 'rel' entry of the record `val`."""
    rank = val['rel']
    return rank

In [13]:
def get_filtered_links(links):
    """Drop invalid links and order the rest by reliability rank.

    Links rejected by `is_valid_link` are removed. The remaining ones are
    ranked from their `reliability_from_link` rating - 'UNKNOWN' ranks 3,
    'MIXED' ranks 2, any other rating ranks 1 - and returned in ascending
    rank order. Links with the same rank keep their incoming order.
    """
    def rank_of(link):
        rating = reliability_from_link(link)
        if rating == 'UNKNOWN':
            return 3
        if rating == 'MIXED':
            return 2
        return 1

    kept = [link for link in links if is_valid_link(link)]

    # sorted() is stable, so equally ranked links stay in search order.
    return sorted(kept, key=rank_of)

In [14]:
def get_complete_image_url(host_url,image_link):
    """Turn an <img> `src` value into an absolute URL, or '' if unusable.

    Parameters
    ----------
    host_url : str
        URL of the page the image was found on.
    image_link : str
        Raw `src` value taken from that page.

    Returns
    -------
    str
        * protocol-relative links ('//host/path') get the page's scheme;
        * root-relative links ('/path') get the page's scheme and host;
        * links of at least 13 characters starting with 'http' are returned
          unchanged;
        * everything else (including other relative links) yields ''.
    """
    # The page is treated as https unless it explicitly starts with 'http:'.
    if host_url[0:5] == 'http:':
        pref = 'http://'
    else:
        pref = 'https://'

    # Host = everything after the scheme up to the first '/', '?' or '#'.
    # When none of them is present (e.g. 'https://example.com') the whole
    # remainder is the host; leaving it empty would turn root-relative
    # images into 'https:///path'.
    dom = host_url[len(pref):]
    for stop in '/?#':
        dom = dom.split(stop, 1)[0]

    if len(image_link) < 2:
        return ''
    if image_link[0:2] == '//':
        return pref + image_link[2:]
    if image_link[0] == '/':
        return pref + dom + image_link
    if len(image_link) < 13 or image_link[0:4] != 'http':
        return ''
    return image_link

In [15]:
def _empty_sources(source='NA', text='NA'):
    """Build four independent placeholder records ``[source, text, []]``."""
    return [[source, text, []] for _ in range(4)]


def _fetch_page(url):
    """Download `url` and parse it as HTML; return None if the request fails."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
    }

    try:
        response = requests.get(url, headers=headers, timeout=20)
    except Exception:
        # Scraping is best-effort: any request failure just skips this page.
        # Catching Exception (not a bare except) still lets
        # KeyboardInterrupt/SystemExit stop a long-running batch.
        sleep(3)
        return None

    return BeautifulSoup(response.content, 'html.parser')


def _extract_source(url):
    """Return ``[url, text, image_urls]`` for a usable page, otherwise None.

    A page is usable when it can be fetched, its <p> elements contain at
    least 30 words in total, and at least one <img> yields a usable URL.
    """
    page = _fetch_page(url)
    if page is None:
        return None

    # Text of all paragraphs on the page, each followed by a space.
    extracted_text = ''.join(p.get_text() + ' ' for p in page.find_all('p'))
    if len(extracted_text.split()) < 30:
        return None

    # Absolute URLs of the images on the page.
    src_img_url = []
    for image in page.find_all('img'):
        src = image.get('src')
        if src is None:
            continue
        comp_image_url = get_complete_image_url(url, src)
        if comp_image_url != '':
            src_img_url.append(comp_image_url)

    if len(src_img_url) == 0:
        return None

    return [url, extracted_text, src_img_url]


def get_info(img_url):
    """Reverse-search an image and collect up to four pages that contain it.

    Parameters
    ----------
    img_url : any
        URL of the image; it is converted with ``str`` first, so missing
        values such as NaN are handled as unusable input.

    Returns
    -------
    (num, records)
        `num` is how many source pages were collected (0 to 4). `records`
        always has four entries of the form ``[page_url, text, image_urls]``;
        unused slots are ``['NA', 'NA', []]``. If the search request itself
        fails, all four slots are ``[search_url, 'Not Responding', []]``.
    """
    img_url = str(img_url)

    if len(img_url) < 13 or img_url[0:4] != 'http':
        return 0, _empty_sources()

    # Images served from this host are never searched.
    if img_url[:22] == 'https://t4.rbxcdn.com/':
        return 0, _empty_sources()

    google_search_url = 'https://www.google.com/searchbyimage?&image_url=' + url_encoding(img_url)

    results_page = _fetch_page(google_search_url)
    if results_page is None:
        return 0, _empty_sources(google_search_url, 'Not Responding')

    # Collect every href inside the '#search' element, then filter and rank.
    links = []
    for anchor in results_page.select('#search a'):
        href = anchor.get('href')
        if href is not None:
            links.append(href)

    links = get_filtered_links(links)

    res = []
    for url in links:
        record = _extract_source(url)
        if record is None:
            continue
        res.append(record)
        if len(res) == 4:
            break

    num = len(res)

    # Pad to exactly four records.
    res.extend(_empty_sources()[num:])

    return num, res

In [16]:
# Accumulators filled by run(); entry k of each list belongs to the same image.
extracted_info = []
numSources = []
img_url = []

def run(fir,las):
    """Process images `fir` .. `las - 1` of `main_img_url`.

    For each index the result of `get_info` is appended to the module-level
    lists `numSources`, `extracted_info` and `img_url`, and the elapsed time
    so far is printed. After the loop the average time per query is printed;
    that line is skipped when the range is empty, since there is nothing to
    average over.
    """
    t0 = time.time()

    for i in range(fir,las):

        num , temp_info = get_info(main_img_url[i])
        numSources.append(num)
        extracted_info.append(temp_info)
        img_url.append(main_img_url[i])

        print(i, 'Time elapsed:',time.time()-t0,'sec')

    processed = las - fir
    if processed > 0:
        print('Average time per query:',(time.time()-t0)/processed,'seconds.')

In [17]:
# Number of image URLs available; this bounds the index range given to run().
print(len(main_img_url))

20015


In [18]:
# Process indices 16000 through 20014 (the end index is exclusive).
run(16000,20015)

16000 Time elapsed: 0.0 sec
16001 Time elapsed: 0.0 sec
16002 Time elapsed: 0.0009975433349609375 sec
16003 Time elapsed: 1.1175873279571533 sec
16004 Time elapsed: 10.499725580215454 sec
16005 Time elapsed: 12.666152238845825 sec
16006 Time elapsed: 14.48758339881897 sec
16007 Time elapsed: 15.674888610839844 sec
16008 Time elapsed: 23.13892149925232 sec
16009 Time elapsed: 29.959431648254395 sec
16010 Time elapsed: 44.21314716339111 sec
16011 Time elapsed: 44.228768825531006 sec
16012 Time elapsed: 50.20737600326538 sec
16013 Time elapsed: 56.02313947677612 sec
16014 Time elapsed: 79.80955815315247 sec
16015 Time elapsed: 86.55744290351868 sec
16016 Time elapsed: 105.98863911628723 sec
16017 Time elapsed: 112.01786231994629 sec
16018 Time elapsed: 133.99289178848267 sec
16019 Time elapsed: 140.6743311882019 sec
16020 Time elapsed: 142.08167171478271 sec
16021 Time elapsed: 143.523934841156 sec
16022 Time elapsed: 143.523934841156 sec
16023 Time elapsed: 154.5952274799347 sec
16024 Ti

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


16120 Time elapsed: 1009.8234710693359 sec
16121 Time elapsed: 1018.1751754283905 sec
16122 Time elapsed: 1026.869404554367 sec
16123 Time elapsed: 1039.7677323818207 sec
16124 Time elapsed: 1053.181527376175 sec
16125 Time elapsed: 1067.0465984344482 sec
16126 Time elapsed: 1079.6136147975922 sec
16127 Time elapsed: 1085.8519990444183 sec
16128 Time elapsed: 1099.6525247097015 sec
16129 Time elapsed: 1108.1811101436615 sec
16130 Time elapsed: 1117.2066791057587 sec
16131 Time elapsed: 1135.4210810661316 sec
16132 Time elapsed: 1142.0963158607483 sec
16133 Time elapsed: 1155.8859968185425 sec
16134 Time elapsed: 1166.5121092796326 sec
16135 Time elapsed: 1179.64844083786 sec
16136 Time elapsed: 1191.8646340370178 sec
16137 Time elapsed: 1201.8781690597534 sec
16138 Time elapsed: 1214.5200669765472 sec
16139 Time elapsed: 1227.3682849407196 sec
16140 Time elapsed: 1242.4664821624756 sec
16141 Time elapsed: 1253.7742719650269 sec
16142 Time elapsed: 1264.605174779892 sec
16143 Time elaps

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


16146 Time elapsed: 1432.2014853954315 sec
16147 Time elapsed: 1443.5567953586578 sec
16148 Time elapsed: 1459.6228301525116 sec
16149 Time elapsed: 1471.3775372505188 sec
16150 Time elapsed: 1471.3775372505188 sec
16151 Time elapsed: 1471.3775372505188 sec
16152 Time elapsed: 1471.3775372505188 sec
16153 Time elapsed: 1471.3775372505188 sec
16154 Time elapsed: 1471.3775372505188 sec
16155 Time elapsed: 1471.3775372505188 sec
16156 Time elapsed: 1471.3775372505188 sec
16157 Time elapsed: 1471.3775372505188 sec
16158 Time elapsed: 1471.3775372505188 sec
16159 Time elapsed: 1471.3775372505188 sec
16160 Time elapsed: 1471.3775372505188 sec
16161 Time elapsed: 1471.3775372505188 sec
16162 Time elapsed: 1471.3775372505188 sec
16163 Time elapsed: 1471.3775372505188 sec
16164 Time elapsed: 1471.3775372505188 sec
16165 Time elapsed: 1471.3775372505188 sec
16166 Time elapsed: 1471.3775372505188 sec
16167 Time elapsed: 1471.3775372505188 sec
16168 Time elapsed: 1471.3775372505188 sec
16169 Time 

16338 Time elapsed: 2966.2003321647644 sec
16339 Time elapsed: 2979.5299682617188 sec
16340 Time elapsed: 2989.248808145523 sec
16341 Time elapsed: 2998.4723522663116 sec
16342 Time elapsed: 3011.976080417633 sec
16343 Time elapsed: 3020.3829073905945 sec
16344 Time elapsed: 3027.239804983139 sec
16345 Time elapsed: 3039.8076899051666 sec
16346 Time elapsed: 3046.5119111537933 sec
16347 Time elapsed: 3064.589466571808 sec
16348 Time elapsed: 3081.0784919261932 sec
16349 Time elapsed: 3100.890141248703 sec
16350 Time elapsed: 3100.890141248703 sec
16351 Time elapsed: 3100.890141248703 sec
16352 Time elapsed: 3100.890141248703 sec
16353 Time elapsed: 3100.891187429428 sec
16354 Time elapsed: 3100.891187429428 sec
16355 Time elapsed: 3100.891187429428 sec
16356 Time elapsed: 3100.891187429428 sec
16357 Time elapsed: 3100.891187429428 sec
16358 Time elapsed: 3100.891187429428 sec
16359 Time elapsed: 3100.891187429428 sec
16360 Time elapsed: 3100.891187429428 sec
16361 Time elapsed: 3100.89

16534 Time elapsed: 3975.466037273407 sec
16535 Time elapsed: 3975.466037273407 sec
16536 Time elapsed: 3989.134454727173 sec
16537 Time elapsed: 3989.134454727173 sec
16538 Time elapsed: 3989.134454727173 sec
16539 Time elapsed: 3991.2345395088196 sec
16540 Time elapsed: 3993.045386314392 sec
16541 Time elapsed: 3999.85404253006 sec
16542 Time elapsed: 4026.6549863815308 sec
16543 Time elapsed: 4053.576151371002 sec
16544 Time elapsed: 4056.7974157333374 sec
16545 Time elapsed: 4067.9034893512726 sec
16546 Time elapsed: 4075.5837457180023 sec
16547 Time elapsed: 4081.6650354862213 sec
16548 Time elapsed: 4094.89488863945 sec
16549 Time elapsed: 4109.285603523254 sec
16550 Time elapsed: 4121.209797143936 sec
16551 Time elapsed: 4126.641540527344 sec
16552 Time elapsed: 4138.2480800151825 sec
16553 Time elapsed: 4153.997625112534 sec
16554 Time elapsed: 4173.8414986133575 sec
16555 Time elapsed: 4186.4951157569885 sec
16556 Time elapsed: 4199.409328699112 sec
16557 Time elapsed: 4214.72

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


16569 Time elapsed: 4713.31325340271 sec
16570 Time elapsed: 4729.239736795425 sec
16571 Time elapsed: 4743.345160007477 sec
16572 Time elapsed: 4759.332751274109 sec
16573 Time elapsed: 4776.240008115768 sec
16574 Time elapsed: 4788.055766820908 sec
16575 Time elapsed: 4800.993829250336 sec
16576 Time elapsed: 4808.604467153549 sec
16577 Time elapsed: 4825.284157752991 sec
16578 Time elapsed: 4851.894596576691 sec
16579 Time elapsed: 4869.57288646698 sec
16580 Time elapsed: 4886.273223638535 sec
16581 Time elapsed: 4901.65509390831 sec
16582 Time elapsed: 4917.837318897247 sec
16583 Time elapsed: 4931.684900045395 sec
16584 Time elapsed: 4947.028440475464 sec
16585 Time elapsed: 4958.501953840256 sec
16586 Time elapsed: 4965.900825738907 sec
16587 Time elapsed: 4971.962762594223 sec
16588 Time elapsed: 4980.182280063629 sec
16589 Time elapsed: 4997.570791721344 sec
16590 Time elapsed: 5011.996379137039 sec
16591 Time elapsed: 5014.296076536179 sec
16592 Time elapsed: 5020.146280765533

17155 Time elapsed: 6138.018326282501 sec
17156 Time elapsed: 6138.079335689545 sec
17157 Time elapsed: 6138.079335689545 sec
17158 Time elapsed: 6138.079335689545 sec
17159 Time elapsed: 6138.079335689545 sec
17160 Time elapsed: 6138.079335689545 sec
17161 Time elapsed: 6138.079335689545 sec
17162 Time elapsed: 6138.079335689545 sec
17163 Time elapsed: 6138.079335689545 sec
17164 Time elapsed: 6138.079335689545 sec
17165 Time elapsed: 6138.079335689545 sec
17166 Time elapsed: 6138.079335689545 sec
17167 Time elapsed: 6138.079335689545 sec
17168 Time elapsed: 6138.079335689545 sec
17169 Time elapsed: 6138.079335689545 sec
17170 Time elapsed: 6138.079335689545 sec
17171 Time elapsed: 6138.079335689545 sec
17172 Time elapsed: 6138.079335689545 sec
17173 Time elapsed: 6138.079335689545 sec
17174 Time elapsed: 6138.079335689545 sec
17175 Time elapsed: 6138.079335689545 sec
17176 Time elapsed: 6138.079335689545 sec
17177 Time elapsed: 6138.079335689545 sec
17178 Time elapsed: 6138.079335689

17530 Time elapsed: 6138.244267463684 sec
17531 Time elapsed: 6138.296442270279 sec
17532 Time elapsed: 6138.299461841583 sec
17533 Time elapsed: 6138.299461841583 sec
17534 Time elapsed: 6138.299461841583 sec
17535 Time elapsed: 6138.299461841583 sec
17536 Time elapsed: 6138.299461841583 sec
17537 Time elapsed: 6138.299461841583 sec
17538 Time elapsed: 6138.30028796196 sec
17539 Time elapsed: 6138.30028796196 sec
17540 Time elapsed: 6138.30028796196 sec
17541 Time elapsed: 6138.30028796196 sec
17542 Time elapsed: 6138.302286148071 sec
17543 Time elapsed: 6138.302286148071 sec
17544 Time elapsed: 6138.302286148071 sec
17545 Time elapsed: 6138.302286148071 sec
17546 Time elapsed: 6138.302286148071 sec
17547 Time elapsed: 6138.302286148071 sec
17548 Time elapsed: 6138.302286148071 sec
17549 Time elapsed: 6138.303283214569 sec
17550 Time elapsed: 6138.303283214569 sec
17551 Time elapsed: 6138.303283214569 sec
17552 Time elapsed: 6138.303283214569 sec
17553 Time elapsed: 6138.303283214569 

18030 Time elapsed: 6138.459853887558 sec
18031 Time elapsed: 6138.491275072098 sec
18032 Time elapsed: 6138.491275072098 sec
18033 Time elapsed: 6138.491275072098 sec
18034 Time elapsed: 6138.491275072098 sec
18035 Time elapsed: 6138.491275072098 sec
18036 Time elapsed: 6138.491275072098 sec
18037 Time elapsed: 6138.491275072098 sec
18038 Time elapsed: 6138.491275072098 sec
18039 Time elapsed: 6138.491275072098 sec
18040 Time elapsed: 6138.491275072098 sec
18041 Time elapsed: 6138.491275072098 sec
18042 Time elapsed: 6138.491275072098 sec
18043 Time elapsed: 6138.491275072098 sec
18044 Time elapsed: 6138.491275072098 sec
18045 Time elapsed: 6138.491275072098 sec
18046 Time elapsed: 6138.491275072098 sec
18047 Time elapsed: 6138.491275072098 sec
18048 Time elapsed: 6138.491275072098 sec
18049 Time elapsed: 6138.491275072098 sec
18050 Time elapsed: 6138.491275072098 sec
18051 Time elapsed: 6138.491275072098 sec
18052 Time elapsed: 6138.491275072098 sec
18053 Time elapsed: 6138.491275072

18426 Time elapsed: 6138.670750617981 sec
18427 Time elapsed: 6138.671751737595 sec
18428 Time elapsed: 6138.671751737595 sec
18429 Time elapsed: 6138.671751737595 sec
18430 Time elapsed: 6138.671751737595 sec
18431 Time elapsed: 6138.671751737595 sec
18432 Time elapsed: 6138.672748088837 sec
18433 Time elapsed: 6138.672748088837 sec
18434 Time elapsed: 6138.672748088837 sec
18435 Time elapsed: 6138.672748088837 sec
18436 Time elapsed: 6138.672748088837 sec
18437 Time elapsed: 6138.672748088837 sec
18438 Time elapsed: 6138.672748088837 sec
18439 Time elapsed: 6138.673744678497 sec
18440 Time elapsed: 6138.673744678497 sec
18441 Time elapsed: 6138.673744678497 sec
18442 Time elapsed: 6138.673996210098 sec
18443 Time elapsed: 6138.673996210098 sec
18444 Time elapsed: 6138.673996210098 sec
18445 Time elapsed: 6138.673996210098 sec
18446 Time elapsed: 6138.673996210098 sec
18447 Time elapsed: 6138.673996210098 sec
18448 Time elapsed: 6138.674997329712 sec
18449 Time elapsed: 6138.674997329

18780 Time elapsed: 6138.826659679413 sec
18781 Time elapsed: 6138.862352371216 sec
18782 Time elapsed: 6138.862352371216 sec
18783 Time elapsed: 6138.862738609314 sec
18784 Time elapsed: 6138.862738609314 sec
18785 Time elapsed: 6138.862738609314 sec
18786 Time elapsed: 6138.862738609314 sec
18787 Time elapsed: 6138.862738609314 sec
18788 Time elapsed: 6138.862738609314 sec
18789 Time elapsed: 6138.862738609314 sec
18790 Time elapsed: 6138.862738609314 sec
18791 Time elapsed: 6138.862738609314 sec
18792 Time elapsed: 6138.862738609314 sec
18793 Time elapsed: 6138.863734722137 sec
18794 Time elapsed: 6138.863734722137 sec
18795 Time elapsed: 6138.863734722137 sec
18796 Time elapsed: 6138.863734722137 sec
18797 Time elapsed: 6138.863734722137 sec
18798 Time elapsed: 6138.864731311798 sec
18799 Time elapsed: 6138.864731311798 sec
18800 Time elapsed: 6138.864731311798 sec
18801 Time elapsed: 6138.864731311798 sec
18802 Time elapsed: 6138.864731311798 sec
18803 Time elapsed: 6138.864731311

19155 Time elapsed: 6139.101371049881 sec
19156 Time elapsed: 6139.101371049881 sec
19157 Time elapsed: 6139.101371049881 sec
19158 Time elapsed: 6139.101371049881 sec
19159 Time elapsed: 6139.102368593216 sec
19160 Time elapsed: 6139.102368593216 sec
19161 Time elapsed: 6139.102368593216 sec
19162 Time elapsed: 6139.102368593216 sec
19163 Time elapsed: 6139.102368593216 sec
19164 Time elapsed: 6139.102368593216 sec
19165 Time elapsed: 6139.102368593216 sec
19166 Time elapsed: 6139.102368593216 sec
19167 Time elapsed: 6139.102368593216 sec
19168 Time elapsed: 6139.102368593216 sec
19169 Time elapsed: 6139.103365421295 sec
19170 Time elapsed: 6139.10436296463 sec
19171 Time elapsed: 6139.10436296463 sec
19172 Time elapsed: 6139.10436296463 sec
19173 Time elapsed: 6139.10436296463 sec
19174 Time elapsed: 6139.10436296463 sec
19175 Time elapsed: 6139.105634212494 sec
19176 Time elapsed: 6139.105634212494 sec
19177 Time elapsed: 6139.1066353321075 sec
19178 Time elapsed: 6139.1066353321075

19529 Time elapsed: 6139.260143518448 sec
19530 Time elapsed: 6139.302572488785 sec
19531 Time elapsed: 6139.302572488785 sec
19532 Time elapsed: 6139.302572488785 sec
19533 Time elapsed: 6139.302572488785 sec
19534 Time elapsed: 6139.302572488785 sec
19535 Time elapsed: 6139.302572488785 sec
19536 Time elapsed: 6139.302572488785 sec
19537 Time elapsed: 6139.3035707473755 sec
19538 Time elapsed: 6139.3035707473755 sec
19539 Time elapsed: 6139.3035707473755 sec
19540 Time elapsed: 6139.304567813873 sec
19541 Time elapsed: 6139.304567813873 sec
19542 Time elapsed: 6139.304567813873 sec
19543 Time elapsed: 6139.304567813873 sec
19544 Time elapsed: 6139.304567813873 sec
19545 Time elapsed: 6139.304567813873 sec
19546 Time elapsed: 6139.304567813873 sec
19547 Time elapsed: 6139.305565595627 sec
19548 Time elapsed: 6139.305565595627 sec
19549 Time elapsed: 6139.306563615799 sec
19550 Time elapsed: 6139.306563615799 sec
19551 Time elapsed: 6139.306563615799 sec
19552 Time elapsed: 6139.307560

19943 Time elapsed: 6139.482108831406 sec
19944 Time elapsed: 6139.482108831406 sec
19945 Time elapsed: 6139.482108831406 sec
19946 Time elapsed: 6139.482108831406 sec
19947 Time elapsed: 6139.482108831406 sec
19948 Time elapsed: 6139.482108831406 sec
19949 Time elapsed: 6139.483106851578 sec
19950 Time elapsed: 6139.483106851578 sec
19951 Time elapsed: 6139.483106851578 sec
19952 Time elapsed: 6139.483106851578 sec
19953 Time elapsed: 6139.483106851578 sec
19954 Time elapsed: 6139.483106851578 sec
19955 Time elapsed: 6139.483106851578 sec
19956 Time elapsed: 6139.484102487564 sec
19957 Time elapsed: 6139.484102487564 sec
19958 Time elapsed: 6139.484102487564 sec
19959 Time elapsed: 6139.484102487564 sec
19960 Time elapsed: 6139.485099315643 sec
19961 Time elapsed: 6139.485099315643 sec
19962 Time elapsed: 6139.485099315643 sec
19963 Time elapsed: 6139.485099315643 sec
19964 Time elapsed: 6139.486097812653 sec
19965 Time elapsed: 6139.486097812653 sec
19966 Time elapsed: 6139.486097812

In [19]:
# Split the per-image results into twelve parallel columns: for each of the
# four source slots, the page URL (field 0), the extracted text (field 1)
# and the list of image URLs (field 2).
source1, source2, source3, source4 = [
    [info[slot][0] for info in extracted_info] for slot in range(4)
]
text1, text2, text3, text4 = [
    [info[slot][1] for info in extracted_info] for slot in range(4)
]
image_url1, image_url2, image_url3, image_url4 = [
    [info[slot][2] for info in extracted_info] for slot in range(4)
]


In [20]:
# Column name -> column values for the output table, in output column order.
dictionary = {
    'numSources': numSources,
    'img_url': img_url,
    'source1': source1,
    'text1': text1,
    'image_url1': image_url1,
    'source2': source2,
    'text2': text2,
    'image_url2': image_url2,
    'source3': source3,
    'text3': text3,
    'image_url3': image_url3,
    'source4': source4,
    'text4': text4,
    'image_url4': image_url4,
}

In [21]:
# Build the output table, one row per processed image.
df = pd.DataFrame(dictionary)

# Convert every cell to text and drop any characters that cannot be encoded
# as UTF-8, so the CSV can always be written.
# `DataFrame.applymap` is deprecated in newer pandas releases in favour of
# `DataFrame.map`; use whichever one this pandas version provides.
elementwise = df.map if hasattr(df, 'map') else df.applymap
df = elementwise(lambda x: str(x).encode("utf-8", errors="ignore").decode("utf-8", errors="ignore"))

df.to_csv('Dataset/metadata/ExtractedData_TICNN_16000_20015.csv')