In [19]:
import preprocessing

In [1]:
import pandas as pd
import numpy as np
from newspaper import Article
from datetime import datetime
import pickle
from langdetect import detect


def process(df):  # input: pandas DataFrame with one row per news item
    """
    Build one ``News`` object per row of ``df``.

    For every row the columns SQLDATE, MonthYear, GoldsteinScale,
    NumMentions, NumSources, NumArticles, AvgTone and SOURCEURL are read
    and handed to the ``News`` constructor in that positional order.

    Prints how many items were created and returns them as a list.
    """
    # Column labels, listed in the positional order of News.__init__.
    field_order = ('SQLDATE', 'MonthYear', 'GoldsteinScale', 'NumMentions',
                   'NumSources', 'NumArticles', 'AvgTone', 'SOURCEURL')
    news_items = [
        News(*[record[column] for column in field_order])
        for _, record in df.iterrows()
    ]
    print('\nThere are %d items in News.'% len(news_items))
    return news_items


class News:
    """
    One event row (date, scores, counts, tone, source URL) together with the
    article content that ``clean_text`` fetches from the URL.

    ``text`` and ``publish_date`` stay None until ``clean_text`` succeeds.
    ``taxonomy`` and ``language`` are flat lists that the setters add to.
    """
    def __init__(self, sqldate, month_year, goldstein_scale, num_mentions, num_sources, num_articles, tone, url):
        self.sqldate = sqldate
        self.month_year = month_year
        self.goldstein_scale = goldstein_scale
        self.num_mentions = num_mentions
        self.num_sources = num_sources
        self.num_articles = num_articles
        self.tone = tone
        self.url = url
        self.text = None
        self.publish_date = None
        self.taxonomy = []
        self.language = []

    def get_sqldate(self):
        return self.sqldate
    def get_monthyear(self):
        return self.month_year
    def get_goldsteinscale(self):
        return self.goldstein_scale
    def get_nummentions(self):
        return self.num_mentions
    def get_numsources(self):
        return self.num_sources
    def get_numarticles(self):
        return self.num_articles
    def get_tone(self):
        return self.tone
    def get_url(self):
        return self.url
    def get_text(self):
        return self.text
    def get_publish_date(self):
        return self.publish_date
    def get_taxonomy(self):
        return self.taxonomy
    def set_taxonomy(self,taxonomy):
        """Add every element of the iterable ``taxonomy`` to the taxonomy list."""
        # extend() flattens its argument: a (key, phrase) tuple adds two entries.
        self.taxonomy.extend(taxonomy)
    def get_language(self):
        return self.language
    def set_language(self,language):
        """
        Record one or more language codes.

        ``language`` may be a single code (str) or an iterable of codes.
        Either way the codes are stored flat, so ``get_language()[0]`` is
        always a string such as 'en' and never a nested list.
        """
        if isinstance(language, str):
            self.language.append(language)
        else:
            self.language.extend(language)
    def clean_text(self):
        """
        Download and parse the article at ``self.url``.

        On success ``text`` and ``publish_date`` are filled in and
        'Success.' is printed; on any download/parse error both are reset
        to None and 'No text found.' is printed.
        """
        article = Article(self.url)
        try:
            article.download()
            article.parse()
            self.text,self.publish_date = article.text, article.publish_date
            print('Success.')
        except Exception:
            # Exception (not a bare except) so Ctrl-C / kernel interrupt
            # still stops a long download loop.
            self.text,self.publish_date = None, None
            print('No text found.')

            
# remove duplicate urls
def removeDuplicate (df):
    """
    Collapse rows that share the same SOURCEURL into a single row.

    For each URL the numeric columns (MonthYear, GoldsteinScale,
    NumMentions, NumSources, NumArticles, AvgTone) are averaged and SQLDATE
    is set to the smallest (earliest) value seen for that URL.

    Returns a new DataFrame with columns SOURCEURL, the six averaged
    columns, then SQLDATE, with one row per distinct URL.
    """
    mean_columns = ['MonthYear','GoldsteinScale','NumMentions','NumSources','NumArticles','AvgTone']
    grouped = df.groupby('SOURCEURL')
    # Select columns with a list: tuple selection on a GroupBy is rejected
    # by current pandas.
    by_data = grouped[mean_columns].mean()
    # Both results are indexed by SOURCEURL, so the assignment aligns on the
    # URL rather than on row position.
    by_data['SQLDATE'] = grouped['SQLDATE'].min()
    return by_data.reset_index()

def pickle_cleaning(csvfilename,picklefilename):
    """
    Read an event CSV, merge rows with duplicate URLs, convert the rows to
    News objects and pickle the resulting list.

    csvfilename: path of the CSV file to read.
    picklefilename: path of the pickle file to write.

    Prints the first 10 de-duplicated rows and the number of rows kept.
    """
    data = removeDuplicate(pd.read_csv(csvfilename)) #remove duplicate urls
    print(data.head(10))
    news_list = process(data) #change csv to class
    # 'with' guarantees the pickle is flushed and closed even if dump() fails.
    with open(picklefilename,'wb') as pickle_file:
        pickle.dump(news_list, pickle_file) #save as pickle
    print(len(data))
     
def text_downloading(picklefilename,rename):
    """
    Download the article text of every pickled story and detect its language.

    picklefilename: pickle file containing a list of story objects.
    rename: path of the pickle file the updated list is written to.

    For each story ``clean_text()`` is called; when text was obtained, its
    language is detected once, printed, stored on the story and appended
    to the returned list. Stories whose language cannot be detected print
    "error" and are left without a language.

    Returns the list of detected language codes, in story order.
    """
    language_list = []
    # NOTE: unpickling can execute arbitrary code; only load pickles
    # produced by this notebook.
    with open(picklefilename,'rb') as PickleFiles:
        news_list = pickle.load(PickleFiles)
    for story in news_list:
        story.clean_text()
        text = story.get_text()
        if text is None:
            continue
        try:
            # Detect once and reuse the result, so the value stored on the
            # story always matches the value reported and returned.
            language = detect(text)
        except Exception:
            print("error")
            continue
        language_list.append(language)
        # Pass the bare code (not a one-element list) so that
        # story.get_language()[0] is the code itself.
        story.set_language(language)
        print(language)
    with open(rename,'wb') as output_file:
        pickle.dump(news_list, output_file)
    return language_list




In [2]:
# Merge duplicate URLs in the 201505 CSV and pickle the resulting News list.
pickle_cleaning('201505.csv','arg201505.p')

                                           SOURCEURL  MonthYear  \
0  Agerpres news agency, Bucharest/BBC Monitoring...   201505.0   
1  MercoPress website, Montevideo/BBC Monitoring/...   201505.0   
2  Ministry of Foreign Affairs website, Moscow/BB...   201505.0   
3            Multiple Sources/BBC Monitoring/(c) BBC   201505.0   
4  Public Television of Armenia, Yerevan/BBC Moni...   201505.0   
5  http://13wham.com/template/inews_wire/wires.in...   201505.0   
6     http://1news.az/society/20150521105536064.html   201505.0   
7  http://24tv.ua/ru/sdelano_v_ukraine/ukrainskij...   201505.0   
8  http://33live.ru/novosti/21-05-2015-seks-v-3d-...   201505.0   
9  http://442.perfil.com/2015-05-26-363797-un-bar...   201505.0   

   GoldsteinScale  NumMentions  NumSources  NumArticles   AvgTone   SQLDATE  
0        3.500000     3.000000         1.0     3.000000  7.540984  20150504  
1        2.772727     3.000000         1.0     3.000000 -0.565600  20150504  
2        2.090909     3.0909

In [None]:
# Download every article, detect its language and keep the list of detected codes.
# NOTE(review): the variable name says 201508 but the files read and written
# are the 201505 ones — confirm which month is intended.
listarg201508=text_downloading('arg201505.p','arg201505eng.p')
listarg201508

No text found.
No text found.
No text found.
No text found.
No text found.
No text found.
Success.
error
Success.
error
Success.
ru
Success.
es
Success.
es
Success.
en
Success.
es
Success.
es
Success.
en
Success.
en
Success.
ar
Success.
ru
Success.
ru
Success.
ru
No text found.
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
No text found.
No text found.
No text found.
No text found.
No text found.
No text found.
No text found.
No text found.
Success.
en
No text found.
No text found.
No text found.
No text found.
No text found.
No text found.
Success.
en
Success.
es
Success.
es
Success.
es
Success.
error
No text found.
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
en
Success.
es
Success.
es
Success.
en
Success.
pt
Success.
en
Success.
en
Success.
pt
Success.
pt
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
Success.
es
S

Success.
error
Success.
error
Success.
es
Success.
es
Success.
error
Success.
error
Success.
error
Success.
es
Success.
es
Success.
error
Success.
error
Success.
es
Success.
error
Success.
error
Success.
es
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
es
Success.
error
Success.
error
Success.
error
Success.
es
Success.
error
Success.
error
Success.
error
Success.
error
Success.
es
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
es
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
error
Success.
en
Success.
en
No text found.
No text found.
No text found.
No text found.
Success.
en
Success.
en
Success.
en
Success.
en
Success.
en
Success.
hr
Success.
en
Success.
en
Success.
es
Success.
es
N

In [None]:
#check if we can get language from story
# NOTE: unpickling can execute arbitrary code; only load pickles produced
# by this notebook.
with open('arg201505eng.p','rb') as infile:
    # Despite its name, new_dict holds whatever iterable of stories was pickled.
    new_dict = pickle.load(infile)
for story in new_dict:
    print(story.get_language())

In [10]:
#count the number of English articles
# One-column frame of the detected language codes. The column label
# "colummn" (sic) is looked up again when the unique languages are computed,
# so it has to be changed in both places if it is ever corrected.
df1 = pd.DataFrame(listarg201508, columns=["colummn"])
# Not the last expression of the cell, so this bare name displays nothing.
df1


def count_en (language_list,language):
    """
    Print how many entries of ``language_list`` equal ``language`` and what
    fraction of the list that is.

    language_list: iterable-with-length of language codes (e.g. 'en', 'es').
    language: the language code to count; it is named in the printed text.

    Nothing is returned. For an empty list the counts are printed and the
    fraction is reported as undefined instead of dividing by zero.
    """
    total = len(language_list)
    count = sum(1 for code in language_list if code == language)
    print('There are {} articles in total, and {} of them are {!r} articles'.format(total, count, language))
    if total == 0:
        print('The fraction of {!r} articles is undefined for an empty list'.format(language))
        return
    a = count/total
    print('The fraction of {!r} articles is {}'.format(language, a))


# Report how many of the detected language codes are 'en'.
count_en(listarg201508,'en')

There are 53 of total article, and 9 of them are English article
The percentage of English is 0.16981132075471697 of them are English


In [15]:
#count the number of Spanish articles
# count_en counts whichever language code is passed as its second argument.
count_en(listarg201508,'es')

There are 53 of total article, and 36 of them are English article
The percentage of English is 0.6792452830188679 of them are English


In [14]:
# Distinct detected language codes; pd.unique keeps order of first appearance.
unique_language_list = pd.unique(df1['colummn'])
unique_language_list

array(['ru', 'es', 'en', 'ar', 'pt'], dtype=object)

In [38]:
#!/usr/bin/env python
# coding: utf-8

# In[16]:

import multiprocessing as mp
import os
import requests, zipfile, io
import re
import datetime
import string
import time
import glob
from glob import iglob
from datetime import datetime
import newspaper
from newspaper import Article
import pickle
import csv
from time import ctime
from langdetect import detect
import threading

 
class Trigger:
    """Abstract base for triggers; concrete subclasses override evaluate()."""
    def evaluate(self, story):
        """
        Decide whether an alert should be generated for the given news item.

        Subclasses return True or False. This base implementation has no
        rule of its own and always raises NotImplementedError.
        """
        # DO NOT CHANGE THIS!
        raise NotImplementedError()
        
class PhraseTrigger(Trigger):
    """
    Trigger that fires when a phrase occurs in a piece of text as a run of
    consecutive whole words, ignoring case and punctuation in the text.
    """
    # Maps every ASCII punctuation character to a space so that punctuation
    # separates words instead of sticking to them.
    _PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def __init__(self, phrase):
        self.phrase = phrase
    def get_phrase(self):
        return self.phrase
    def evaluate(self, story):
        # Here the argument is treated as the text itself.
        return self.is_phrase_in(story)

    def is_phrase_in(self, text):
        """
        Return True if the words of ``self.phrase`` appear consecutively,
        in order, anywhere in ``text``; False otherwise.

        The text is lower-cased and its punctuation replaced by spaces
        before splitting into words; the phrase is only lower-cased.
        An empty phrase never matches.
        """
        words = text.lower().translate(self._PUNCT_TO_SPACE).split()
        phrase_words = self.phrase.lower().split()
        if not phrase_words:
            return False
        span = len(phrase_words)
        # Try every start position, not just the first occurrence of the
        # phrase's first word, so "big dog" is found in
        # "the big cat and the big dog".
        return any(
            words[start:start + span] == phrase_words
            for start in range(len(words) - span + 1)
        )

class TextTrigger(PhraseTrigger):
    """Phrase trigger that is evaluated against a story's text (story.get_text())."""
    def __init__(self, phrase):
        PhraseTrigger.__init__(self, phrase)
    def get_phrase(self):
        return self.phrase
    def get_args(self):
        """
        Return the phrase, under the same method name AndTrigger and
        OrTrigger use, so code that labels triggers via get_args() works
        for single-phrase triggers as well.
        """
        return self.phrase
    def evaluate(self, story):
        """Return True if the phrase occurs in the story's text; False if it does not or the story has no text."""
        text = story.get_text()
        if text is None:
            # No text to match against: treat as "does not fire".
            return False
        return self.is_phrase_in(text)

class AndTrigger(Trigger):
    """Composite trigger that fires only when every wrapped trigger fires."""
    def __init__(self, *args):
        # args: the wrapped trigger objects, kept as a tuple.
        self.args = args

    def get_args(self):
        """Return the wrapped triggers' phrases joined by '+'; each wrapped trigger must provide get_phrase()."""
        return '+'.join(arg.get_phrase() for arg in self.args)
    def evaluate(self, story):
        """Return True if all wrapped triggers evaluate to True for ``story`` (True when there are none)."""
        # all() stops at the first trigger that does not fire.
        return all(T.evaluate(story) for T in self.args)

class OrTrigger(Trigger):
    """Composite trigger that fires when at least one wrapped trigger fires."""
    def __init__(self,*args):
        # args: the wrapped trigger objects, kept as a tuple.
        self.args = args
    def get_args(self):
        """Return the wrapped triggers' phrases joined by '+'; each wrapped trigger must provide get_phrase()."""
        return '+'.join(arg.get_phrase() for arg in self.args)
    def evaluate(self, story):
        """Return True if any wrapped trigger evaluates to True for ``story`` (False when there are none)."""
        # any() stops at the first trigger that fires.
        return any(T.evaluate(story) for T in self.args)
    
class NotTrigger(Trigger):
    """Trigger that inverts the result of the single trigger it wraps."""
    def __init__(self, T):
        # T: the wrapped trigger object.
        self.T = T

    def get_T(self):
        """Return the wrapped trigger."""
        return self.T

    def evaluate(self, story):
        """Return the logical negation of the wrapped trigger's result for ``story``."""
        inner_fired = self.T.evaluate(story)
        return not inner_fired

    
def read_trigger_config(filename):
    """
    filename: the name of a trigger configuration file (read as UTF-8)

    Blank lines and lines starting with '//' are ignored. Every other line
    is split on '+' into: trigger name, trigger type, phrase(s).
      * type TEXT -> TextTrigger built from the first phrase
      * type AND  -> AndTrigger over one TextTrigger per phrase
      * any other type is skipped

    Returns: a dict mapping each trigger name to its trigger object.

    Raises: ValueError if a line has no type field, or a TEXT/AND line has
        no phrase.
    """
    # 'with' closes the file even if reading fails.
    with open(filename, 'r', encoding='utf-8') as trigger_file:
        lines = []
        for line in trigger_file:
            line = line.rstrip()
            if line and not line.startswith('//'):
                lines.append(line)

    trigger_dict = {}

    for line in lines:
        l_item = line.split('+')
        if len(l_item) < 2:
            raise ValueError('Malformed trigger line (expected name+TYPE+phrase): %r' % line)
        name, kind = l_item[0], l_item[1]
        if kind not in ('TEXT', 'AND'):
            continue
        if len(l_item) < 3:
            # Without this check "name+AND" would build an AndTrigger with
            # no conditions, which fires for every story.
            raise ValueError('Trigger line has no phrase: %r' % line)
        if kind == 'TEXT':
            trigger_dict[name] = TextTrigger(l_item[2])
        else:
            arg_tuple = tuple(TextTrigger(item) for item in l_item[2:])
            trigger_dict[name] = AndTrigger(*arg_tuple)

    return trigger_dict
    

In [75]:
# English triggers: dict keyed by the trigger name given in the config file.
trigger_dict_english=read_trigger_config('triggerlist-english-all.txt')


{'m1': <__main__.AndTrigger at 0x1a2435bd68>,
 'm2': <__main__.AndTrigger at 0x1a2435b8d0>,
 'm3': <__main__.AndTrigger at 0x1a2435b7f0>,
 'm4': <__main__.TextTrigger at 0x1a242efcc0>,
 'm5': <__main__.TextTrigger at 0x1a242ef940>,
 'm6': <__main__.TextTrigger at 0x1a242efb00>,
 'm7': <__main__.AndTrigger at 0x1a242ef390>,
 'm8': <__main__.AndTrigger at 0x1a242ef470>,
 'm9': <__main__.AndTrigger at 0x1a242efa20>}

In [76]:
# Spanish triggers: dict keyed by the trigger name given in the config file.
trigger_dict_spanish=read_trigger_config('triggerlist-Spanish-all.txt')

In [77]:
def _trigger_label(trig):
    """Return a trigger's phrase description: get_args() if it has one, otherwise get_phrase()."""
    if hasattr(trig, 'get_args'):
        return trig.get_args()
    return trig.get_phrase()


def filter_stories_bypickle(fsplit,trigger_dict_eng,trigger_dict_span,newpickle):
    """
    Tag pickled stories with the triggers that fire for them and keep the
    tagged ones.

    fsplit: pickle file holding a list of story objects.
    trigger_dict_eng: {name: trigger} applied to stories whose first
        language is 'en'.
    trigger_dict_span: {name: trigger} applied to stories whose first
        language is 'es'.
    newpickle: path of the pickle file the kept stories are written to.

    For every trigger that fires, (name, phrase description) is added to
    the story's taxonomy via set_taxonomy(). Stories with a non-empty
    taxonomy are pickled to ``newpickle`` and returned as a list.
    """
    # NOTE: unpickling can execute arbitrary code; only load pickles
    # produced by this notebook.
    with open(fsplit,'rb') as PickleFiles:
        temp_stories = pickle.load(PickleFiles)
    triggers_by_language = {'en': trigger_dict_eng, 'es': trigger_dict_span}
    filter_taxonomy=[]
    for story in temp_stories:
        languages = story.get_language()
        code = languages[0] if languages else None
        # Tolerate a nested entry such as [['en']] by unwrapping one level.
        if isinstance(code, (list, tuple)) and code:
            code = code[0]
        triggers = triggers_by_language.get(code, {}) if isinstance(code, str) else {}
        for key,trig in triggers.items():
            try:
                if trig.evaluate(story):
                    # _trigger_label also works for triggers that only
                    # offer get_phrase(), so they are no longer dropped.
                    story.set_taxonomy((key,_trigger_label(trig)))
            except AttributeError:
                # Best effort: e.g. evaluating a story that has no text.
                pass
        if len(story.get_taxonomy())>0:
            filter_taxonomy.append(story)
    with open(newpickle,'wb') as outfile:
        pickle.dump(filter_taxonomy,outfile)
    return filter_taxonomy
        

In [80]:
# Time one full filtering pass over the language-tagged pickle.
import time
start = time.time()
# x receives the list of stories for which at least one trigger fired;
# the same list is also written to 'newpickle.p'.
x=filter_stories_bypickle('arg201505eng.p',trigger_dict_english,trigger_dict_spanish,'newpickle.p')
elapsed = (time.time() - start)  # wall-clock seconds
print("Time used:",elapsed)

Time used: 19.435309886932373


In [72]:
# Show the stories returned by filter_stories_bypickle.
x

[<__main__.News at 0x1a27eb3eb8>,
 <__main__.News at 0x1a27eb3f28>,
 <__main__.News at 0x1a2435b710>,
 <__main__.News at 0x1a24369c18>]