In [1]:
import numpy as np
import pandas as pd
from newspaper import Article
import pickle
from langdetect import detect
import joblib
import tqdm
from datetime import datetime
import string

In [2]:
class News:
    def __init__(self, sqldate, month_year, goldstein_scale, num_mentions, num_sources, num_articles, tone, url):
        self.sqldate = sqldate
        self.month_year = month_year
        self.goldstein_scale = goldstein_scale
        self.num_mentions = num_mentions
        self.num_sources = num_sources
        self.num_articles = num_articles
        self.tone = tone
        self.url = url
        self.text = None
        self.publish_date = None
        self.taxonomy = []
        self.language = []

    def get_sqldate(self):
        return self.sqldate
    
    def get_monthyear(self):
        return self.month_year
    
    def get_goldsteinscale(self):
        return self.goldstein_scale
    
    def get_nummentions(self):
        return self.num_mentions
    
    def get_numsources(self):
        return self.num_sources
    
    def get_numarticles(self):
        return self.num_articles
    
    def get_tone(self):
        return self.tone
    
    def get_url(self):
        return self.url
    
    def get_text(self):
        return self.text
    
    def get_publish_date(self):
        return self.publish_date
    
    def get_taxonomy(self):
        return self.taxonomy
    
    def set_taxonomy(self,taxonomy):
        self.taxonomy.extend(taxonomy)
        
    def get_language(self):
        return self.language
    
    def set_language(self,language):
        self.language.extend(language)
        
    def clean_text(self):
        try:
            article = Article(self.url)
            article.download()
            article.parse()
            self.text,self.publish_date = article.text, article.publish_date
            #print('Success.')
        except:
            self.text,self.publish_date = None, None
            #print('No text found.')

In [3]:
#cleaning
def process():
    df2 = pd.read_csv('D:\\GLOBAL_AI\\nlp political\\new country\\china\\data\\china'+date+'.csv')
    df1 = df2.sort_values(by='SQLDATE', ascending=True).groupby('SOURCEURL').first()
    date_df = df1[["SQLDATE","MonthYear"]].reset_index().drop(['SOURCEURL'],axis=1)
    df = df2.groupby('SOURCEURL')['GoldsteinScale','NumMentions','NumSources','NumArticles','AvgTone'].mean().reset_index()
    df[["SQLDATE","MonthYear"]]=date_df
#Fetches News objects, returns a list of News.    
    ret = []
    
    for index,row in df.iterrows():
        sqldate = row['SQLDATE']
        month_year = row['MonthYear']
        goldstein_scale = row['GoldsteinScale']
        num_mentions = row['NumMentions']
        num_sources = row['NumSources']
        num_articles = row['NumArticles']
        tone = row['AvgTone']
        url = row['SOURCEURL']
        news = News(sqldate, month_year, goldstein_scale, num_mentions, num_sources, num_articles, tone, url)
        ret.append(news)
        
    print('There are {} items in News.'.format(len(ret)))
    return ret

In [4]:
date_list = ['200801','200802','200803','200804','200805','200806','200807','200808']

In [5]:
for date in date_list:
    news_list = process()
    pickle.dump(news_list, open('D:\\GLOBAL_AI\\nlp political\\new country\\china\\data\\china'+date+'.pickle','wb'))#save as pickle
    print('Successfully save pickle--china{}'.format(date))

There are 42 items in News.
Successfully save pickle--china200801
There are 32 items in News.
Successfully save pickle--china200802
There are 63 items in News.
Successfully save pickle--china200803
There are 43 items in News.
Successfully save pickle--china200804
There are 51 items in News.
Successfully save pickle--china200805
There are 40 items in News.
Successfully save pickle--china200806
There are 55 items in News.
Successfully save pickle--china200807
There are 26 items in News.
Successfully save pickle--china200808


In [6]:
def text_downloading(date):
    language_list = []
    file = open('D:\\GLOBAL_AI\\nlp political\\new country\\china\\data\\china'+date+'.pickle','rb')
    news_list = pickle.loads(file.read())
    
    for news in news_list:
        news.clean_text()
        
        if news.get_text() == None:
            pass
        
        else:        
            try:
                language = detect(news.get_text())
                language_list.append(language)
                news.set_language([language])
            except:
                pass
                
    outfile = open('D:\\GLOBAL_AI\\nlp political\\new country\\china\\data\\china'+date+'.pickle','wb')            
    pickle.dump(news_list, outfile)
    outfile.close()
    return language_list

In [7]:
if __name__ == '__main__':
    language_lists = joblib.Parallel(n_jobs=4, prefer="threads")(joblib.delayed(text_downloading)(date) for date in tqdm.tqdm_notebook(date_list))
    print('Successfully created language_lists')

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))






Successfully created language_lists


In [8]:
language_list = [item for sublist in language_lists for item in sublist]

In [9]:
language_df = pd.DataFrame(language_list,columns=['language'])

In [10]:
def count_en (language):
    count=0
    
    for i in language_list:
        if i==language:         
            count=count+1
    print('There are {} of total article, and {} of them are English article'.format(len(language_list),count))
    
    a=count/len(language_list)
    print('The percentage of English is {} of them are English'.format(a))

In [11]:
count_en('en')

There are 316 of total article, and 314 of them are English article
The percentage of English is 0.9936708860759493 of them are English


In [12]:
unique_language_list = pd.unique(language_df['language'])
unique_language_list

array(['en', 'hr'], dtype=object)

In [13]:
len(unique_language_list)

2

In [14]:
def count(x):   
    return len(language_df.loc[language_df['language']==x])/len(language_df)

In [15]:
for i in unique_language_list:
    percentage = count(i)
    print('Percentage of language {}:{}'.format(i,percentage))

Percentage of language en:0.9936708860759493
Percentage of language hr:0.006329113924050633


In [16]:
class Trigger:
    def evaluate(self, news):
        """
        Returns True if an alert should be generated
        for the given news item, or False otherwise.
        """
        # DO NOT CHANGE THIS!
        raise NotImplementedError
        
class PhraseTrigger(Trigger):
    def __init__(self, phrase):
        self.phrase = phrase
        
    def get_phrase(self):
        return self.phrase
    
    def evaluate(self, news):
        return self.is_phrase_in(news)
    
    def is_phrase_in(self, text):
        raw_text = text.lower()
        raw_phrase = self.phrase.lower()
        for char in raw_text:
            if char in string.punctuation:
                raw_text = raw_text.replace(char,' ')
        raw_list = raw_text.split()
        phrase_list = raw_phrase.split()
        
        if phrase_list[0] not in raw_list:
            return False
        else:
            temp_index = raw_list.index(phrase_list[0])
            return phrase_list == raw_list[temp_index:temp_index + len(phrase_list)]

class TextTrigger(PhraseTrigger):
    def __init__(self, phrase):
        PhraseTrigger.__init__(self, phrase)
        
    def get_phrase(self):
        return self.phrase

    def get_args(self):
        return self.phrase
        
    def evaluate(self, news):
        return self.is_phrase_in(news.get_text())

class AndTrigger(Trigger):
    def __init__(self, *args):
        self.args = args
        
    def get_args(self):
        phrase_list = [arg.get_phrase() for arg in self.args]
        return '+'.join(phrase_list)
    
    def evaluate(self, news):
        true_list = [T.evaluate(news) for T in self.args]  
        result = [True, False][False in true_list]
        return result

class OrTrigger(Trigger):
    def __init__(self,*args):
        self.args = args
        
    def get_args(self):
        phrase_list = [arg.get_phrase() for arg in self.args]
        return '+'.join(phrase_list)
    
    def evaluate(self, news):
        true_list = [T.evaluate(news) for T in self.args]
        result = [False,True][True in true_list]
        return result
    
class NotTrigger(Trigger):
    def __init__(self, T):
        self.T = T
        
    def get_T(self):
        return self.T
    
    def evaluate(self, news):
        return not self.T.evaluate(news)

In [17]:
def read_trigger_config(filename):
    """
    filename: the name of a trigger configuration file

    Returns: a list of trigger objects specified by the trigger configuration
        file.
    """
    #print('start reading trig')
    trigger_file = open(filename, 'r')
    lines = []
    
    for line in trigger_file:
        line = line.strip()
        if not (len(line) == 0 or line.startswith('//')):
            lines.append(line)

    trigger_dict = {}
    
    for line in lines:
        l_item = line.split('+')
        
        if l_item[1] == 'TEXT':
            trigger_dict[l_item[0]] = TextTrigger(l_item[2])
 
        elif l_item[1] == 'AND':
            arg_tuple = tuple(TextTrigger(item) for item in l_item[2:])
            trigger_dict[l_item[0]] = AndTrigger(*arg_tuple)
     
    #print(lines) # for now, print it so you see what it contains!
    return trigger_dict   

In [18]:
trigger_dict_english=read_trigger_config('D:\\GLOBAL_AI\\nlp political\\triggerlist-English.txt')

In [19]:
def filter_news_bypickle(date):
    """
    Takes in a list of News instances which placing in pickle.

    Returns: a list of only the news for which a trigger in triggerlist fires.
    """
    
    file = open('D:\\GLOBAL_AI\\nlp political\\new country\\china\\data\\china'+date+'.pickle','rb')
    news_list = pickle.loads(file.read())
    
    for index, news in enumerate(news_list):    
        if news.get_language() != []:
            if news.get_language()[0] == 'en':
                for key,trig in trigger_dict_english.items():
                    
                    try:
                        news.set_taxonomy((key,trig.get_args())) if trig.evaluate(news)                     else datetime.now()
                    except AttributeError:
                        pass
                    
            '''if news.get_language()[0] == 'es':
                for key,trig in trigger_dict_spanish.items():
                    
                    try:              
                        news.set_taxonomy((key,trig.get_args())) if trig.evaluate(news)                     else datetime.now()
                    except AttributeError:
                        pass'''
            
    outfile = open('D:\\GLOBAL_AI\\nlp political\\new country\\china\\data\\china'+date+'_en.pickle','wb')
    pickle.dump(news_list,outfile)
    outfile.close()    
        

In [20]:
if __name__ == '__main__':
    joblib.Parallel(n_jobs=4, prefer="threads")(joblib.delayed(filter_news_bypickle)(date) for date in tqdm.tqdm_notebook(date_list))
    print('Successfully completed news classification')


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))


Successfully completed news classification


In [116]:
url,sqldate,goldstein_scale,num_mentions,num_articles,tone,taxonomy,language,text = [],[],[],[],[],[],[],[],[]

In [117]:
for date in date_list:
    file = open('D:\\GLOBAL_AI\\nlp political\\new country\\china\\data\\china'+date+'_en.pickle','rb')
    news_list = pickle.loads(file.read())
    
    for news in news_list:
        url.append(news.url)
        sqldate.append(news.sqldate)
        goldstein_scale.append(news.goldstein_scale)
        num_mentions.append(news.num_mentions)
        num_articles.append(news.num_articles)
        tone.append(news.tone)
        taxonomy.append(news.taxonomy)
        language.append(news.language)
        text.append(news.get_text())


In [118]:
original = pd.DataFrame({"URL":url, 
                        "date":sqldate, 
                        "goldstein_scale":goldstein_scale, 
                        "num_mentions":num_mentions, 
                        "num_articles":num_articles, 
                         "tone":tone, 
                         "taxonomy":taxonomy, 
                         "language":language,
                         "text":text})   


In [119]:
original.head()

Unnamed: 0,URL,date,goldstein_scale,num_mentions,num_articles,tone,taxonomy,language,text
0,http://colombogazette.com/2018/01/05/sri-lanka...,20080108,-10.0,5.0,5.0,-2.888889,[],[en],Sri Lanka’s leading wildlife experts and envir...
1,http://en.people.cn/n3/2018/0112/c90000-931446...,20080115,6.4,5.0,5.0,1.267606,"[m2-9, foreign+attack]",[en],Chinese Premier Li Keqiang(3rd L) attends the ...
2,http://kabar.kg/eng/news/chinese-scientists-cl...,20080129,3.4,8.0,8.0,-0.14245,[],[en],China on Thursday announced it successfully cl...
3,http://revolutionradio.org/2018/01/16/nato-all...,20080120,0.0,10.0,10.0,-4.973822,[],[en],NATO is not ready to defend against a major cy...
4,http://usa.chinadaily.com.cn/a/201801/11/WS5a5...,20080114,6.4,20.0,20.0,1.437128,[],[en],PHNOM PENH -- Chinese Premier Li Keqiang has a...


In [120]:
final = pd.DataFrame()

In [121]:
def judge_language(x):
    if x != []:
        return x[0]
    else:
        return np.nan

In [122]:

original['language'] = original['language'].map(judge_language)

final['NumUniqueUrl'] = original.groupby('date')['URL'].count()
final['NumUniqueUrl'] .fillna(0)
final['NumUniqueUrlwithArticle'] = original.groupby('date')['text'].apply(lambda x: x.notnull().count())
final['NumUniqueUrlwithArticle'].fillna(0)

final['MeanToneAllAll'] = original.groupby('date')['tone'].mean()
final['StdToneAllAll'] = original.groupby('date')['tone'].std()

final['NumToneAllEng'] = original[(original['language']=='en')].groupby('date')['tone'].count()
final['NumToneAllEng']=final['NumToneAllEng'].fillna(0)
final['MeanToneAllEng'] = original[(original['language']=='en')].groupby('date')['tone'].mean()
final['StdToneAllEng'] = original[(original['language']=='en')].groupby('date')['tone'].std()

final['NumToneFullEng'] = original[(original['language']=='en')&(original['taxonomy'].map(lambda d: len(d)) > 0)].groupby('date')['tone'].count()
final['NumToneFullEng']=final['NumToneFullEng'].fillna(0)
final['MeanToneFullEng'] = original[(original['language']=='en')&(original['taxonomy'].map(lambda d: len(d)) > 0)].groupby('date')['tone'].mean()
final['StdToneFullEng'] = original[(original['language']=='en')&(original['taxonomy'].map(lambda d: len(d)) > 0)].groupby('date')['tone'].std()

'''
final['NumToneFullSpan'] = original[(original['language']=='es')&(original['taxonomy'].map(lambda d: len(d)) > 0)].groupby('date')['tone'].count()
final['NumToneFullSpan']=final['NumToneFullSpan'].fillna(0)
final['MeanToneFullSpan'] =original[(original['language']=='es')&(original['taxonomy'].map(lambda d: len(d)) > 0)].groupby('date')['tone'].mean()
final['MeanToneFullSpan']=final['MeanToneFullSpan'].fillna(0)
final['StdToneFullSpan'] = original[(original['language']=='es')&(original['taxonomy'].map(lambda d: len(d)) > 0)].groupby('date')['tone'].std()
'''


"\nfinal['NumToneFullSpan'] = original[(original['language']=='es')&(original['taxonomy'].map(lambda d: len(d)) > 0)].groupby('date')['tone'].count()\nfinal['NumToneFullSpan']=final['NumToneFullSpan'].fillna(0)\nfinal['MeanToneFullSpan'] =original[(original['language']=='es')&(original['taxonomy'].map(lambda d: len(d)) > 0)].groupby('date')['tone'].mean()\nfinal['MeanToneFullSpan']=final['MeanToneFullSpan'].fillna(0)\nfinal['StdToneFullSpan'] = original[(original['language']=='es')&(original['taxonomy'].map(lambda d: len(d)) > 0)].groupby('date')['tone'].std()\n"

In [123]:
CmpList = ['m1','m2','m3']    
CtyList = ['m4','m5','m6']
GovList = ['m7','m8','m9']

In [124]:
#Cmp

'''
final['NumToneCmpLan'] = original[(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].count()
final['NumToneCmpLan']=final['NumToneCmpLan'].fillna(0)
final['MeanToneCmpLan'] = original[(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].mean()
final['StdToneCmpLan'] = original[(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].std()
'''

final['NumToneCmpEng'] = original[(original['language']=='en')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].count()
final['NumToneCmpEng']=final['NumToneCmpEng'].fillna(0)
final['MeanToneCmpEng'] = original[(original['language']=='en')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].mean()
final['StdToneCmpEng'] = original[(original['language']=='en')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].std()

'''
final['NumToneCmpSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].count()
final['NumToneCmpSpan']=final['NumToneCmpSpan'].fillna(0)
final['MeanToneCmpSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].mean()
final['StdToneCmpSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].std()
'''


"\nfinal['NumToneCmpSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].count()\nfinal['NumToneCmpSpan']=final['NumToneCmpSpan'].fillna(0)\nfinal['MeanToneCmpSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].mean()\nfinal['StdToneCmpSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CmpList for i in x]) > 0))].groupby('date')['tone'].std()\n"

In [125]:
#Cty

'''
final['NumToneCtyLan'] = original[(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].count()
final['NumToneCtyLan']=final['NumToneCtyLan'].fillna(0)
final['MeanToneCtyLan'] = original[(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].mean()
final['StdToneCtyLan'] = original[(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].std()
'''

final['NumToneCtyEng'] = original[(original['language']=='en')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].count()
final['NumToneCtyEng']=final['NumToneCtyEng'].fillna(0)
final['MeanToneCtyEng'] = original[(original['language']=='en')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].mean()
final['StdToneCtyEng'] = original[(original['language']=='en')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].std()

'''
final['NumToneCtySpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].count()
final['NumToneCtySpan']=final['NumToneCtySpan'].fillna(0)
final['MeanToneCtySpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].mean()
final['StdToneCtySpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].std()
'''


"\nfinal['NumToneCtySpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].count()\nfinal['NumToneCtySpan']=final['NumToneCtySpan'].fillna(0)\nfinal['MeanToneCtySpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].mean()\nfinal['StdToneCtySpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in CtyList for i in x]) > 0))].groupby('date')['tone'].std()\n"

In [126]:
#Gov

'''
final['NumToneGovLan'] = original[(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].count()
final['NumToneGovLan']=final['NumToneGovLan'].fillna(0)
final['MeanToneGovLan'] = original[(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].mean()
final['StdToneGovLan'] = original[(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].std()
'''

final['NumToneGovEng'] = original[(original['language']=='en')&(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].count()
final['NumToneGovEng']=final['NumToneGovEng'].fillna(0)
final['MeanToneGovEng'] = original[(original['language']=='en')&(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].mean()
final['StdToneGovEng'] = original[(original['language']=='en')&(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].std()

'''
final['NumToneGovSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].count()
final['NumToneGovSpan']=final['NumToneGovSpan'].fillna(0)
final['MeanToneGovSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].mean()
final['StdToneGovSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].std()
'''


"\nfinal['NumToneGovSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].count()\nfinal['NumToneGovSpan']=final['NumToneGovSpan'].fillna(0)\nfinal['MeanToneGovSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].mean()\nfinal['StdToneGovSpan'] = original[(original['language']=='es')&(original['taxonomy'].apply(lambda x: sum([i[:2] in GovList for i in x]) > 0))].groupby('date')['tone'].std()\n"

In [127]:
final.to_csv('D:\\GLOBAL_AI\\nlp political\\new country\\china\\final.csv')

In [128]:
final = pd.read_csv('D:\\GLOBAL_AI\\nlp political\\new country\\china\\final.csv')

In [129]:
final

Unnamed: 0,date,NumUniqueUrl,NumUniqueUrlwithArticle,MeanToneAllAll,StdToneAllAll,NumToneAllEng,MeanToneAllEng,StdToneAllEng,NumToneFullEng,MeanToneFullEng,StdToneFullEng,NumToneCmpEng,MeanToneCmpEng,StdToneCmpEng,NumToneCtyEng,MeanToneCtyEng,StdToneCtyEng,NumToneGovEng,MeanToneGovEng,StdToneGovEng
0,20080105,2,2,-0.305343,2.444936,1.0,-2.034174,,0.0,,,0.0,,,0.0,,,0.0,,
1,20080106,4,4,-0.635965,2.541005,4.0,-0.635965,2.541005,0.0,,,0.0,,,0.0,,,0.0,,
2,20080107,1,1,-3.424658,,1.0,-3.424658,,0.0,,,0.0,,,0.0,,,0.0,,
3,20080108,1,1,-2.888889,,1.0,-2.888889,,0.0,,,0.0,,,0.0,,,0.0,,
4,20080109,1,1,-3.938356,,1.0,-3.938356,,0.0,,,0.0,,,0.0,,,0.0,,
5,20080112,1,1,-2.590090,,1.0,-2.590090,,0.0,,,0.0,,,0.0,,,0.0,,
6,20080113,2,2,-1.367300,4.902423,2.0,-1.367300,4.902423,1.0,2.099237,,1.0,2.099237,,0.0,,,0.0,,
7,20080114,5,5,0.811015,0.926333,5.0,0.811015,0.926333,2.0,0.508911,1.483116,2.0,0.508911,1.483116,0.0,,,0.0,,
8,20080115,5,5,0.799613,0.853658,4.0,0.687016,0.941872,2.0,0.434335,1.178423,2.0,0.434335,1.178423,0.0,,,0.0,,
9,20080116,1,1,1.373626,,1.0,1.373626,,0.0,,,0.0,,,0.0,,,0.0,,


In [130]:
cols = list(final.iloc[:,1:].columns)
d7ma = pd.DataFrame()
d7ma['date'] = final['date']
for i in cols:
    d7ma[i]=final[i].rolling(7).mean()
    

In [131]:
d7ma.head()

Unnamed: 0,date,NumUniqueUrl,NumUniqueUrlwithArticle,MeanToneAllAll,StdToneAllAll,NumToneAllEng,MeanToneAllEng,StdToneAllEng,NumToneFullEng,MeanToneFullEng,StdToneFullEng,NumToneCmpEng,MeanToneCmpEng,StdToneCmpEng,NumToneCtyEng,MeanToneCtyEng,StdToneCtyEng,NumToneGovEng,MeanToneGovEng,StdToneGovEng
0,20080105,,,,,,,,,,,,,,,,,,,
1,20080106,,,,,,,,,,,,,,,,,,,
2,20080107,,,,,,,,,,,,,,,,,,,
3,20080108,,,,,,,,,,,,,,,,,,,
4,20080109,,,,,,,,,,,,,,,,,,,


In [132]:
d7ma.to_csv('D:\\GLOBAL_AI\\nlp political\\new country\\china\\d7ma.csv')