In [None]:
### Loading libraries
from urllib2 import urlopen
from bs4 import BeautifulSoup as bs
import pandas as pd, numpy as np
import datetime, time, os, io, codecs, json

# Accessing the NYT site and querying its API

In [None]:
### Register for an API key on the NYT developer website.
### Prefer exporting it as the NYT_API_KEY environment variable so the secret is never saved
### inside the notebook; the placeholder below is only used when that variable is not set.
apiKey = os.environ.get("NYT_API_KEY", "<your api key>")

In [None]:
### Helper: recursively turn the unicode strings of a parsed JSON document into UTF-8 byte strings
def convert(input):
    """Return ``input`` with every unicode string UTF-8 encoded.

    Dicts (keys and values) and lists are rebuilt recursively; any other
    kind of object is returned unchanged.
    """
    if isinstance(input, dict):
        return dict((convert(name), convert(item)) for name, item in input.iteritems())
    if isinstance(input, list):
        return [convert(item) for item in input]
    if isinstance(input, unicode):
        return input.encode('utf-8')
    return input

### Build the name of the JSON file that stores one month of results
def getJsonFileName(YR, MO, json_file_path):
    """Return ``json_file_path`` immediately followed by ``<YR>-<MO>.json``.

    YR, MO         : year and month as strings.
    json_file_path : prefix used verbatim (no path separator is inserted).
    """
    month_stamp = YR + '-' + MO
    return json_file_path + month_stamp + '.json'

In [None]:
### Function to grab and save data
def _grab_nyt_by_month_(YR, MO, apiKey):
    """Download one month from the NYT archive endpoint and save the raw response.

    YR, MO : year and month as strings; used in the request URL and in the file name.
    apiKey : key appended to the request as the ``api-key`` query parameter.

    The raw response body is written to the NYTbyMonth folder (file name built by
    getJsonFileName) only when the parsed response lists at least one document.
    Returns None in every case.
    """
    ### Create a request string with the month and apiKey
    request_string = "https://api.nytimes.com/svc/archive/v1/"+YR+"/"+MO+".json?api-key="+apiKey

    ### Read the NYT site with the request string; always release the connection
    response = urlopen(request_string)
    try:
        content = response.read()
    finally:
        response.close()

    ### Convert scraping results into a json format
    articles = convert(json.loads(content))

    ### Save results to a json file; if there are no articles, nothing is written
    if len(articles["response"]["docs"]) >= 1:
        # Raw string: the backslashes of the Windows path must not be read as escape sequences
        json_file_name = getJsonFileName(YR, MO, r'D:\Data\NYTbyMonth\NYT-')
        # `with` closes the file even when the write fails
        with open(json_file_name, 'w') as json_file:
            json_file.write(content)

In [None]:
### Function to read saved json files and convert them to dataframe
def readJsonFileToDF(YR, MO):
    """Turn the saved JSON file of one month into a '|'-separated text file.

    YR, MO : year and month as strings; they select the input file (name built by
             getJsonFileName) and are part of the output file name.

    The documents under ``response -> docs`` become the rows of a dataframe that is
    written to the NYTbyMonthDF folder. Returns None.
    """
    ### Read saved json files and convert them into json format
    # Raw string: the backslashes of the Windows path must not be read as escape sequences
    file_name = getJsonFileName(YR, MO, r'D:\Data\NYTbyMonth\NYT-')
    # `with` closes the file even when reading or parsing fails
    with open(file_name, 'r') as in_file:
        articles = convert(json.loads(in_file.read()))
    ### Create a dataframe from json files
    artlist = articles["response"]["docs"]
    df = pd.DataFrame(artlist)
    df.to_csv('D:/Data/NYTbyMonthDF/NYT_art_list_'+YR+'-'+MO+'.txt',sep='|',encoding='utf-8')

## Read article list and get URLs

In [None]:
# Load the article list, keep headline / publication date / URL, and order the rows by publication date
df = pd.read_csv('D:/Data/NYTimesArticleList.txt',encoding='utf-8',sep='|',index_col=0)
dfNew = (df[['headline','pub_date','web_url']]
         .copy()
         .sort_values(by=['pub_date'])
         .reset_index(drop=True))

In [None]:
### Convert publication date column to 'YYYY-MM-DD' strings.
### The whole column is converted and assigned back in one step: writing to the `row`
### objects yielded by iterrows() is not guaranteed to change dfNew itself.
dfNew['pub_date'] = pd.to_datetime(dfNew['pub_date']).dt.strftime('%Y-%m-%d')

### Function that grabs full articles

In [None]:
def _grab_full_nyt_(url):
    
    try:
        sc = urlopen(url).read()
        sc = bs(sc,'lxml')
    except:
        return url
    
    try:
        titleline = sc.find_all('h1')
        title = titleline[0].text.strip()
    except:
        title = 'N/A'
    
    try:
        authorline = sc.find_all('p',{'class':'byline-dateline'})
        if len(authorline)>0:
            result = []
            for a in authorline:
                result.extend(a.find_all('span',{'class':'byline'}))
            author = [a.text.strip() for a in result]
            author = u' '.join(author)
        else:
            authorline = sc.find_all('p',{'class':'css-1cbhw1y e1x1pwtg1'})
            author = authorline[0].text.strip()
    except:
        author = 'N/A'
    
    try:
        bodyline = sc.find_all('p',{'class':'story-body-text'})
        if len(bodyline)==0:
            bodyline = sc.find_all('p',{'class':'css-1tyen8a e2kc3sl0'})
        content = [a.text.strip() for a in bodyline]
        content = u' '.join(content)
    except:
        content = 'N/A'
    
    return [title,author,content,sc]

In [None]:
# Days to scrape, as sorted and de-duplicated 'YYYY-MM-DD' strings
dates = sorted(set(day.strftime('%Y-%m-%d')
                   for day in pd.date_range('2009-01-01','2009-01-03',freq='D')))
dates[-3:]

In [None]:
# Pair every date with the list of article URLs whose pub_date equals that date
inputs = []
for dt in dates:
    inputs.append((dt, list(dfNew.loc[dfNew.pub_date == dt,'web_url'])))

In [None]:
# Split the (date, urls) pairs into two parallel tuples: the dates and the URL lists.
# NOTE(review): the unpacking fails when `inputs` is empty.
datelist, urllist = zip(*inputs)     # Unzip packet to dates and urls

In [None]:
# Scrape the articles day by day and write one '|'-separated file per day
secs = 2                # pause (seconds) after every request
urlerrors = list()      # [url, date] pairs of the pages that could not be downloaded
for i,d in enumerate(datelist):
    urls = urllist[i]
    if len(urls)==0:
        continue
    data = list()
    for url in urls:
        stuff = _grab_full_nyt_(url)
        # _grab_full_nyt_ hands back the url itself (not a list) when the download failed
        if isinstance(stuff, list):
            data.append(stuff+[url])
        else:
            urlerrors.append([stuff,d])
        time.sleep(secs)
    # Column names are given up front so that a day where every download failed
    # produces an empty frame instead of a column-length error
    df = pd.DataFrame(data, columns=['title','author','content','sourcecode','url'])
    df['date'] = d
    df.to_csv('D:/Data/NYTFullwithSourceCode/nyt_full_'+d+'.txt',sep='|',encoding='utf-8')

In [None]:
# Read every per-day file, stack them, order the rows by date and renumber them from 0
fnames = os.listdir('D:/Data/NYTFullwithSourceCode/')
df = pd.concat([pd.read_csv('D:/Data/NYTFullwithSourceCode/'+f,encoding='utf-8',sep='|',index_col=0)
                for f in fnames])
df1 = df.sort_values(by=['date'])
df1.index = range(len(df1))

## Parallel code for grabbing full articles on starcluster

In [None]:
### This part runs on Amazon Web Services (AWS) by a starcluster
import os, pandas as pd , time, datetime
import ipyparallel as ipp
from ipyparallel import Client
# client = Client()     # alternative: run on a local ipcluster
client = Client('Your security group', sshkey='Your SSH key')
lbview = client.load_balanced_view()
pnodes = len(client.ids)     # Number of nodes in the starcluster
print(pnodes)

In [None]:
### Function that writes a file to the nodes of the starcluster
@lbview.parallel(block=True)
def write_to_starcluster(s,filepath='/home/sgeadmin/temp.py'):
    """Make sure ``filepath`` on the executing node contains exactly the text ``s``.

    Returns 'already on server' when the file already holds ``s``, otherwise writes
    ``s`` and returns 'written to server'. If reading or the first write attempt
    fails (for instance because the file does not exist yet), the write is attempted
    once more and its error, if any, propagates.
    """
    import time
    time.sleep(3)
    try:
        with open(filepath,'r') as f:         # Open filepath to read files
            scopy = f.read()
        if scopy == s:                        # The file already holds this content
            return 'already on server'
        else:
            with open(filepath,'w') as f:     # Open filepath to write files
                f.write(s)                    # Write file s to server
            return 'written to server'
    except Exception:
        # `except Exception` rather than a bare except, so interrupts are not swallowed
        with open(filepath,'w') as f:
            f.write(s)
        return 'written to server'

In [None]:
### Function that lists the files in /home/sgeadmin on the node that runs the call
@lbview.parallel(block = True)
def read_starcluster(s):
    """Return the directory listing of /home/sgeadmin; the argument ``s`` is not used."""
    import os
    return os.listdir('/home/sgeadmin')

In [None]:
# Load the text of the helper script into `s`; that string is what gets sent to the cluster nodes
with open('D:/Codes/_s3_IO_.py') as f:     # Open file _s3_IO_.py from local computer to read
    s = f.read()

In [None]:
# Submit the same content pnodes*3 times through the load-balanced view.
# NOTE(review): presumably the oversubmission is meant to give every node at least one task — confirm.
write_to_starcluster.map([s]*pnodes*3)     # Write file _s3_IO_.py to all nodes in starcluster

In [None]:
@lbview.parallel(block=True)
def _execute_nyt_(packet):
    """Scrape every article of one work packet and upload one file per date to S3.

    packet : sequence of (date string, list of article URLs) pairs.

    Returns ``(outputs, urlerrors)``. ``outputs`` holds, for every date that had URLs,
    either the S3 key that was written or 'failed to write <date>'. ``urlerrors``
    holds [url, date] pairs for the pages that could not be downloaded. An empty
    packet yields two empty lists.
    """
    # Imported here because the function body is what gets executed by the parallel view
    from urllib2 import urlopen                   # Open arbitrary resources by URL
    from bs4 import BeautifulSoup as bs           # Library for pulling data out of HTML and XML files
    import pandas as pd                           # Data manipulation and analysis
    import time                                   # Time access and conversions
    import datetime                               # Manipulating dates and times
    import os                                     # Miscellaneous operating system interfaces

    # Run the helper file that write_to_starcluster placed on the node.
    # NOTE: this executes the file's content as code; it is expected to define
    # _write_file_from_str_to_s3_, which is called further down. Only ship trusted code.
    with open('/home/sgeadmin/temp.py') as f:     # Open files in starcluster to read
        s = f.read()
    exec(s, globals())                            # Execute files

    # Nothing to do for an empty packet (unzipping it would fail)
    if len(packet)==0:
        return [], []
    dates, urllists = zip(*packet)                # Unzip packet to dates and urls

    ### Function that grabs full articles
    def _grab_full_nyt_(url):
        """Return [title, author, content, soup] for ``url``, or ``url`` itself when the download fails."""

        try:
            sc = urlopen(url).read()
            sc = bs(sc,'lxml')
        except Exception:
            return url

        try:
            titleline = sc.find_all('h1')
            title = titleline[0].text.strip()
        except Exception:
            title = 'N/A'

        try:
            authorline = sc.find_all('p',{'class':'byline-dateline'})
            if len(authorline)>0:
                result = []
                for a in authorline:
                    result.extend(a.find_all('span',{'class':'byline'}))
                author = [a.text.strip() for a in result]
                author = u' '.join(author)
            else:
                authorline = sc.find_all('p',{'class':'css-1cbhw1y e1x1pwtg1'})
                author = authorline[0].text.strip()
        except Exception:
            author = 'N/A'

        try:
            bodyline = sc.find_all('p',{'class':'story-body-text'})
            if len(bodyline)==0:
                bodyline = sc.find_all('p',{'class':'css-1tyen8a e2kc3sl0'})
            content = [a.text.strip() for a in bodyline]
            content = u' '.join(content)
        except Exception:
            content = 'N/A'

        return [title,author,content,sc]

    secs = 2                 # pause (seconds) after every request
    outputs = list()
    urlerrors = list()
    for i,d in enumerate(dates):
        urls = urllists[i]
        if len(urls)==0:
            continue
        data = list()
        for url in urls:
            stuff = _grab_full_nyt_(url)
            # A non-list result is the url of a page that could not be downloaded
            if isinstance(stuff, list):
                data.append(stuff+[url])
            else:
                urlerrors.append([stuff,d])
            time.sleep(secs)
        # Column names are given up front so that a date where every download failed
        # produces an empty frame instead of a column-length error
        df = pd.DataFrame(data, columns=['title','author','content','sourcecode','url'])
        df['date'] = d
        ### Save to s3
        dfstring = df.to_csv(encoding='utf-8',sep='|')
        try:
            _write_file_from_str_to_s3_(dfstring,'Data/NYTFull/nyt_full_'+d+'.txt',public=False,html=False)
            out = 'Data/NYTFull/nyt_full_'+d+'.txt'
        except Exception:
            out = 'failed to write {}'.format(d)
        outputs.append(out)
    return outputs, urlerrors

In [None]:
def chunks(l, n):
    """Cut ``l`` into consecutive slices of length ``n``; the last slice may be shorter.

    Values of ``n`` below 1 are replaced by 1. Returns a list of slices of ``l``.
    """
    step = max(1, n)
    pieces = []
    for start in range(0, len(l), step):
        pieces.append(l[start:start + step])
    return pieces

In [None]:
# Days to scrape on the cluster, as sorted and de-duplicated 'YYYY-MM-DD' strings
dates = sorted(set(day.strftime('%Y-%m-%d')
                   for day in pd.date_range('2009-06-01','2017-12-31',freq='D')))
dates[-3:]

In [None]:
# Pair every date with the list of article URLs whose pub_date equals that date
inputs = []
for dt in dates:
    inputs.append((dt, list(dfNew.loc[dfNew.pub_date == dt,'web_url'])))

In [None]:
iterspernode = 1     # roughly how many packets each node should receive
# Keep the (date, urls) pairs whose date is listed in `dates`
ndates = [pair for pair in inputs if pair[0] in dates]
# Cut the work into packets of equal size, with at least one date per packet
packet_size = max(len(ndates)//(pnodes*iterspernode), 1)
ndates = chunks(ndates, packet_size)

In [None]:
# Show the size of the first packet.
# NOTE(review): the last packet can be smaller, and this raises IndexError when ndates is empty.
'each node gets {} dates'.format(len(ndates[0]))

In [None]:
# Call _execute_nyt_ once per packet through the parallel view.
# NOTE(review): `output` presumably holds one (outputs, urlerrors) pair per packet — confirm against ipyparallel's map.
output = _execute_nyt_.map(ndates)

### Read Filenames in S3

In [None]:
import boto
import boto.s3.connection

# Open an S3 connection with boto and get a handle on the bucket that holds the scraped files.
# NOTE(review): validate=False presumably skips the check that the bucket exists — confirm against the boto docs.
conn = boto.connect_s3()
bucket = conn.get_bucket('Your bucket',validate=False)
bucket

In [None]:
# Collect the dates that already have a file in S3.
# The date is cut out of the object key itself ('Data/NYTFull/nyt_full_<date>.txt');
# slicing str(key) at fixed offsets only works for one particular bucket-name length.
s3filenames = []
for key in bucket.list(prefix='Data/NYTFull'):
    name = key.name
    # Skip anything under the prefix that is not one of the per-day files
    if name.startswith('Data/NYTFull/nyt_full_') and name.endswith('.txt'):
        keyname = name[len('Data/NYTFull/nyt_full_'):-len('.txt')]
        s3filenames.append(keyname)

In [None]:
# Names of the per-day files that were scraped to the local folder
fnames = os.listdir('D:/Data/NYTFullwithSourceCode/')

In [None]:
# Local file names look like 'nyt_full_<date>.txt'; cut out the <date> part
finished = [x[len('nyt_full_'):-len('.txt')] for x in fnames]
# Dates with neither an S3 object nor a local file are still to be scraped
unfinished = [x for x in dates if not (x in s3filenames or x in finished)]
dates = unfinished

## Read files from S3

In [None]:
### Documentation (accessing S3 Data in Python with Boto3): https://dluo.me/s3databoto3
import boto3
from botocore.client import Config

# Create the boto3 client and resource for S3 with 120-second timeouts, then select the bucket to read from
config = Config(connect_timeout=120,read_timeout=120)     # Prevent timeout error when reading big files
s3client = boto3.client('s3',config=config)               # Low-level functional-oriented API
s3resource = boto3.resource('s3',config=config)           # High-level object-oriented API
s3bucket = s3resource.Bucket('Your bucket')                  # Indicate bucket name
s3bucket

In [None]:
import pandas as pd
from pandas.compat import StringIO
import datetime, time, os, io, codecs

"""
Iterates through all the objects, doing the pagination for you. Each object is an s3.ObjectSummary, so it doesn't
contain the body. You need to call get() to get the whole body
"""
# Download every object under the prefix and parse each one as a '|'-separated table
s3keys = list()     # keys of the objects that were read
s3df = list()       # one dataframe per object
for obj in s3bucket.objects.filter(Prefix='Data/NYTFull/'):   # Use s3bucket.objects.all() for all objects
    key = obj.key
    s3keys.append(key)
    body = obj.get()['Body'].read()     # fetch the complete object body
    testdata = StringIO(body)           # file-like wrapper so read_csv can parse it
    dat = pd.read_csv(testdata,encoding='utf-8',sep='|',index_col=0)
# Alternative kept for reference: go through a temporary file instead of StringIO
#     with codecs.open('./s3temp.csv','w') as f:
#         f.write(body)
#     dat = pd.read_csv('./s3temp.csv',encoding='utf-8',sep='|',index_col=0)
    s3df.append(dat)

In [None]:
# Stack the per-object frames, order the rows by date and renumber them from 0.
# NOTE: `s3df` is rebound from a list of frames to a single DataFrame here, so this
# cell cannot be re-run without re-running the loading cell first.
s3df = pd.concat(s3df)
s3dfNew = s3df.sort_values(by=['date'])
s3dfNew.index = range(len(s3df))

## Read full-text files

In [None]:
### Function that gets title, author, and content from sourcecode
def _get_dataframe_nyt_(sourcecode):

    try:
        sc = sourcecode
        sc = bs(sc,'lxml')
    except:
        sc = 'N/A'

    try:
        titleline = sc.find_all('h1')
        title = titleline[0].text.strip()
    except:
        title = 'N/A'

    try:
        authorline = sc.find_all('p',{'class':'byline-dateline'})
        if len(authorline)>0:
            result = []
            for a in authorline:
                result.extend(a.find_all('span',{'class':'byline'}))
            author = [a.text.strip() for a in result]
            author = u' '.join(author)
        else:
            authorline = sc.find_all('p',{'class':'css-1cbhw1y e1x1pwtg1'})
            author = authorline[0].text.strip()
    except:
        author = 'N/A'

    try:
        bodyline1 = sc.find_all('p',{'class':'story-body-text'})
        if len(bodyline1)!=0:
            content = [a.text.strip() for a in bodyline1]
        else:
            bodyline2 = sc.find_all('p',{'class':'css-1tyen8a e2kc3sl0'})
            if len(bodyline2)!=0:
                content = [a.text.strip() for a in bodyline2]
            else:
                bodyline3 = sc.find_all('p',{'class':'css-1i0edl6 e2kc3sl0'})
                if len(bodyline3)!=0:
                    content = [a.text.strip() for a in bodyline3]
                else:
                    bodyline4 = sc.find_all('p',{'class':'g-body'})
                    if len(bodyline4)!=0:
                        content = [a.text.strip() for a in bodyline4]
#                     else:
#                         content = 'N/A'
        content = u' '.join(content)
    except:
        content = 'N/A'

    return [title,author,content]

In [None]:
# Names of the per-day files that include the page source
fnames = os.listdir('D:/Data/NYTFullwithSourceCode')

In [None]:
### Read files with sourcecode, fill in missing content from sourcecode, and save new files without sourcecode
# NOTE(review): `fnames` is listed from 'D:/Data/NYTFullwithSourceCode' in the cell above, while the
# files are read from 'D:/Dropbox/Data/NYTFullwithSourceCode/' here — confirm both point to the same files.
for f in fnames:
    df = pd.read_csv('D:/Dropbox/Data/NYTFullwithSourceCode/'+f,encoding='utf-8',sep='|',index_col=0)
    data = list()
    for index,row in df.iterrows():
        sourcecode = row['sourcecode']
        url = row['url']
        d = row['date']
        # Re-extract title, author and content from the saved page source
        stuff = _get_dataframe_nyt_(sourcecode)
        data.append(stuff+[url]+[d])
    # Column names are given up front so that a file without rows yields an empty
    # frame instead of a column-length error
    dfNew = pd.DataFrame(data, columns=['title','author','content','url','date'])
    dfNew.to_csv('./NYTFullwithoutSourceCode/'+f,sep='|',encoding='utf-8')

In [None]:
# Read every rebuilt per-day file, stack them, order the rows by date and renumber them from 0
fnames = os.listdir('./NYTFullwithoutSourceCode')
df = pd.concat([pd.read_csv('./NYTFullwithoutSourceCode/'+f,encoding='utf-8',sep='|',index_col=0)
                for f in fnames])
df1 = df.sort_values(by=['date'])
df1.index = range(len(df1))

In [None]:
# Days covered by the data set, as sorted and de-duplicated 'YYYY-MM-DD' strings
dates = sorted(set(day.strftime('%Y-%m-%d')
                   for day in pd.date_range('2009-01-01','2017-12-18',freq='D')))

In [None]:
### Get list of dates and urls of rows with empty content by date
# NOTE(review): the only visible assignment of `dat` is the per-object frame inside the S3
# loading loop; no frame of "rows with empty content" is built in the visible cells —
# confirm which dataframe is expected here.
worklist = [(dt, list(dat.loc[dat.date==dt,'url'])) for dt in dates]

In [None]:
# Split the (date, urls) pairs into two parallel tuples; the unpacking fails when worklist is empty
datelists, urllists = zip(*worklist)

In [None]:
### Fill in more missing content from sourcecode and save new files without sourcecode
for i,d in enumerate(datelists):
    urls = urllists[i]
    if len(urls)==0:
        continue
    orgdf = pd.read_csv('D:/Dropbox/Data/NYTFullwithSourceCode/nyt_full_'+d+'.txt',encoding='utf-8',sep='|',index_col=0)
    # Gather the rows of the wanted urls, in the order of `urls`, with one concat
    # (appending to a dataframe inside the loop copies all rows collected so far each time)
    repdf = pd.concat([orgdf[orgdf['url']==address] for address in urls])
    data = list()
    for index,row in repdf.iterrows():
        sourcecode = row['sourcecode']
        url = row['url']
        # Re-extract title, author and content from the saved page source
        stuff = _get_dataframe_nyt_(sourcecode)
        data.append(stuff+[url]+[d])
    # Column names are given up front so that a date without matching rows yields an
    # empty frame instead of a column-length error
    dfNew = pd.DataFrame(data, columns=['title','author','content','url','date'])
    dfNew.to_csv('./NYTFullwithoutSourceCodeV1/nyt_full_'+d+'.txt',sep='|',encoding='utf-8')

In [None]:
# Read every V1 per-day file, stack them, order the rows by date and renumber them from 0
fnames = os.listdir('./NYTFullwithoutSourceCodeV1')
dfV1 = pd.concat([pd.read_csv('./NYTFullwithoutSourceCodeV1/'+f,encoding='utf-8',sep='|',index_col=0)
                  for f in fnames])
dfV1New = dfV1.sort_values(by=['date'])
dfV1New.index = range(len(dfV1New))

In [None]:
# Replace empty content with new content and save new dataframe
fnames = os.listdir('./NYTFullwithoutSourceCodeV1')
for f in fnames:
    dfV1 = pd.read_csv('./NYTFullwithoutSourceCodeV1/'+f,encoding='utf-8',sep='|',index_col=0)
    # count() ignores NaN, so a positive count means this file has recovered content
    if dfV1['content'].count()>0:
        dfV1 = dfV1.dropna(subset=['content'])
        dfV0 = pd.read_csv('./NYTFullwithoutSourceCode/'+f,encoding='utf-8',sep='|',index_col=0)
        # The V1 rows come last, so keep='last' lets them replace the V0 row with the same url
        dfV0 = pd.concat([dfV0, dfV1])
        dfV0 = dfV0.drop_duplicates(subset=['url'],keep='last').reset_index(drop=True)
        dfV0.to_csv('./NYTFullwithoutSourceCodeV2/'+f,sep='|',encoding='utf-8')

In [None]:
# Read every per-day file of the Dropbox copy, stack them, order the rows by date and renumber them from 0
fnames = os.listdir('D:/Dropbox/Data/NYTFullwithoutSourceCode')
df = pd.concat([pd.read_csv('D:/Dropbox/Data/NYTFullwithoutSourceCode/'+f,encoding='utf-8',sep='|',index_col=0)
                for f in fnames])
df1 = df.sort_values(by=['date'])
df1.index = range(len(df1))

## Match full-text dataframe with original dataframe that has metadata

In [None]:
# dfOrg: the article list; dfFull: the scraped full texts, restricted to rows that have content
dfOrg = pd.read_csv('D:/Data/NYTimesArticleList.txt',encoding='utf-8',sep='|',index_col=0)
dfFull = pd.read_csv('D:/Data/NYTFull.txt',encoding='utf-8',sep='|',index_col=0)
dfFull.dropna(subset=['content'],inplace=True)

In [None]:
# Remove rows with duplicate urls (the first occurrence of every url is kept)
dfOrg1 = dfOrg.drop_duplicates(subset='web_url',keep='first')
dfFull1 = dfFull.drop_duplicates(subset='url',keep='first')

In [None]:
# Select the rows of dfOrg1 whose web_url also appears as a url in dfFull1
df1 = dfOrg1[dfOrg1['web_url'].isin(dfFull1['url'])]

In [None]:
# For every full-text row, look up the row of dfOrg with the same url and append its metadata
full_columns = ['title','author','content','url','date']
meta_columns = ['document_type','type_of_material','news_desk','headline','keywords','print_page',
                'pub_date','section_name','subsection_name','source','web_url','word_count']
data = list()
for index,row in dfFull.iterrows():
    url = row['url']
    # regex=False: characters such as '.', '?' and '+' in a url must be matched literally;
    # na=False: rows of dfOrg without a web_url never match
    matchrow = dfOrg[dfOrg['web_url'].str.contains(url,regex=False,na=False)]
    if len(matchrow)==0:
        # Same exception type as indexing the empty match, but it names the url
        raise IndexError('no row of dfOrg matches url %s' % url)
    # Values of the full-text row followed by the metadata of the first matching row
    matchlist = [row[name] for name in full_columns] + [matchrow[name].values[0] for name in meta_columns]
    data.append(matchlist)

df = pd.DataFrame(data, columns=full_columns+meta_columns)

In [None]:
# Sort two dataframes by url and renumber their rows from 0
df2 = df1.sort_values(by=['web_url']).reset_index(drop=True)
dfFullNew = dfFull1.sort_values(by=['url']).reset_index(drop=True)

In [None]:
# Select relevant columns from df2 (copied, so dfOrgNew is independent of df2)
dfOrgNew = df2[['document_type','type_of_material','news_desk','headline','keywords','print_page',
                'pub_date','section_name','subsection_name','source','web_url','word_count']].copy()

In [None]:
# Combine two dataframes and sort the new dataframe by date.
# The frames are joined on the article url instead of being put side by side by row
# position, so a full-text row can only receive the metadata of the article with the same
# url, even when the two frames do not hold exactly the same set of urls.
df = pd.merge(dfFullNew,dfOrgNew,how='inner',left_on='url',right_on='web_url')
df = df.sort_values(by=['date']).reset_index(drop=True)
df1 = df[['title','author','content','date','document_type','type_of_material','news_desk',
         'section_name','subsection_name','word_count']].copy()

In [None]:
# Group the document types by calendar month: keep date and document_type, parse the dates,
# then bin the rows with a monthly Grouper on 'date'
df4 = df1[['date','document_type']].copy()
df4['date'] = pd.to_datetime(df4['date'])
df4group = df4.groupby(pd.Grouper(key='date',freq='M'))
df4grouplist = list(df4group['document_type'])     # list of (group key, Series of document types) pairs

In [None]:
# For every month, count the document types and compute each type's share of the month's rows
finallist = list()
for i in range(len(df4grouplist)):
    date = df4grouplist[i][0]
    doclist = list(df4grouplist[i][1])
    articlecount = doclist.count('article')
    blogpostcount = doclist.count('blogpost')
    multimediacount = doclist.count('multimedia')
    total = len(doclist)
    if total==0:
        # A month without any row has no defined share; NaN avoids a division by zero
        artpercent = blogpercent = mediapercent = float('nan')
    else:
        artpercent = float(articlecount)/total
        blogpercent = float(blogpostcount)/total
        mediapercent = float(multimediacount)/total
    finallist.append([date,articlecount,blogpostcount,multimediacount,total,artpercent,blogpercent,mediapercent])
# Column names are given up front so that an empty result still has the expected columns
df4doc = pd.DataFrame(finallist, columns=['date','article_count','blogpost_count','multimedia_count','total_count',
                                          'article_percent','blogpost_percent','multimedia_percent'])

In [None]:
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
# Plot the monthly shares of articles and blog posts.
# NOTE(review): 'date' is an ordinary column here, so the x axis is presumably the row number — confirm.
dfpercent = df4doc[['date','article_percent','blogpost_percent']].copy()
dfpercent.plot()

In [None]:
# Group the shares by month on 'date' (the mean is taken per month) and plot the result
dfpercent1 = dfpercent.groupby(pd.Grouper(key='date',freq='M')).mean()
plt.plot(dfpercent1)

### Detect English texts and remove non-English texts from data

In [None]:
# Load the combined full-text + metadata table
dfNYT = pd.read_csv('/mnt/data/NYTFullNew.txt',encoding='utf-8',sep='|',index_col=0)

In [None]:
# Work on a separate frame: the rows of dfNYT that have content, renumbered from 0
dataNYT = dfNYT.dropna(subset=['content']).reset_index(drop=True)

In [None]:
import pycld2 as cld2

In [None]:
# Control characters to blank out before language detection: 0x00-0x08, 0x0b, 0x0e-0x1f
# and 0x7f-0x9f (tab, line feed, form feed and carriage return are not in the list)
bad_chars = [u'%c' % code
             for code in list(range(0x00,0x09)) + [0x0b] + list(range(0x0e,0x20)) + list(range(0x7f,0xa0))]

In [None]:
# Detect English texts using pycld2 on dataNYT.
# For every row the characters listed in bad_chars are replaced by spaces, the text is passed
# to cld2 as UTF-8 bytes, and the row is kept when the code of the first reported language is 'en'.
count1 = 0          # number of rows examined
EngNYT = []         # kept rows; the field order matches the columns used to build dataEngNYT
start_time1 = time.time()
for index,row in dataNYT.iterrows():
    title = row['title']
    author = row['author']
    text = row['content']
    url = row ['url']
    date = row['date']
    document_type = row['document_type']
    type_of_material = row['type_of_material']
    news_desk = row['news_desk']
    headline = row['headline']
    keywords = row['keywords']
    print_page = row['print_page']
    pub_date = row['pub_date']
    section_name = row['section_name']
    subsection_name = row['subsection_name']
    source = row['source']
    web_url = row['web_url']
    word_count = row['word_count']
    count1 += 1
    newtext = text
    for item in bad_chars:
        newtext = newtext.replace(item,' ')
    t = newtext.encode('utf-8')
    # The middle element gets its own name so that the loop's row label `index` is not overwritten
    reliable, bytes_found, top_3_choices = cld2.detect(t)
    lang = top_3_choices[0][1]
    if lang=='en':
        EngNYT.append([date,pub_date,title,headline,author,newtext,document_type,type_of_material,news_desk,
                       keywords,section_name,subsection_name,source,print_page,word_count,url,web_url])
run_time1 = time.time()-start_time1
print("This takes %s seconds to run" %run_time1)
# Same output as before, written in the parenthesised form used on the line above
print("Length of EngNYT =  %s" % len(EngNYT))

In [None]:
# Build the dataframe of English articles; the cleaned text collected in EngNYT is stored as 'content'
dataEngNYT = pd.DataFrame(EngNYT,columns=['date','pub_date','title','headline','author','content','document_type',
                                          'type_of_material','news_desk','keywords','section_name','subsection_name',
                                          'source','print_page','word_count','url','web_url'])

In [None]:
# NOTE(review): dataEngNYT is built with a 'content' column and without a 'newtext' column,
# so this rename appears to have no effect — confirm before relying on it.
dataEngNYT = dataEngNYT.rename(columns={'newtext':'content'})

### Clean texts before running analysis

In [None]:
import pandas as pd, numpy as np, datetime, time, os, re

In [None]:
# Read the English-only table in chunks of 10000 rows and stack the chunks into one dataframe
DF = pd.read_csv('/mnt/data/TextAnalysis-All/NYTFullEng.txt',encoding='utf-8',sep='|',index_col=0,chunksize=10000)
DF = pd.concat(DF)

In [None]:
# Keep the first row of every url, drop the rows whose content is NaN, and renumber from 0
df = DF.drop_duplicates(subset='url',keep='first').dropna(subset=['content']).reset_index(drop=True)

In [None]:
# Drop rows that are not real articles, identified by their title.
# The numbers in parentheses are the row counts noted by the author.
# Remove rows reporting lottery numbers (1648)
df = df[df['title']!='Lottery Numbers'].reset_index(drop=True)
# Remove rows reporting operating hours on New Year's Day (2)
df = df[df['title']!=u'New Year\u2019s Day'].reset_index(drop=True)
# Remove rows that are corrections (4330)
df = df[df['title']!=u'Corrections']
df = df[df['title']!=u'Correction'].reset_index(drop=True)
# Remove rows that are Word+Quiz (261)
# NOTE(review): only rows from position 702900 onwards are searched — presumably where these
# titles start in this particular data set; confirm before reusing on other data.
indices = list()
for index,row in df[702900:].iterrows():
    # A float title is a missing value (NaN) and cannot be searched
    if type(row['title'])!=float:
        if u'Word + Quiz' in row['title']:
#             print index
            indices.append(index)
# The index was reset just above, so the collected labels are also valid positions
df.drop(df.index[indices],inplace=True)
df = df.reset_index(drop=True)

In [None]:
def _clean_nyt_texts_(df):
    texts = list()
    for index,row in df.iterrows():
        text0 = row['content']
        if 0<=text0.find(u'\u2014')<=30:
            if text0[0:3].isupper():
                text1 = text0[text0.find(u'\u2014')+1:]
            else:
                text1 = text0
        else:
            text1 = text0
        text2 = re.sub(r'\b[A-Za-z0-9-_@,.!/]+.com\b','',text1)
        text2 = re.sub(r'\b[A-Za-z0-9-_@,.!/]+.co.za\b','',text2)
        text2 = re.sub(r'\b[A-Za-z0-9-_@,.!/]+.gov\b','',text2)
        text2 = re.sub(r'\b[A-Za-z0-9-_@,.!/]+.org\b','',text2)
        text2 = re.sub(r'\b[A-Za-z0-9-_@,.!/]+.htm\b','',text2)
        text2 = re.sub(r'\b[A-Za-z0-9-_@,.!/]+.io\b','',text2)
        text3 = ' '.join(text2.split())
        texts.append(text3)
    return texts

In [None]:
# Clean every text, store the result as 'cleaned_text', reorder the columns and sort by date
start_time = time.time()
data1 = df.copy()
result = _clean_nyt_texts_(data1)
data1['cleaned_text'] = result
oldcols = data1.columns.tolist()
newcols = oldcols[:6]+oldcols[-1:]+oldcols[6:-1]     # move the last column (cleaned_text) to position 6
data2 = data1[newcols]
data2 = data2.sort_values(by=['date']).reset_index(drop=True)
print("This takes %s seconds to run" %(time.time()-start_time))

In [None]:
# Read the cleaned table in chunks of 10000 rows and stack the chunks into one dataframe
dfNYT = pd.read_csv('/mnt/data/TextAnalysis-All/NYTcleaned.txt',encoding='utf-8',sep='|',index_col=0,chunksize=10000)
dfNYT = pd.concat(dfNYT)

In [None]:
# df = dfNYT[['date','title','author','content','cleaned_text','document_type','type_of_material']].copy()
df = dfNYT.copy()
# Remove rows [300072,331117,452168] that have almost no content after being cleaned again
df.drop(df.index[[300072,331117,452168]],inplace=True)
# reset_index returns a new frame; assign it back so the renumbering actually takes effect
df = df.reset_index(drop=True)

In [None]:
# Lower-case version of every cleaned text, in row order
lower_texts = [cleaned.lower() for cleaned in df['cleaned_text']]
len(lower_texts)

In [None]:
# Keep df untouched: add the lower-cased texts to a copy
df1 = df.copy()
df1['lower_text'] = lower_texts

In [None]:
# Second cleaning pass: cut leftover JavaScript / markup fragments out of every cleaned text.
# Each rule looks for a start marker, works out where the fragment ends, and removes that
# fragment (every occurrence of the exact fragment text) from the article. indicesN records
# the row labels on which rule N fired; newtexts receives the final text of every row.
newtexts = list(); indices1 = list(); indices2 = list(); indices3 = list(); indices4 = list(); indices5 = list()
indices6 = list(); indices7 = list(); indices8 = list(); indices9 = list(); indices10 = list(); indices11 = list()
indices12 = list(); indices13 = list(); indices14 = list(); indices15 = list()

for index,row in df1.iterrows():
    cleaned_text = row['cleaned_text']
#     lower_text = row['lower_text']
    text1 = cleaned_text
    # Rule 1 (repeated): slideshow factory call, up to and including the next ';'
    if cleaned_text.find(u'NYTD.fullScreenSlideShowFactory.createSlideshow')>=0:
        while text1.find(u'NYTD.fullScreenSlideShowFactory.createSlideshow')>=0:
            start1 = text1.find(u'NYTD.fullScreenSlideShowFactory.createSlideshow')
            semicolon1 = text1[start1:].find(u';')
            if semicolon1<0:
                # No ';' after the marker: there is nothing to cut, and trying again
                # would repeat forever on the unchanged text
                break
            end1 = start1+semicolon1+1
            term1 = text1[start1:end1]
            text1 = text1.replace(term1,'')
        indices1.append(index)
    # Rule 2: 'var data;' up to the last '}' found before position start+300
    if text1.find(u'var data;')>=0:
        start2 = text1.find(u'var data;')
        end2 = text1[:start2+300].rfind(u'}')+1
        term2 = text1[start2:end2]
        text2 = text1.replace(term2,'')
        indices2.append(index)
    else:
        text2 = text1
    # Rule 3: '$.noConflict()' up to the last '});' found before position start+780
    if text2.find(u'$.noConflict()')>=0:
        start3 = text2.find(u'$.noConflict()')
        end3 = text2[:start3+780].rfind(u'});')+len(u'});')
        term3 = text2[start3:end3]
        text3 = text2.replace(term3,'')
        indices3.append(index)
    else:
        text3 = text2
    text4 = text3
    # Rule 4 (repeated): 'jQuery(document)' up to the next '});'
    if text3.find(u'jQuery(document)')>=0:
        while text4.find(u'jQuery(document)')>=0:
            start4 = text4.find(u'jQuery(document)')
            end4 = start4+text4[start4:].find(u'});')+len(u'});')
            term4 = text4[start4:end4]
            text4 = text4.replace(term4,'')
        indices4.append(index)
    text5 = text4
    # Rule 5 (repeated): '(function($)' up to the next 'NYTD.jQuery);'
    if text4.find(u'(function($)')>=0:
        while text5.find(u'(function($)')>=0:
            start5 = text5.find(u'(function($)')
            end5 = start5+text5[start5:].find(u'NYTD.jQuery);')+len(u'NYTD.jQuery);')
            term5 = text5[start5:end5]
            text5 = text5.replace(term5,'')
        indices5.append(index)
    # Rule 6: 'NYTD.jQuery = jQuery.noConflict()' up to the next '});'
    if text5.find(u'NYTD.jQuery = jQuery.noConflict()')>=0:
        start6 = text5.find(u'NYTD.jQuery = jQuery.noConflict()')
        end6 = start6+text5[start6:].find(u'});')+len(u'});')
        term6 = text5[start6:end6]
        text6 = text5.replace(term6,'')
        indices6.append(index)
    else:
        text6 = text5
    # Rule 7: 'NYTD.hotSpotsFactory' up to the next ';'
    if text6.find(u'NYTD.hotSpotsFactory')>=0:
        start7 = text6.find(u'NYTD.hotSpotsFactory')
        end7 = start7+text6[start7:].find(u';')+1
        term7 = text6[start7:end7]
        text7 = text6.replace(term7,'')
        indices7.append(index)
    else:
        text7 = text6
    # Rule 8: 'new NYTD.NYTINT' up to the next '.photos);', or up to 'fetch();' when that is missing
    if text7.find(u'new NYTD.NYTINT')>=0:
        start8 = text7.find(u'new NYTD.NYTINT')
        end8 = start8+text7[start8:].find(u'.photos);')+len(u'.photos);')
        if end8<=start8+len(u'.photos);'):
            end8 = start8+text7[start8:].find(u'fetch();')+len(u'fetch();')
        term8 = text7[start8:end8]
        text8 = text7.replace(term8,'')
        indices8.append(index)
    else:
        text8 = text7
    # Rule 9: 'new NYTD.NYTMM' up to the next '});'
    if text8.find(u'new NYTD.NYTMM')>=0:
        start9 = text8.find(u'new NYTD.NYTMM')
        end9 = start9+text8[start9:].find(u'});')+len(u'});')
        term9 = text8[start9:end9]
        text9 = text8.replace(term9,'')
        indices9.append(index)
    else:
        text9 = text8
    # Rule 10: 'var NYTD = window.NYTD' up to the last '});' within the following 660 characters
    if text9.find(u'var NYTD = window.NYTD')>=0:
        start10 = text9.find(u'var NYTD = window.NYTD')
        end10 = start10+text9[start10:start10+660].rfind(u'});')+len(u'});')
        term10 = text9[start10:end10]
        text10 = text9.replace(term10,'')
        indices10.append(index)
    else:
        text10 = text9
    # Rule 11: 'NYTD.NYTD.' up to the next '1em;}'
    if text10.find(u'NYTD.NYTD.')>=0:
        start11 = text10.find(u'NYTD.NYTD.')
        end11 = start11+text10[start11:].find(u'1em;}')+len(u'1em;}')
        term11 = text10[start11:end11]
        text11 = text10.replace(term11,'')
        indices11.append(index)
    else:
        text11 = text10
    # Rule 12: 'NYTD.NYTINT.' up to the next '});'
    if text11.find(u'NYTD.NYTINT.')>=0:
        start12 = text11.find(u'NYTD.NYTINT.')
        end12 = start12+text11[start12:].find(u'});')+len(u'});')
        term12 = text11[start12:end12]
        text12 = text11.replace(term12,'')
        indices12.append(index)
    else:
        text12 = text11
    # Rule 13: CDATA block, from '// < ![CDATA[' up to the next '// ]]>'
    if text12.find(u'// < ![CDATA[')>=0:
        start13 = text12.find(u'// < ![CDATA[')
        end13 = start13+text12[start13:].find(u'// ]]>')+len(u'// ]]>')
        term13 = text12[start13:end13]
        text13 = text12.replace(term13,'')
        indices13.append(index)
    else:
        text13 = text12
    # Rule 14: 'NYTD.jQuery(function()' up to the next '});'
    if text13.find(u'NYTD.jQuery(function()')>=0:
        start14 = text13.find(u'NYTD.jQuery(function()')
        end14 = start14+text13[start14:].find(u'});')+len(u'});')
        term14 = text13[start14:end14]
        text14 = text13.replace(term14,'')
        indices14.append(index)
    else:
        text14 = text13
    # Rule 15: '<!-- #screen_name_input' up to the next '// ]]>'
    if text14.find(u'<!-- #screen_name_input')>=0:
        start15 = text14.find(u'<!-- #screen_name_input')
        end15 = start15+text14[start15:].find(u'// ]]>')+len(u'// ]]>')
        term15 = text14[start15:end15]
        text15 = text14.replace(term15,'')
        indices15.append(index)
    else:
        text15 = text14
    # Dateline: when an em dash occurs within the first 31 characters and the text starts with
    # three upper-case characters, drop everything up to and including the dash
    if 0<=text15.find(u'\u2014')<=30:
        if text15[0:3].isupper():
            text16 = text15[text15.find(u'\u2014')+1:]
        else:
            text16 = text15
    else:
        text16 = text15
    # Collapse runs of whitespace left behind by the removals
    text16 = u' '.join(text16.split())
    newtexts.append(text16)