In [22]:
#Imports
from bs4 import BeautifulSoup
import re,nltk,string,datetime
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import numpy as np

# Filtering the data

Each XML file contains a lot of text that is not relevant to the analysis, so we first have to filter it.

We will first try several approaches to find which one runs the fastest.

For all of them the first part is to extract the text data from the file.

In [2]:
# Parse the '1996/07.xml' archive file into a BeautifulSoup tree (lxml parser).
# The context manager closes the file handle even if parsing raises,
# which the previous open()/close() pair did not guarantee.
with open('1996/07.xml','r') as f:
    soupArticle = BeautifulSoup(f,'lxml')

In [3]:
# For every <full_text> element: strip all digit runs, then lowercase the text.
Article = [
    re.sub(r'\d+', '', node.text).lower()
    for node in soupArticle.find_all('full_text')
]

# Single space-joined string holding the text of every extracted element.
textArticle = ' '.join(Article)

The first approach we try uses nltk and word tokenization.

First we take care of the stopwords. We have to start by manually extending the French stopword list, because it is missing a lot of words that are not important for the analysis. There are also single letters that we have to add, because some parts of the XML contain essentially random characters that give bizarre results in the counting.

In [4]:
# Start from NLTK's French stop-word list and extend it by hand.
stop_words = set(stopwords.words('french'))

stop_words.update(
    # Punctuation, symbols and tokenizer artefacts.
    ('.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']',
     '{', '}', '<', '>', '...', '-', '•', '/', '%', '«', '»',
     '..', '©', '*', '``', '„', '&', '_', "''", '|', '£', '®', '+',
     '-«', '--', '.-'),
    # Initials and short abbreviation-like tokens.
    ('a.', 'b.', 'c.', 'd.', 'e.', 'f.', 'g.', 'h.', 'i.', 'j.', 'l.',
     'm.', 'n.', 'o.', 'p.', 'q.', 'r.', 's.', 't.', 'u.', 'v.', 'w.',
     'x.', 'y.', 'z.', 'av', 'fr', 'dm', 'di', 'fd', 'cs'),
    # Frequent French words that carry no information for the analysis.
    ('le', 'les', 'plus', 'cette', "d'un", "d'une", "c'est", "jusqu'au",
     'entre', 'comme', 'si', "qu'il", "n'a", 'alors', "s'est", "n'est",
     'dès', 'où', "jusqu'à", 'déjà'),
)

# There are some random letters and we do not want to keep them in our
# word count; iterating a string yields its individual characters, so every
# whitespace character and every lowercase ASCII letter is added.
stop_words.update(string.whitespace, string.ascii_lowercase)

We test to see how long it takes with nltk FreqDist and compare it with the counting 
from Collections.

In [5]:
# Tokenize the whole text and wrap the tokens in an nltk.Text.
# NOTE(review): word_tokenize is called with its default language although the
# corpus is French -- confirm whether language='french' was intended.
tokensArticle = nltk.word_tokenize(textArticle)
ArticleNltk = nltk.Text(tokensArticle)

# Lowercase each token once, then keep only those that are not stop words.
textArticleClean = list(
    filter(lambda tok: tok not in stop_words, map(str.lower, ArticleNltk))
)

# Frequency distribution computed by NLTK itself.
frequentArticle = nltk.FreqDist(textArticleClean)

In [6]:
# Keep the 1000 most frequent tokens as (token, count) pairs ...
countFreq = frequentArticle.most_common(1000)
# ... and preview the first 8 of them.
countFreq[:8]

[('ans', 740),
 ('suisse', 449),
 ('deux', 282),
 ('genève', 276),
 ('fait', 202),
 ('sans', 202),
 ('tout', 198),
 ('juin', 183)]

In [7]:
# Same tokenization and filtering as for the FreqDist variant, repeated here so
# that this cell can be timed on its own.
tokensArticle = nltk.word_tokenize(textArticle)
ArticleNltk = nltk.Text(tokensArticle)
textArticleClean = [
    tok
    for tok in (raw.lower() for raw in ArticleNltk)
    if tok not in stop_words
]

# Re-join the kept tokens and count them with collections.Counter
# instead of nltk.FreqDist.
textClean = ' '.join(textArticleClean)
countCounter = Counter(textClean.split()).most_common(1000)

In [8]:
# Preview the 8 most frequent tokens found with collections.Counter.
countCounter[:8]

[('ans', 740),
 ('suisse', 449),
 ('deux', 282),
 ('genève', 276),
 ('fait', 202),
 ('sans', 202),
 ('tout', 198),
 ('juin', 183)]

When using nltk we see that both counting approaches give the same results, which is expected: the main point of this test was to see whether collections.Counter would be faster than nltk.FreqDist. Now we will try without using NLTK, by just filtering the text.

We will also test a brute force approach by working directly with the text string.

We first begin by creating a list containing the stop words, which we will use to clean the text from undesired characters. For that we can use the stop words from nltk as well as our own.

In [9]:
# Text that the list-based method works on (same string object as textArticle).
listArticle = textArticle

# List version of the stop words, completed with tokens that only show up when
# the text is split on spaces (punctuation glued together or to nothing).
stop_wordsList = list(stop_words) + [
    '', '-.', '.,', '(,', '»,', ').', '-.', 'av.', '».',
    '....', '..', '...', '.....', '()', '(r).', '—', ')-«',
    '".', '-,', '(-.', ')-.', '(©',
]

In [10]:
# Split the raw text on single spaces (consecutive spaces yield '' entries,
# which are listed in stop_wordsList and therefore dropped below).
wordsArticle = listArticle.split(' ')

# Build a set once so that each membership test is O(1) instead of a linear
# scan through the stop-word list for every single word of the text.
stop_wordsLookup = set(stop_wordsList)
words = [word for word in wordsArticle if word not in stop_wordsLookup]

# 1000 most frequent remaining words as (word, count) pairs.
countList = Counter(words).most_common(1000)

In [11]:
# Preview the 8 most frequent words found with the list-based method.
countList[:8]

[('ans)', 464),
 ('fr.', 449),
 ('suisse', 346),
 ('deux', 279),
 ('genève', 212),
 ('tout', 194),
 ('sans', 189),
 ('fait', 183)]

We also try another method where we remove from the text each symbol that does not suit us. For that we need a dictionary containing only the most basic symbols.

Then we also have to go through all the stopwords, because otherwise we keep the words that do not bring any information (le, la, les, etc.).

In [12]:
# Every ASCII punctuation character except the apostrophe and the hyphen:
# we do not want to stick together all the words that have an apostrophe
# (l'armée should not become larmée).
listPunct = [symbol for symbol in string.punctuation if symbol not in ("'", "-")]

# Map each remaining punctuation character to the empty string.
stopDict = dict.fromkeys(listPunct, '')

In [13]:
# Working copy for the dict-based method. This only binds a second name to the
# same string; textArticle itself is never modified because str is immutable.
dictArticle = textArticle

In [14]:
# Remove every character listed in stopDict in ONE pass over the text.
# stopDict maps single characters to '', so a translation table built from it
# deletes them; this gives the same string as calling str.replace once per
# character, without rescanning the whole text for each of them.
dictArticle = dictArticle.translate(str.maketrans(stopDict))

# Set lookup is O(1) per word, versus a linear scan of the stop-word list.
stop_wordsLookup = set(stop_wordsList)
dictCounter = [
    wordsDict
    for wordsDict in dictArticle.split(' ')
    if wordsDict not in stop_wordsLookup
]

# 1000 most frequent remaining words as (word, count) pairs.
countDict = Counter(dictCounter).most_common(1000)

In [15]:
# Preview the 10 most frequent words found with the dict-based method.
countDict[:10]

[('ans', 735),
 ('suisse', 449),
 ('deux', 282),
 ('genève', 275),
 ('fait', 202),
 ('sans', 202),
 ('tout', 198),
 ('juin', 182),
 ('centre', 177),
 ('sep', 176)]

We see that for the 4 methods that we tested there is a clear tradeoff between speed and the accuracy of the word counts. Going with nltk takes about 5 times longer but gives a more accurate representation of the words. Working brute force with the string data goes much faster, but does not give exactly the same results.

The timing of each method (computed with the **%%timeit** magic function) is summarized in the following table.



| Method         |  NLTK  |   List  |  Dict  |
|----------|------|------|------|
| Time [s] | 9    | 1.5  | 1.5  |

In [16]:
def _counts_to_row(counts, label):
    """Turn a list of (word, count) pairs into a one-row DataFrame.

    Parameters
    ----------
    counts : list of (str, int) tuples, e.g. the result of ``most_common``.
    label : index label given to the single resulting row.

    Returns
    -------
    pandas.DataFrame with one column per word and one row named ``label``.
    """
    frame = pd.DataFrame(counts).transpose()
    # Row 0 holds the words: promote it to the column labels, then drop it so
    # that only the row of counts is left.
    frame.columns = frame.loc[0]
    frame = frame.drop(0)
    frame.index = [label]
    return frame


dfFreq = _counts_to_row(countFreq, 'NLTK word counts')
dfDict = _counts_to_row(countDict, 'Dict word count')
dfList = _counts_to_row(countList, 'List Word count')

# Stack the three rows; words missing from a method become NaN.
# sort=True keeps the columns in alphabetical order on current pandas, and
# .iloc replaces the removed .ix indexer (the slice is positional here).
pd.concat([dfFreq,dfDict,dfList],axis=0,sort=True).iloc[:,:10]

Unnamed: 0,""").",$,%),"%,",%.,').,'s,(+,(.,(afp)
NLTK word counts,,31.0,,,,,27.0,,,
Dict word count,,,,,,,,,,
List Word count,18.0,29.0,18.0,17.0,30.0,18.0,,36.0,33.0,36.0


In [17]:
# Columns 180-189 (by position) of the stacked counts. .ix was removed from
# pandas, so the positional slice uses .iloc; sort=True keeps the columns in
# alphabetical order on current pandas.
pd.concat([dfFreq,dfDict,dfList],axis=0,sort=True).iloc[:,180:190]

Unnamed: 0,cantonal,cantonale,cantons,cap,cap.,capital,car,caractère,carouge,"carouge,"
NLTK word counts,24,19,25,115,,44,59,,45,
Dict word count,24,19,25,115,,44,59,,45,
List Word count,21,17,20,73,39.0,40,55,18.0,23,18.0


In [18]:
# Columns 500-509 (by position) of the stacked counts. .ix was removed from
# pandas, so the positional slice uses .iloc; sort=True keeps the columns in
# alphabetical order on current pandas.
pd.concat([dfFreq,dfDict,dfList],axis=0,sort=True).iloc[:,500:510]

Unnamed: 0,"gibson,",giraudeau,global,gouvernement,gp,graf,grand,grande,grandes,grands
NLTK word counts,,29.0,31,58,27.0,31,98,56,36,27
Dict word count,,29.0,31,58,27.0,31,98,56,36,27
List Word count,21.0,,30,48,,25,98,55,36,27


We can see by looking into words that were counted that it seems nltk and dict word counting produce a result that is quite the same, and the list method is not good (no removal of commas, no removal of a lot of special signs). 

Given the similarity between the NLTK and dict results, and given the fact that the dict method is 5 times as fast, we will implement a dict cleaning method in our data extraction pipeline.

Below we show the Euclidean distance between NLTK and dictVal and between NLTK and listVal, to have a quantitative look at how they differ.

In [19]:
# One row per counting method, one column per word (NaN where a method did not
# count that word). sort=True orders the columns alphabetically, which current
# pandas no longer does by default and which the positional column slices
# further down rely on.
merge = pd.concat([dfFreq,dfDict,dfList],axis=0,sort=True)

In [20]:
# Count vector of each method as a float array, with NaN (word not counted by
# that method) replaced by 0. The np.float alias was removed from NumPy; the
# builtin float is the same dtype.
NLTK = np.nan_to_num(merge.loc['NLTK word counts'].values.astype(float))
dictVal = np.nan_to_num(merge.loc['Dict word count'].values.astype(float))
listVal = np.nan_to_num(merge.loc['List Word count'].values.astype(float))

In [21]:
# Euclidean (L2) distance between the NLTK count vector and each alternative.
print('NLTK and dict difference : ', np.linalg.norm(NLTK-dictVal))
print('NLTK and list difference : ', np.linalg.norm(NLTK-listVal))

NLTK and dict difference :  322.001552791
NLTK and list difference :  1129.28517213


We see that as expected NLTK and list are much more separated.

Let's take a look at which words are not well understood in NLTK and dict.

In [38]:
# Per-word difference between the NLTK and the dict counts.
diff = NLTK - dictVal

# First two rows of merge (NLTK and dict), restricted to the words that NLTK
# counted more often. .ix was removed from pandas: rows are taken by position
# with .iloc and columns by boolean mask with .loc.
dfDiff = merge.iloc[0:2].loc[:, diff > 0]
dfDiff.iloc[:, 20:35]

Unnamed: 0,juil.,juin,lac,lausanne,mai,ml,mois,nlg,no,oct.,sep.,sept.,swissca,travail,urgences
NLTK word counts,25.0,183,27,153,60,21,176,55,53,48.0,172.0,19.0,38,129,40
Dict word count,,182,23,150,59,20,175,52,52,,,,37,128,36


In [43]:
# Columns 30-39 (by position) of the differing words; .iloc replaces the
# removed .ix indexer.
dfDiff.iloc[:,30:40]

Unnamed: 0,sep.,sept.,swissca,travail,urgences,ville,yen,~,—
NLTK word counts,172.0,19.0,38,129,40,137,55,34.0,85.0
Dict word count,,,37,128,36,119,54,,


In [39]:
# Labels of the words that NLTK counted more often than the dict method.
# Masking the column Index directly avoids the removed .ix indexer.
merge.columns[diff > 0]

Index(['$', ''s', '-.', '=', '^', 'allemand', 'ans', 'aug.', 'avant', 'banque',
       'chf', 'd'affaires', 'dec.', 'entreprises', 'france', 'fériés', 'gen.',
       'genève', 'inc.', 'jan.', 'juil.', 'juin', 'lac', 'lausanne', 'mai',
       'ml', 'mois', 'nlg', 'no', 'oct.', 'sep.', 'sept.', 'swissca',
       'travail', 'urgences', 'ville', 'yen', '~', '—'],
      dtype='object')

With this analysis we see two things: first, that there are not a lot of words where a difference can be found, and second, that when there is a difference it is not very large. So by using the dict method we gain speed without losing a lot of information.

The next two cells summarize the fact that we do not lose a lot of information!

In [45]:
# How often each NLTK-minus-dict difference value occurs across all words.
pd.Series(diff).value_counts()

 0.0      1122
 1.0        14
-18.0        8
 4.0         4
-1.0         3
 3.0         2
 25.0        2
-27.0        2
 22.0        2
-71.0        1
-2.0         1
 2.0         1
 18.0        1
 34.0        1
 5.0         1
-19.0        1
-20.0        1
-14.0        1
 39.0        1
 27.0        1
 31.0        1
-3.0         1
-28.0        1
 19.0        1
 28.0        1
 71.0        1
-35.0        1
-23.0        1
 23.0        1
-49.0        1
 40.0        1
-48.0        1
 48.0        1
-6.0         1
-176.0       1
 172.0       1
-16.0        1
 85.0        1
dtype: int64

In [51]:
# NLTK and dict rows for the words whose count difference exceeds 80.
# .ix was removed from pandas: rows by position (.iloc), columns by mask (.loc).
merge.iloc[0:2].loc[:, diff > 80]

Unnamed: 0,sep.,—
NLTK word counts,172.0,85.0
Dict word count,,


Finally we have to set the index as a datetime to be able to plot correctly in the next step.

In [23]:
# Show the dict-method counts before the index is replaced by a date.
dfDict

Unnamed: 0,ans,suisse,deux,genève,fait,sans,tout,juin,centre,sep,...,tandis,important,banco,l'emploi,laisse,république,ind,quinze,présidence,liffe
Dict word count,735,449,282,275,202,202,198,182,177,176,...,18,18,18,18,18,18,18,18,18,18


In [24]:
# Timestamp used as the row index of the word counts.
# NOTE(review): the text analysed above was read from '1996/07.xml', while this
# date is 1997/01 -- confirm which month is intended.
date = pd.to_datetime('1997/01')

In [27]:
# Replace the 'Dict word count' row label with the timestamp (modifies dfDict
# in place).
dfDict.index = [date]

In [28]:
# Show the same counts, now indexed by date.
dfDict

Unnamed: 0,ans,suisse,deux,genève,fait,sans,tout,juin,centre,sep,...,tandis,important,banco,l'emploi,laisse,république,ind,quinze,présidence,liffe
1997-01-01,735,449,282,275,202,202,198,182,177,176,...,18,18,18,18,18,18,18,18,18,18
