In [58]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

# Cleaning the data

Now that we have seen what our datset looked like, we have to a little bit of cleaning. There are three main tasks:
1. Removing two letter words that make no sense
2. Taking care of plural and singular, and merge them
3. Merge words that are similar, i.e having the same root.

In [59]:
wordCountYear = pd.read_csv('Data/3kwordCountMonth.csv',index_col=0)
wordCountYear.index = pd.to_datetime(wordCountYear.index)

### The columns that begin by '-'

Because we wanted to keep words such as week-end, we have a certain number of columns that start with '-' and a word, we will remove the dash.

In [60]:
columnNames = wordCountYear.columns.map(lambda x: x[1:] if str(x).startswith('-') else x)
wordCountYear.columns = columnNames

And the words that end with dash.

In [61]:
otherName = wordCountYear.columns.map(lambda x: x[1:] if str(x).endswith('-') else x)
wordCountYear.columns = otherName

And we can group by the words.

In [62]:
wordCountMonthClean1 =  wordCountYear.groupby(by=wordCountYear.columns,axis=1,level=0).agg(sum)
wordCountMonthClean1.shape

(4351, 77595)

### Dealing with adverbs

We will group words with theit adverb (word and word + ement)

In [63]:
adverbs = wordCountMonthClean1.columns.map(lambda x: x[:-5] if str(x).endswith('ement') else x)
wordCountMonthClean1.columns = adverbs

In [64]:
wordCountMontClean2 = wordCountMonthClean1.groupby(by=wordCountYear.columns,axis=1,level=0).agg(sum)
wordCountMontClean2.shape

(4351, 77458)

### Merging singular and plural words

As we were not satisfied by the NLTK work on the french language, we will implement our own small algorithm to merge similar words together. We start were merging singular and plural words.

In [67]:
#Getting the words
columnNames = wordCountMontClean2.columns.map(lambda x: str(x))

Let's look up a few of the column titles.

In [68]:
columnNames[100:120]

array(['abbatemaggio', 'abbatiale', 'abbatucci', 'abbaye', 'abbayes',
       'abbazia', 'abbesse', 'abbet', 'abbeville', 'abbey', 'abbie',
       'abbot', 'abbott', 'abboud', 'abbruzzes', 'abbé', 'abbés', 'abc',
       'abchases', 'abcès'], dtype=object)

As we can see, the plural of a word comes normally just after it's singular, we therefore do not have to check all of the other words for plurals.

In [107]:
sing = []
plural = []
for i in range(len(columnNames)):
    col = columnNames[i]
    colPlur = col+'s'
    colPlurX = col+'x'
    if (i+5) > (len(columnNames) - 1):
        lastEl =  len(columnNames) - 1
    else:
        lastEl = i+5
    for plur in columnNames[i:lastEl]:
        if (colPlur == plur) or (colPlurX == plur):
            sing.append(col)
            plural.append(plur)

print('Number of Singular-Plural word pairs : ', len(dict(zip(sing,plural))))
#We have to remove the jouis and jouiss pairs because, jouis will already be removed with the joui pair
#pluralWords.pop(1329)
singPlurDict = dict(zip(plural,sing))
print(sing[:7])
print(plural[:7])

Number of Singular-Plural word pairs :  7125
['abandonné', 'abandonnée', 'abattoir', 'abattu', 'abattue', 'abbaye', 'abbé']
['abandonnés', 'abandonnées', 'abattoirs', 'abattus', 'abattues', 'abbayes', 'abbés']


Now that we have that we can replace the column names and then groupby so that we merge the data from the two words.

In [109]:
wordCountMontClean2.rename(columns = singPlurDict,inplace=True)
cleanData = wordCountMontClean2.groupby(by=wordCountMontClean2.columns,axis=1,level=0).agg(sum)

Now we look at the masculine and feminine version of words and of the verbs and their infinitif.

In [110]:
newCol = cleanData.columns.values

In [117]:
fem = []
masc = []

for i in range(len(newCol)):
    col = newCol[i]
    colfem= col+'e'
    infVerb = col+'r'
    if (i+5) > (len(newCol) - 1):
        lastEl =  len(newCol) - 1
    else:
        lastEl = i+5
    for plur in newCol[i:lastEl]:
        if (colfem == plur) or (infVerb == plur):
            masc.append(col)
            fem.append(plur)

dictFM = dict(zip(masc,fem))
print('Number of fem singular, verbs word pairs : ', len(dictFM))
print(masc[:7])
print(fem[:7])

Number of fem singular, verbs word pairs :  3760
['abandonne', 'abandonné', 'abattu', 'abl', 'aboli', 'aboli', 'abondant']
['abandonner', 'abandonnée', 'abattue', 'able', 'abolie', 'abolir', 'abondante']


And we groupby again.

In [118]:
cleanData.rename(columns = dictFM,inplace=True)
cleanData1 = cleanData.groupby(by=cleanData.columns,axis=1,level=0).agg(sum)

Let's take a look at the verbs.

In [153]:
verb = []
conj = []

for i in range(len(cleanData1.columns)):
    col = cleanData1.columns[i]
    infVerb = col[:-1]+'er'
    if (i+5) > (len(cleanData1.columns) - 1):
        lastEl =  len(cleanData1.columns) - 1
    else:
        lastEl = i+5
    for plur in cleanData1.columns[i:lastEl]:
        if (infVerb == plur):
            verb.append(col)
            conj.append(plur)

dictVC = dict(zip(verb,conj))
print('Number of fem singular, verbs word pairs : ', len(dictVC))
print(masc[50:57])
print(fem[50:57])

Number of fem singular, verbs word pairs :  441
['actuell', 'adapté', 'adjoint', 'adjugé', 'administré', 'admirabl', 'admire']
['actuelle', 'adaptée', 'adjointe', 'adjugée', 'administrée', 'admirable', 'admirer']


In [154]:
cleanData1.rename(columns = dictVC,inplace=True)

We also want to replace "participe passé" feminine to the infinitif verb.

In [157]:
print(cleanData1.columns[np.chararray.endswith(cleanData1.columns.values.astype(str),'ée')])

Index(['abandonnée', 'abritée', 'abrogée', 'abrégée', 'accentuée', 'acceptée',
       'accompagnée', 'accordée', 'accouchée', 'accoutumée',
       ...
       'éloignée', 'éprouvée', 'épuisée', 'épée', 'équipée', 'étonnée',
       'étouffée', 'étudiée', 'évacuée', 'évaluée'],
      dtype='object', length=792)


In [162]:
cleanData1.columns = cleanData1.columns.map(lambda x: x[:-2]+'er' if str(x).endswith('ée') else x)

And then we can groupby again.

In [163]:
cleanData2 = cleanData1.groupby(by=cleanData1.columns,axis=1,level=0).agg(sum)
cleanData2.shape

(4351, 65942)

We can see that our dataset is still very large, but those operations manage to reduce its size without loosing information!

Now that we have tried not to loose too much of information by trying to merge words that have the same meaning we can start doing some more direct actions.

### Dealing with missing data

One of the problem that we have for our dataset is that there are some months were a word count is missing (for some words). When importing the data we had to choose a frequency limit, and we did not save words that appeared below this limit. Due to the long tail distribution of the data words that have a fequency near the limit can be above it in one month and below it in the other month, therefore we will have a lot of missing values for this word.

For furutre predictions and visualization we are more interested in the general shape of the time series, and not the few months in between where the word is missing, therefore it could be interesting to interpolate the missing data.

But there are also words that are only in present only in a few months and not more. We do not need these words as they are not interesting to predict and do not bring an interesting visualizations.

Due to the fact that we have made some aggregations in the previous steps, we assume that removing words that do not occur very often will not loose information about the other words. We choose to remove words that only appear in 10 different months or less.

In [164]:
cleanData3 = cleanData2.copy()
numMonth = cleanData3.astype(bool).sum(axis=0).values

Let's look at which words are in this situations.

In [186]:
cleanData3.columns[numMonth<10][1:50]

Index(['a-été', 'aaaa', 'aaaactions', 'aaat', 'aac', 'aach', 'aacom', 'aae',
       'aal', 'aalandais', 'aalborg', 'aalen', 'aalesund', 'aali-pacha', 'aan',
       'aandoz', 'aar-tessin', 'aara', 'aarbourg', 'aardman', 'aargau',
       'aargauerstalden', 'aarmiihle', 'aarvan', 'aarwangen', 'aarwanguen',
       'aasan', 'aassi', 'aasta', 'aat', 'aatre', 'ab-y-berg', 'aba',
       'abadessa', 'abadie', 'abaisser', 'abaissé', 'abakanowicz', 'abako',
       'abandonnait', 'abandonnent', 'abanico', 'abanto', 'abanville',
       'abarth', 'abarzuza', 'abatage', 'abatt', 'abattage'],
      dtype='object')

In [187]:
cleanData3.columns[numMonth<10][700:750]

Index(['alboni', 'alboraya', 'albos', 'albrighi', 'albright', 'albu', 'albula',
       'albumine', 'albéniz', 'albéric', 'alcalins', 'alcaloïdes', 'alcantara',
       'alcat', 'alceste', 'alcibiade', 'alcide', 'alcman', 'alco', 'alcolea',
       'alcon', 'alcoy', 'alcudia', 'alcyon', 'aldanska', 'aldaux',
       'aldeacorba', 'aldebert', 'alden', 'alderman', 'aldermen', 'aldersley',
       'aldewin', 'aldonza', 'aldous', 'aldrin', 'aldsi', 'aldy', 'alecha',
       'alechinsky', 'aleck', 'aleko', 'aleko-pacha', 'aleman', 'alembert',
       'alena', 'alentejo', 'alentours', 'alençon', 'alermillod'],
      dtype='object')

In [191]:
print('Before the cleaning: ', cleanData3.shape)
cleanData3.drop(cleanData3.columns[numMonth < 10].values,axis=1,inplace=True)
print('After the cleaning: ', cleanData3.shape)

Before the cleaning:  (4351, 65942)
After the cleaning:  (4351, 23873)


And now we can save the cleaned dataset.

In [198]:
cleanData3.to_csv('Data/wordMonthClean.csv')