# Word Frequencies & Zipf's Law
This note book is used for generating a dictionary of word frequencies across the whole corpus as well as confirming Zipf's law. 

In [99]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0

# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)

In [100]:
import pandas as pd
df0 = pd.read_csv("../data/interim/001_normalised_keyed_reviews.csv", sep="\t", low_memory=False)
df0.head()

Unnamed: 0,uniqueKey,reviewText
0,A2XQ5LZHTD4AFT##000100039X,"['timeless', 'classic', 'demanding', 'assuming..."
1,AF7CSSGV93RXN##000100039X,"['first', 'read', 'prophet', 'kahlil', 'gibran..."
2,A1NPNGWBVD9AK3##000100039X,"['one', 'first', 'literary', 'books', 'recall'..."
3,A3IS4WGMFR4X65##000100039X,"['prophet', 'kahlil', 'gibrans', 'best', 'know..."
4,AWLFVCT9128JV##000100039X,"['gibran', 'khalil', 'gibran', 'born', 'one th..."


In [101]:
def convert_text_to_list(review):
    return review.replace("[","").replace("]","").replace("'","").replace("\t","").split(",")

In [102]:
# Convert "reviewText" field to back to list
df0['reviewText'] = df0['reviewText'].astype(str)
df0['reviewText'] = df0['reviewText'].progress_apply(lambda text: convert_text_to_list(text));
df0['reviewText'].head()

Progress:: 100%|██████████| 582711/582711 [00:13<00:00, 42892.56it/s]


0    [timeless,  classic,  demanding,  assuming,  t...
1    [first,  read,  prophet,  kahlil,  gibran,  th...
2    [one,  first,  literary,  books,  recall,  rea...
3    [prophet,  kahlil,  gibrans,  best,  known,  w...
4    [gibran,  khalil,  gibran,  born,  one thousan...
Name: reviewText, dtype: object

In [103]:
# Split negs
def split_neg(review):
    new_review = []
    for token in review:
        if '_' in token:
            split_words = token.split("_")
            new_review.append(split_words[0])
            new_review.append(split_words[1])
        else:
            new_review.append(token)
    return new_review

In [104]:
df0["reviewText"] = df0["reviewText"].progress_apply(lambda review: split_neg(review))
df0["reviewText"].head()

Progress:: 100%|██████████| 582711/582711 [00:09<00:00, 62759.72it/s]


0    [timeless,  classic,  demanding,  assuming,  t...
1    [first,  read,  prophet,  kahlil,  gibran,  th...
2    [one,  first,  literary,  books,  recall,  rea...
3    [prophet,  kahlil,  gibrans,  best,  known,  w...
4    [gibran,  khalil,  gibran,  born,  one thousan...
Name: reviewText, dtype: object

In [105]:
### Remove Stop Words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(review):
    return [token for token in review if not token in stop_words]

In [106]:
df0["reviewText"] = df0["reviewText"].progress_apply(lambda review: remove_stopwords(review))
df0["reviewText"].head()

Progress:: 100%|██████████| 582711/582711 [00:55<00:00, 10507.46it/s]


0    [timeless,  classic,  demanding,  assuming,  t...
1    [first,  read,  prophet,  kahlil,  gibran,  th...
2    [one,  first,  literary,  books,  recall,  rea...
3    [prophet,  kahlil,  gibrans,  best,  known,  w...
4    [gibran,  khalil,  gibran,  born,  one thousan...
Name: reviewText, dtype: object

In [107]:
import nltk
from nltk.probability import FreqDist

def collect_zipfs_law_metrics(review, fd):
    for token in review:
        fd.update([token])

In [108]:
fd = FreqDist()
df0['reviewText'].progress_apply(lambda review: collect_zipfs_law_metrics(review, fd));

Progress:: 100%|██████████| 582711/582711 [03:28<00:00, 2789.07it/s]


In [109]:
fd

FreqDist({'timeless': 26,
          ' classic': 17851,
          ' demanding': 2481,
          ' assuming': 2899,
          ' title': 31208,
          ' gibran': 63,
          ' backs': 1822,
          ' excellent': 48518,
          ' style': 53946,
          ' content': 17043,
          ' means': 30436,
          ' publish': 2811,
          ' century': 34565,
          ' two': 222104,
          ' earlier': 20668,
          ' could': 186395,
          ' inspired': 7735,
          ' new': 191404,
          ' religion': 26344,
          ' mouth': 5174,
          ' old': 76688,
          ' man': 109999,
          ' sail': 783,
          ' away': 69473,
          ' far': 73068,
          ' destination': 1456,
          ' hear': 16369,
          ' wisdom': 13007,
          ' life': 259519,
          ' important': 60359,
          ' aspects': 16828,
          ' messege': 7,
          ' guide': 23941,
          ' book': 1502803,
          ' sufi': 247,
          ' sermon': 1260,
          ' m

In [110]:
words = []
freqs = []

In [111]:
for rank, word in enumerate(fd):
    words.append(word)
    freqs.append(fd[word])

In [112]:
frequencies = {'word': words, 'frequency':freqs}
frequencies_df = pd.DataFrame(frequencies)

In [113]:
frequencies_df.head()

Unnamed: 0,frequency,word
0,26,timeless
1,17851,classic
2,2481,demanding
3,2899,assuming
4,31208,title


In [114]:
frequencies_df = frequencies_df.sort_values(['frequency'], ascending=[False])
frequencies_df = frequencies_df.reset_index()
frequencies_df = frequencies_df.drop(columns=['index'])

In [115]:
frequencies_df[0:20]

Unnamed: 0,frequency,word
0,1502803,book
1,639620,one
2,467228,read
3,386404,like
4,365799,story
5,355169,would
6,287112,no
7,286763,time
8,273330,many
9,267441,much


In [116]:
import plotly 
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
print(cf.__version__)
# Configure cufflings 
cf.set_config_file(offline=False, world_readable=True, theme='pearl')

0.12.1


In [117]:
plotly.tools.set_credentials_file(username='falrashidi', api_key='XaO64TRYU0N3Sdup8Z3H')

In [118]:
frequencies_df['frequency'][0:75].iplot(kind='bar', xTitle='Words', yTitle='Frequency', title='Occurences in the Corpus per Word (Zipf\'s Law)')

In [119]:
frequencies_df.to_csv("../data/interim/003_dictionary.csv", sep='\t', header=True, index=False);

In [128]:
df = frequencies_df.reindex_axis(sorted(frequencies_df.columns, reverse=True), axis=1)


'.reindex_axis' is deprecated and will be removed in a future version. Use '.reindex' instead.



In [131]:
sorted(frequencies_df.columns, reverse=True)

['word', 'frequency']

In [133]:
final_df = frequencies_df.reindex(['word', 'frequency'], axis=1)

In [136]:
# Save a dictionary into a pickle file.
final_df.to_pickle("../data/interim/003_dictionary.p")

In [None]:
## END_OF_FILE