# Word and document frequency analysis
--------

In [148]:
%store -r game_df_cleaned
%store -r swords_df_cleaned

In [149]:
from siuba import *
import pandas as pd

def analyze_frequency(before_df) -> pd.DataFrame:
    df = before_df.copy()
    total_words = count(df, _.book) #count the total number of words from a book
    word_count = count(df, _.word)  #count the number of occurrences of a word in a book

    # add the new data to the data frame
    df = df.merge(word_count, on='word', how='left')
    df = df.merge(total_words, on='book', how='left')

    # rename the columns accordingly
    df = df.rename(columns={'n_x' : 'word_count', 'n_y' : 'total'})

    # calculate and add the frequency of each word
    df = df.assign(frequency = _.word_count / _.total)

    # rank the word in accordance to their frequency
    df['rank'] = df.groupby('book')['word_count'].rank(method='dense', ascending=True)

    return df

In [150]:
#add the name of the book in each data frame
game_df_cleaned = game_df_cleaned.assign(book = 'A Game of Thrones')
swords_df_cleaned = swords_df_cleaned.assign(book = 'Storm of Swords')

# perform word analysis
game_df_cleaned = analyze_frequency(game_df_cleaned)
swords_df_cleaned = analyze_frequency(swords_df_cleaned)

game_df_cleaned = game_df_cleaned.sort_values(by=['rank'], ascending=False)

game_df_cleaned.head(15000)

Unnamed: 0,book,word,word_count,total,frequency,rank
385029,A Game of Thrones,lord,4719,411980,0.011454,445.0
306292,A Game of Thrones,lord,4719,411980,0.011454,445.0
403817,A Game of Thrones,lord,4719,411980,0.011454,445.0
225673,A Game of Thrones,lord,4719,411980,0.011454,445.0
239203,A Game of Thrones,lord,4719,411980,0.011454,445.0
...,...,...,...,...,...,...
82632,A Game of Thrones,jon,2532,411980,0.006146,441.0
115829,A Game of Thrones,jon,2532,411980,0.006146,441.0
275279,A Game of Thrones,jon,2532,411980,0.006146,441.0
259703,A Game of Thrones,jon,2532,411980,0.006146,441.0


In [153]:
from tidytext import *
from siuba import *
from plotnine import *
from siuba.dply.forcats import fct_reorder

def td_idf_analysis(before_df):
    df = before_df.copy()
    df = bind_tf_idf(df, 'word', 'book', 'word_count')


    tf_idf_df = ungroup(group_by(arrange(df, -_.tf_idf), 'book').head(8))

    tf_idf_df = arrange(tf_idf_df, _.book, -_.tf_idf)
    # Get the first 15 entries from each book
    # tf_idf_df = ungroup(group_by(arrange(df, -_.tf_idf), 'book').head(15))
    # tf_idf_df = arrange(tf_idf_df, _.book, -_.tf_idf)
    #tf_idf_df

    return tf_idf_df

In [155]:
from plotnine import *
from siuba.dply.forcats import fct_reorder

books_ = [game_df_cleaned, swords_df_cleaned]
books_df = pd.concat(books_)

books_df = td_idf_analysis(books_df)

# ggplot(books_df) + aes(x=fct_reorder(books_df['word'], x=books_df['tf_idf']), y='tf_idf', fill='book') + coord_flip() + geom_col(show_legend=False) + facet_wrap('~book', ncol = 2, scales = "free") + labs(x = "tf-idf", y = None) + theme(subplots_adjust={'wspace': 0.15}) + theme(subplots_adjust={'wspace': 0.45, 'hspace': 0.3}) + scale_x_discrete()
books_df

Input should have exactly one row per document-term combination.


Unnamed: 0,book,word,word_count,total,frequency,rank,tf,idf,tf_idf
403826,A Game of Thrones,lord,4719,411980,0.011454,445.0,2.578986e-05,0.0,0.0
403729,A Game of Thrones,lord,4719,411980,0.011454,445.0,2.578986e-05,0.0,0.0
403731,A Game of Thrones,lord,4719,411980,0.011454,445.0,2.578986e-05,0.0,0.0
403733,A Game of Thrones,lord,4719,411980,0.011454,445.0,2.578986e-05,0.0,0.0
403951,A Game of Thrones,lord,4719,411980,0.011454,445.0,2.578986e-05,0.0,0.0
403349,A Game of Thrones,lord,4719,411980,0.011454,445.0,2.578986e-05,0.0,0.0
403626,A Game of Thrones,lord,4719,411980,0.011454,445.0,2.578986e-05,0.0,0.0
407925,A Game of Thrones,lord,4719,411980,0.011454,445.0,2.578986e-05,0.0,0.0
336758,Storm of Swords,dennis,2,338464,6e-06,1.0,1.774164e-08,-2.484907,-4.408633e-08
337154,Storm of Swords,minisa,2,338464,6e-06,1.0,1.774164e-08,-2.484907,-4.408633e-08
