In [19]:
import pandas as pd
import pickle

from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag

from sklearn.feature_extraction import text

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
pickle_in = open("book_clean_df.pkl","rb")
data_clean  = pickle.load(pickle_in)

In [22]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [16]:
# Let's create a function to pull out nouns from a string of text

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

def adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_adj = lambda pos: pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    adj = [word for (word, pos) in pos_tag(tokenized) if is_adj(pos)] 
    return ' '.join(adj)

def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)


In [49]:
# noun filter on df
data_nouns = pd.DataFrame(data_clean.description.apply(nouns))
data_nouns

# adjective filter on df
data_adj = pd.DataFrame(data_clean.description.apply(adj))
data_adj

# noun adj filter on df
data_nouns_adj = pd.DataFrame(data_clean.description.apply(nouns_adj))
data_nouns_adj

Unnamed: 0_level_0,description
title,Unnamed: 1_level_1
charlotte s web,book white author stuart little trumpet swan c...
green eggs and ham,green eggs ham asks beginner book seuss house ...
go the fuck to sleep,fuck sleepis bedtime book parents real world f...
the very hungry caterpillar,eric hungry caterpillaris perennial favorite c...
peter pan,peter book famous play unforgettable character...
...,...
rise of the earth dragon dragon masters,dragon masters series dragon dragon slayers se...
little birdie grows up,little birdie delightful picture book charming...
you read to me i ll read to you very short stories to read together,book something new page side other unique book...
like pickle juice on a cookie eleanor,bad august bad august bad pickle juice cookie ...


In [75]:
# Re-add the additional stop words since we are recreating the document-term matrix
stop_noun = ["peter", "pages", "mrs", "beatrix", "potter", "also", "national", "appeal", "everyone", "literature", 
             "nothing", "detailed", "everywhere", "everything", "detailed", "publishers weekly", "adults", 
             "ever", "finally", "parent", "need", "also", "needs", "fans", "asks", "captures", "gift", "five", 
             "detail", "others",  "details", "brought life", "caldecott", "readers", "tale", "tales", "young", 
             "years", "ages", "seuss", "series", "color", "day", "medal", "collier", "review", "seriers", "award", 
             "thing", "stories", "child", "life", "things", "childhood", "year", "world", "award", "winner", "york", 
             "caldecott", "times", "new", "one", "author", "edition", "readers", "reader", "illustrator", "word", 
             "words", "little", "text", "illustration", "illustrations", "story", "picture", "best", "pictures", 
             "children", 'love', 'great', 'book', 'books', 'read', 'reading', 'just', 'like', 'children', 'loved', 
             'time', 'kids', 'fun', 'really', 'reading', 'way', 'favorite', 'page', 'wonderful']

stop_words_noun_agg = text.ENGLISH_STOP_WORDS.union(stop_noun)

# Recreate a document-term matrix with only nouns
tv_noun = TfidfVectorizer(stop_words=stop_words_noun_agg, ngram_range = (1,2), max_df = .6, min_df = .01)
data_tv_noun = tv_noun.fit_transform(data_nouns.description)
data_dtm_noun = pd.DataFrame(data_tv_noun.toarray(), columns=tv_noun.get_feature_names())
data_dtm_noun.index = data_nouns.index
data_dtm_noun

  'stop_words.' % sorted(inconsistent))


Unnamed: 0_level_0,action,activity,addition,adult,adventure,adventures,age,air,alice,aloud,...,woman,wonder,wonders,woods,wordplay,work,works,writers,zany,zoo
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
charlotte s web,0.0,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
green eggs and ham,0.0,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
go the fuck to sleep,0.0,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the very hungry caterpillar,0.0,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
peter pan,0.0,0.0,0.0,0.0000,0.0,0.33436,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rise of the earth dragon dragon masters,0.0,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
little birdie grows up,0.0,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
you read to me i ll read to you very short stories to read together,0.0,0.0,0.0,0.4023,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
like pickle juice on a cookie eleanor,0.0,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
nmf_model = NMF(4)
doc_topic = nmf_model.fit_transform(data_tv_noun)

display_topics(nmf_model, tv_noun.get_feature_names(), 5)


Topic  0
girl, family, boy, home, adventure

Topic  1
animals, baby, farm, animal, babies

Topic  2
cat, dog, cats, hat, mouse

Topic  3
school, journal, school journal, mouse, people


In [76]:
nmf_model_noun = NMF(20)
doc_topic_noun = nmf_model_noun.fit_transform(data_tv_noun)

display_topics(nmf_model_noun, tv_noun.get_feature_names(), 5)


Topic  0
boy, garden, honor, town, boy family

Topic  1
animals, farm, animal, game, wit

Topic  2
cat, cats, hat, julia, adventures

Topic  3
school, journal, school journal, publishers, air

Topic  4
girl, friend, work, beauty, garden

Topic  5
rabbit, today, benjamin, bunny, tom

Topic  6
pooh, shepard, winnie, editions, milne

Topic  7
baby, babies, mama, course, variety

Topic  8
bear, friends, city, christmas, bears

Topic  9
art, imagination, work, artist, museum

Topic  10
family, night, grandmother, families, debut

Topic  11
dog, dogs, george, lovers, city

Topic  12
dragon, dragons, humor, parents, plan

Topic  13
mouse, friend, head, mice, daughter

Topic  14
home, adventure, trip, help, lives

Topic  15
penguin, matter, days, father, character

Topic  16
moon, creatures, land, bedtime, sense

Topic  17
people, god, history, man, parents

Topic  18
letter, letters, alphabet, ways, scott

Topic  19
beginner, meaning, clues, beginner clues, clues meaning


In [83]:
H_noun = pd.DataFrame(doc_topic_noun.round(5),
             index = data_clean.index,
             columns = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19])

H_noun.head(30)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
charlotte s web,0.00289,0.0,0.00539,0.01191,0.14323,0.05602,0.03973,0.0,0.0,0.01009,0.00674,0.00258,0.0,0.04507,0.0,0.0,0.0,0.0,0.0,0.01825
green eggs and ham,0.0,0.0,0.0,0.0,0.0,0.0,0.01267,0.0,0.01992,0.0,0.0,0.0,0.0,0.16063,0.0,0.0,0.0,0.0,0.0,0.51018
go the fuck to sleep,0.0,0.0,0.0,0.0,0.0,0.01801,0.0,0.02946,0.00351,0.0,0.01325,0.00498,0.0256,0.0,0.0,0.0,0.09762,0.06416,0.00119,0.05661
the very hungry caterpillar,0.00191,0.01561,0.00759,0.01341,0.0039,0.00305,0.0,0.03525,0.0,0.0,0.0,0.0,0.00914,0.00729,0.00423,0.0134,0.00972,0.0,0.0,0.0
peter pan,0.05819,0.00265,0.00339,0.0,0.00837,0.0092,0.01526,0.00911,0.0,0.0766,0.0,0.0,0.01002,0.0,0.0,0.0,0.01243,0.04326,0.0,0.01291
goodnight moon,0.00525,0.0,0.01031,0.00497,0.0,0.04645,0.02982,0.0,0.01742,0.00083,0.0,0.0,0.0,0.0,0.00612,0.0,0.16106,0.0,0.0,0.0
flora and ulysses the illuminated adventures,0.00572,0.0,0.00315,0.0073,0.0053,0.08261,0.00939,0.0,0.0,0.0299,0.0,0.01204,0.00434,0.0166,0.0,0.00303,0.03891,0.01153,0.00322,0.00275
i want my hat back,0.0,0.11573,0.02288,0.0,0.0,0.0,0.0,0.0,0.29908,0.00208,0.0,0.0027,0.0,0.0,0.0,0.00344,0.0,0.0,0.01342,0.0
the velveteen rabbit,0.0,0.01086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00578,0.0,0.0,0.0,0.14895,0.0,0.0
winnie the pooh winnie the pooh,0.0,0.0,0.0,0.0,0.0,0.00029,0.47617,0.0,0.02314,0.0,0.00026,0.0,0.0,0.0,0.0,0.0,0.00734,0.00774,0.0,0.0


In [89]:
H_noun.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
charlotte s web,0.008453,0.0,0.015765,0.034835,0.418924,0.163849,0.116204,0.0,0.0,0.029512,0.019713,0.007546,0.0,0.131822,0.0,0.0,0.0,0.0,0.0,0.053378
green eggs and ham,0.0,0.0,0.0,0.0,0.0,0.0,0.018013,0.0,0.02832,0.0,0.0,0.0,0.0,0.228362,0.0,0.0,0.0,0.0,0.0,0.725306
go the fuck to sleep,0.0,0.0,0.0,0.0,0.0,0.057286,0.0,0.093705,0.011164,0.0,0.042145,0.01584,0.081428,0.0,0.0,0.0,0.310506,0.204078,0.003785,0.180063
the very hungry caterpillar,0.015341,0.125382,0.060964,0.107711,0.031325,0.024498,0.0,0.283133,0.0,0.0,0.0,0.0,0.073414,0.058554,0.033976,0.107631,0.078072,0.0,0.0,0.0
peter pan,0.222618,0.010138,0.012969,0.0,0.032021,0.035196,0.05838,0.034852,0.0,0.293049,0.0,0.0,0.038334,0.0,0.0,0.0,0.047553,0.1655,0.0,0.04939


In [84]:
H_noun["sum"] = H_noun.sum(axis=1)

for num in range(0,20):
    H_noun[num] = H_noun[num]/H_noun["sum"]

H_noun = H_noun.drop(columns = "sum")

In [101]:
H_noun.columns = ["BOY", "FARM ANIMAL", "CAT", "SCHOOL", "GIRL", 
                        "RABBIT", "POOH", "BABY", "BEAR", "ART", "FAMILY",  
                        "DOG", "DRAGONS", "MICE", "ADVENTURE", "UNKNOWN?", 
                        "BEDTIME", "HISTORY", "ALPHABET", "EDUCATIONAL"]

In [102]:
H_noun.head()

Unnamed: 0_level_0,BOY,FARM ANIMAL,CAT,SCHOOL,GIRL,RABBIT,POOH,BABY,BEAR,ART,FAMILY,DOG,DRAGONS,MICE,ADVENTURE,UNKNOWN?,BEDTIME,HISTORY,ALPHABET,EDUCATIONAL
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
charlotte s web,0.008453,0.0,0.015765,0.034835,0.418924,0.163849,0.116204,0.0,0.0,0.029512,0.019713,0.007546,0.0,0.131822,0.0,0.0,0.0,0.0,0.0,0.053378
green eggs and ham,0.0,0.0,0.0,0.0,0.0,0.0,0.018013,0.0,0.02832,0.0,0.0,0.0,0.0,0.228362,0.0,0.0,0.0,0.0,0.0,0.725306
go the fuck to sleep,0.0,0.0,0.0,0.0,0.0,0.057286,0.0,0.093705,0.011164,0.0,0.042145,0.01584,0.081428,0.0,0.0,0.0,0.310506,0.204078,0.003785,0.180063
the very hungry caterpillar,0.015341,0.125382,0.060964,0.107711,0.031325,0.024498,0.0,0.283133,0.0,0.0,0.0,0.0,0.073414,0.058554,0.033976,0.107631,0.078072,0.0,0.0,0.0
peter pan,0.222618,0.010138,0.012969,0.0,0.032021,0.035196,0.05838,0.034852,0.0,0.293049,0.0,0.0,0.038334,0.0,0.0,0.0,0.047553,0.1655,0.0,0.04939


In [99]:
H_noun.sort_values(by=['SCHOOL'], ascending=False).head(30)

Unnamed: 0_level_0,BOY,FARM ANIMAL,CAT,SCHOOL,GIRL,RABBIT,POOH,BABY,BEAR?,ART,FAMILY,DOG,DRAGONS,MICE,ADVENTURE,UNKNOWN?,BEDTIME,HISTORY,ALPHABET,EDUCATIONAL
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
the school is alive eerie elementary,0.0,0.0,0.0,0.87573,0.006683,0.0,0.025558,0.003009,0.003806,0.00073,0.0,0.0,0.084484,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the wild christmas reindeer,0.0,0.0,0.0,0.835292,0.0,0.0,0.0,0.0,0.083415,0.0,0.017329,0.008807,0.0,0.030413,0.0,0.0,0.0,0.0,0.0,0.024743
middle school my brother is a big fat liar middle school,0.041275,0.0,0.0,0.794768,0.13101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032947,0.0
miss spider s tea party,0.0,0.0,0.0,0.780984,0.0,0.0,0.0,0.0,0.102963,0.0,0.0,0.018557,0.0,0.00187,0.0,0.002982,0.0,0.0,0.0,0.092643
the quiet book,0.0,0.0,0.0,0.736491,0.0,0.0,0.0,0.0,0.059106,0.0,0.000413,0.052327,0.0,0.0,0.0,0.022099,0.068529,0.0,0.0,0.061034
the first rule of punk,0.0,0.0,0.0,0.729759,0.0,0.0,0.0,0.0,0.0,0.134796,0.007848,0.0,0.0,0.0,0.116846,0.010751,0.0,0.0,0.0,0.0
frazzled everyday disasters and impending doom frazzled,0.0,0.0,0.016587,0.720637,0.136131,0.0,0.009955,0.0,0.0,0.0,0.11669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yo yes,0.246201,0.0,0.0,0.706051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012393,0.0,0.0,0.0,0.035355
mini myths play nice hercules,0.0,0.0,0.0,0.691324,0.0,0.037884,0.0,0.0,0.038201,0.049619,0.0,0.021603,0.011207,0.004581,0.0,0.0,0.055328,0.0,0.0,0.090252
miss nelson is missing miss nelson,0.001297,0.005012,0.023013,0.668985,0.004114,0.089984,0.0,0.0,0.0,0.0,0.009973,0.0,0.0,0.0,0.0,0.0,0.036677,0.035006,0.111452,0.014486


In [103]:
H_noun.to_csv("h_noun_percentages.csv")

In [35]:
# add the adj stop words since we are recreating the document-term matrix

stop_adj = ["perfect", "best", "new", "good", "young", "old", "little", "beautiful", 'love', 'great', 
            "delightful", 'illustrated', 'read', 'reading', 'just', 'like', 'bear', 'loved', 'tale', 'big', 
            'fun', 'really', 'reading', 'way', 'favorite', 'page', 'wonderful', "book book", "middle", "american"]

stop_words_adj_agg = text.ENGLISH_STOP_WORDS.union(stop_adj)



# Recreate a document-term matrix with only adj
tv_adj = TfidfVectorizer(stop_words=stop_words_adj_agg, ngram_range = (1,2), max_df = .6, min_df = .02)
data_tv_adj = tv_adj.fit_transform(data_adj.description)
data_dtm_adj = pd.DataFrame(data_tv_adj.toarray(), columns=tv_adj.get_feature_names())
data_dtm_adj.index = data_adj.index
data_dtm_adj


  'stop_words.' % sorted(inconsistent))


Unnamed: 0_level_0,able,adorable,aloud,amazing,animal,anniversary,available,bad,beloved,black,...,unique,vibrant,visual,vivid,warm,whimsical,white,wild,witty,youngest
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
charlotte s web,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.882341,0.000000,0.0,0.0
green eggs and ham,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
go the fuck to sleep,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
the very hungry caterpillar,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
peter pan,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.503579,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rise of the earth dragon dragon masters,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
little birdie grows up,0.725155,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
you read to me i ll read to you very short stories to read together,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.427829,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
like pickle juice on a cookie eleanor,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.889577,0.0,0.134872,...,0.000000,0.0,0.165951,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0


In [33]:
nmf_model_adj = NMF(8)
doc_topic = nmf_model_adj.fit_transform(data_tv_adj)

display_topics(nmf_model_adj, tv_adj.get_feature_names(), 5)


Topic  0
different, colorful, sweet, humorous, easy

Topic  1
white, black, black white, comic, dramatic

Topic  2
classic, original, anniversary, library, beloved

Topic  3
funny, hilarious, humorous, vibrant, extraordinary

Topic  4
simple, bright, library, bold, seuss

Topic  5
sure, whimsical, irresistible, adorable, expressive

Topic  6
special, true, real, lyrical, spare

Topic  7
green, red, blue, silly, witty


In [36]:
nmf_model_adj = NMF(8)
doc_topic = nmf_model_adj.fit_transform(data_tv_adj)

display_topics(nmf_model_adj, tv_adj.get_feature_names(), 5)


Topic  0
classic, original, anniversary, exquisite, library

Topic  1
white, black, black white, green, popular

Topic  2
sure, whimsical, irresistible, adorable, gentle

Topic  3
simple, library, bright, wild, green

Topic  4
funny, hilarious, humorous, vibrant, clever

Topic  5
different, sweet, whimsical, colorful, humorous

Topic  6
true, real, original, spare, free

Topic  7
special, unique, long, magical, bold


In [37]:
nmf_model_adj = NMF(6)
doc_topic_adj = nmf_model_adj.fit_transform(data_tv_adj)

display_topics(nmf_model_adj, tv_adj.get_feature_names(), 5)


Topic  0
classic, original, anniversary, exquisite, strange

Topic  1
white, black, black white, real, green

Topic  2
sure, whimsical, adorable, irresistible, gentle

Topic  3
simple, library, bright, wild, green

Topic  4
funny, hilarious, humorous, vibrant, clever

Topic  5
different, special, true, sweet, colorful


In [38]:
H_adj = pd.DataFrame(doc_topic_adj.round(5),
             index = data_clean.index,
             columns = ["CLASSIC", "BLACK AND WHITE", "WHIMSICAL", "SIMPLE", "FUNNY", "DIFFERENT"])
H_adj.head(30)

Unnamed: 0_level_0,CLASSIC,BLACK AND WHITE,WHIMSICAL,SIMPLE,FUNNY,DIFFERENT
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
charlotte s web,0.09049,0.29609,0.0,0.0,0.0,0.0
green eggs and ham,0.00044,0.01997,0.0238,0.27142,0.0,0.0
go the fuck to sleep,0.01667,0.01264,0.01106,0.0,0.20421,0.01632
the very hungry caterpillar,0.00771,0.00803,0.0136,0.00602,0.01779,0.01564
peter pan,0.14487,0.0,0.0,0.03695,0.00852,0.01621
goodnight moon,0.10814,0.01166,0.04525,0.05407,0.0,0.0
flora and ulysses the illuminated adventures,0.0,0.45328,0.0,0.0,0.0,0.0042
i want my hat back,0.12098,0.0,0.0,0.25339,0.0008,0.0
the velveteen rabbit,0.2064,0.00428,0.00692,0.0,0.0,0.01488
winnie the pooh winnie the pooh,0.14983,0.00012,0.0,0.05389,0.0,0.07553


In [65]:
H_adj["sum"] = H_adj.sum(axis=1)

H_adj["CLASSIC"] = H_adj["CLASSIC"]/H_adj["sum"]
H_adj["BLACK AND WHITE"] = H_adj["BLACK AND WHITE"]/H_adj["sum"]
H_adj["WHIMSICAL"] = H_adj["WHIMSICAL"]/H_adj["sum"]
H_adj["SIMPLE"] = H_adj["SIMPLE"]/H_adj["sum"]
H_adj["FUNNY"] = H_adj["FUNNY"]/H_adj["sum"]
H_adj["DIFFERENT"] = H_adj["DIFFERENT"]/H_adj["sum"]

H_adj = H_adj.drop(columns = "sum")

Unnamed: 0_level_0,CLASSIC,BLACK AND WHITE,WHIMSICAL,SIMPLE,FUNNY,DIFFERENT,sum
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
charlotte s web,0.09049,0.29609,0.00000,0.00000,0.00000,0.00000,0.38658
green eggs and ham,0.00044,0.01997,0.02380,0.27142,0.00000,0.00000,0.31563
go the fuck to sleep,0.01667,0.01264,0.01106,0.00000,0.20421,0.01632,0.26090
the very hungry caterpillar,0.00771,0.00803,0.01360,0.00602,0.01779,0.01564,0.06879
peter pan,0.14487,0.00000,0.00000,0.03695,0.00852,0.01621,0.20655
...,...,...,...,...,...,...,...
rise of the earth dragon dragon masters,0.00000,0.00161,0.00000,0.00000,0.00000,0.23454,0.23615
little birdie grows up,0.00933,0.00700,0.01154,0.00568,0.00011,0.04004,0.07370
you read to me i ll read to you very short stories to read together,0.01467,0.00689,0.01793,0.02615,0.00000,0.05168,0.11732
like pickle juice on a cookie eleanor,0.01317,0.04152,0.00475,0.07980,0.02971,0.05754,0.22649


In [106]:
H_adj.sort_values(by=['WHIMSICAL'], ascending=False).head(30)

Unnamed: 0_level_0,CLASSIC,BLACK AND WHITE,WHIMSICAL,SIMPLE,FUNNY,DIFFERENT
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
the three ninja pigs,0.0,0.0,1.0,0.0,0.0,0.0
superworm,0.0,0.0,1.0,0.0,0.0,0.0
the bear who couldn t sleep,0.0,0.0,1.0,0.0,0.0,0.0
mustache baby,0.0,0.0,1.0,0.0,0.0,0.0
the very fairy princess,0.0,0.0,1.0,0.0,0.0,0.0
you don t want a unicorn,0.0,0.0,1.0,0.0,0.0,0.0
flip flap fly a book for babies everywhere,0.0,0.006007,0.980394,0.0,0.0136,0.0
animals by the numbers a book of infographics,0.025206,0.0,0.974794,0.0,0.0,0.0
hans brinker or the silver skates,0.050607,0.0,0.949393,0.0,0.0,0.0
there s a giraffe in my soup,0.0,0.0,0.947785,0.0,0.0,0.052215


Unnamed: 0_level_0,CLASSIC,BLACK AND WHITE,WHIMSICAL,SIMPLE,FUNNY,DIFFERENT,sum
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
charlotte s web,0.234078,0.765922,0.000000,0.000000,0.000000,0.000000,1.0
green eggs and ham,0.001394,0.063270,0.075405,0.859931,0.000000,0.000000,1.0
go the fuck to sleep,0.063894,0.048448,0.042392,0.000000,0.782714,0.062553,1.0
the very hungry caterpillar,0.112080,0.116732,0.197703,0.087513,0.258613,0.227359,1.0
peter pan,0.701380,0.000000,0.000000,0.178891,0.041249,0.078480,1.0
...,...,...,...,...,...,...,...
rise of the earth dragon dragon masters,0.000000,0.006818,0.000000,0.000000,0.000000,0.993182,1.0
little birdie grows up,0.126594,0.094980,0.156581,0.077069,0.001493,0.543284,1.0
you read to me i ll read to you very short stories to read together,0.125043,0.058728,0.152830,0.222895,0.000000,0.440505,1.0
like pickle juice on a cookie eleanor,0.058148,0.183319,0.020972,0.352333,0.131176,0.254051,1.0


In [70]:
H_adj.to_csv("H_adj_percentiles.csv")

In [62]:
# Re-add the additional stop words since we are recreating the document-term matrix
"""
stop_noun_adj = ["peter", "pages", "mrs", "beatrix", "potter", "also", "national", "appeal", "everyone", "literature", 
             "nothing", "detailed", "everywhere", "everything", "detailed", "publishers weekly", "adults", 
             "ever", "finally", "parent", "need", "also", "needs", "fans", "asks", "captures", "gift", "five", 
             "detail", "others",  "details", "brought life", "caldecott", "readers", "tale", "tales", "young", 
             "years", "ages", "seuss", "series", "color", "day", "medal", "collier", "review", "seriers", "award", 
             "thing", "stories", "child", "life", "things", "childhood", "year", "world", "award", "winner", "york", 
             "caldecott", "times", "new", "one", "author", "edition", "readers", "reader", "illustrator", "word", 
             "words", "little", "text", "illustration", "illustrations", "story", "picture", "best", "pictures", 
             "children", 'love', 'great', 'book', 'books', 'read', 'reading', 'just', 'like', 'children', 'loved', 
             'time', 'kids', 'fun', 'really', 'reading', 'way', 'favorite', 'page', 'wonderful', 
                 "perfect", "best", "new", "good", "young", "old", "little", "beautiful", 'love', 'great', 
            "delightful", 'illustrated', 'read', 'reading', 'just', 'like', 'bear', 'loved', 'tale', 'big', 
            'fun', 'really', 'reading', 'way', 'favorite', 'page', 'wonderful', "book book", "middle", "american"]



stop_words_noun_adj_agg = text.ENGLISH_STOP_WORDS.union(stop_noun_adj)
"""

def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

# noun adj filter on df
data_nouns_adj = pd.DataFrame(data_clean.description.apply(nouns_adj))
data_nouns_adj

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words_adj = ["tale", "brown", "bear", "american", "anniversary", "library", "green", "red", "perfect", "best", "new", "good", "young", "old", "little", "beautiful", 'love', 'great', 'books', 'books', 'read', 'reading', 'just', 'like', 'children', 'loved', 'time', 'kids', 'fun', 'really', 'reading', 'way', 'favorite', 'page', 'wonderful', "book book"]
add_stop_words_noun = ["years", "tells", "published", "winning", "text", "tale", "tales", "ages", "seuss", "series", "color", "day", "medal", "collier", "review", "seriers", "award", "thing", "stories", "child", "life", "things", "childhood", "year", "world", "award", "winner", "york", "caldecott", "times", "new", "one", "author", "edition", "readers", "reader", "illustrator", "word", "words", "little", "text", "illustration", "illustrations", "story", "picture", "best", "pictures", "children", 'love', 'great', 'book', 'books', 'read', 'reading', 'just', 'like', 'children', 'loved', 'time', 'kids', 'fun', 'really', 'reading', 'way', 'favorite', 'page', 'wonderful']
stop_add = add_stop_words_adj + add_stop_words_noun
stop_words_noun_adj_agg = text.ENGLISH_STOP_WORDS.union(stop_add)

# Recreate a document-term matrix with only nouns
tv_noun_adj = TfidfVectorizer(stop_words=stop_words_noun_adj_agg, ngram_range = (1,2), max_df = .95, min_df = .02)
data_tv_noun_adj = tv_noun_adj.fit_transform(data_nouns_adj.description)
data_dtm_noun_adj = pd.DataFrame(data_tv_noun_adj.toarray(), columns=tv_noun_adj.get_feature_names())
data_dtm_noun_adj.index = data_nouns_adj.index
data_dtm_noun_adj

Unnamed: 0_level_0,able,adorable,adults,adventure,adventures,age,aloud,amazing,animal,animals,...,white,wife,wild,winter,witty,wonder,wonders,woods,work,youngest
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
charlotte s web,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.514268,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
green eggs and ham,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
go the fuck to sleep,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the very hungry caterpillar,0.000000,0.0,0.341275,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
peter pan,0.000000,0.0,0.000000,0.0,0.221409,0.0,0.0,0.270164,0.0,0.0,...,0.000000,0.0,0.225673,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rise of the earth dragon dragon masters,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
little birdie grows up,0.327959,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
you read to me i ll read to you very short stories to read together,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
like pickle juice on a cookie eleanor,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
nmf_model_noun_adj = NMF(2)
doc_topic_noun_adj = nmf_model_noun_adj.fit_transform(data_tv_adj)

display_topics(nmf_model_noun_adj, tv_noun_adj.get_feature_names(), 5)


Topic  0
bad, discovers, early, creatures, busy

Topic  1
fairy, anniversary, answers, choice, days


In [63]:
nmf_model_noun_adj = NMF(20)
doc_topic_noun_adj = nmf_model_noun_adj.fit_transform(data_tv_noun_adj)

display_topics(nmf_model_noun_adj, tv_noun_adj.get_feature_names(), 5)


Topic  0
girl, garden, magical, friend, house

Topic  1
animals, farm, animal, visual, eric

Topic  2
cat, bold, blue, mouse, house

Topic  3
school, school journal, journal, publishers, sweet

Topic  4
white, black, black white, big, fish

Topic  5
baby, adorable, clever, sure, tiny

Topic  6
boy, town, street, home, days

Topic  7
friends, tree, big, whimsical, warm

Topic  8
family, different, real, families, help

Topic  9
classic, adventures, century, house, magic

Topic  10
dog, lovers, sure, humorous, playful

Topic  11
true, people, man, artist, history

Topic  12
rabbit, peter, bunny, today, bad

Topic  13
adventure, city, home, journey, friend

Topic  14
funny, hilarious, mouse, rhymes, problem

Topic  15
night, bedtime, fox, honor, dark

Topic  16
simple, beginner, bright, meaning, parent

Topic  17
collection, short, images, imagination, volume

Topic  18
art, pages, work, wild, famous

Topic  19
special, parents, rhyme, animal, early


In [64]:
H_noun_adj = pd.DataFrame(doc_topic_noun_adj.round(5),
             index = data_clean.index)
H_noun_adj.head(30)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
charlotte s web,0.02957,0.0,0.0034,0.0,0.18043,0.0,0.00208,0.00484,0.00319,0.11397,0.0,0.0,0.02497,0.00456,0.00392,0.00857,0.0,0.0,0.0,0.01974
green eggs and ham,0.00124,0.0,0.01498,0.00275,0.0,0.0,0.0,0.1371,0.0,0.00266,0.0,0.0,0.0421,0.0,0.04653,0.0,0.2935,0.0,0.00385,0.0
go the fuck to sleep,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01566,0.0,0.0,0.00529,0.00448,0.0,0.19949,0.24133,0.0,0.0,0.0,0.09545
the very hungry caterpillar,0.0,0.0167,0.02243,0.00432,0.0,0.02249,0.00557,0.033,0.0,0.00345,0.0,0.0,0.00119,0.01884,0.04022,0.0,0.0,0.0,0.03729,0.0
peter pan,0.00252,0.00022,0.0,0.0,0.0,0.0,0.09186,0.0,0.0,0.13055,0.0,0.05039,0.13016,0.03325,0.0,0.0,0.00648,0.00607,0.03756,0.0
goodnight moon,0.0,0.0,0.0,0.01039,0.01772,0.00606,0.0,0.0,0.01573,0.11961,0.00056,0.0,0.0959,0.00444,0.0,0.05194,0.0,0.00552,0.0,0.0
flora and ulysses the illuminated adventures,0.00167,0.0,0.0071,0.0,0.23955,0.0,0.00747,0.0,0.0,0.0,0.00937,0.03428,0.0736,0.0,0.0,0.0,0.0,0.0,0.01905,0.0178
i want my hat back,0.0,0.18649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10152,0.00778,0.0,0.0,0.00436,0.0,0.0,0.16873,0.0,0.0,0.03877
the velveteen rabbit,0.0,0.0,0.0,0.00682,0.0,0.0,0.0,0.0,0.01855,0.20798,0.0,0.08528,0.00624,0.0,0.0,0.014,0.0,0.0,0.0,0.0
winnie the pooh winnie the pooh,0.0,0.0,0.0,0.00065,0.0,0.0,0.0,0.11226,0.0,0.24705,0.0,0.08951,0.0283,0.0,0.0,0.0,0.01937,0.00361,0.0,0.0


In [None]:
H_noun_adj.to_csv("H_noun_adj.csv")

In [105]:
H_noun_adj pd.merge(H_noun, H_adj, right_index = True, left_index = True)

Unnamed: 0_level_0,BOY,FARM ANIMAL,CAT,SCHOOL,GIRL,RABBIT,POOH,BABY,BEAR,ART,...,BEDTIME,HISTORY,ALPHABET,EDUCATIONAL,CLASSIC,BLACK AND WHITE,WHIMSICAL,SIMPLE,FUNNY,DIFFERENT
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
charlotte s web,0.008453,0.000000,0.015765,0.034835,0.418924,0.163849,0.116204,0.000000,0.000000,0.029512,...,0.000000,0.000000,0.000000,0.053378,0.234078,0.765922,0.000000,0.000000,0.000000,0.000000
green eggs and ham,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.018013,0.000000,0.028320,0.000000,...,0.000000,0.000000,0.000000,0.725306,0.001394,0.063270,0.075405,0.859931,0.000000,0.000000
go the fuck to sleep,0.000000,0.000000,0.000000,0.000000,0.000000,0.057286,0.000000,0.093705,0.011164,0.000000,...,0.310506,0.204078,0.003785,0.180063,0.063894,0.048448,0.042392,0.000000,0.782714,0.062553
the very hungry caterpillar,0.015341,0.125382,0.060964,0.107711,0.031325,0.024498,0.000000,0.283133,0.000000,0.000000,...,0.078072,0.000000,0.000000,0.000000,0.112080,0.116732,0.197703,0.087513,0.258613,0.227359
peter pan,0.222618,0.010138,0.012969,0.000000,0.032021,0.035196,0.058380,0.034852,0.000000,0.293049,...,0.047553,0.165500,0.000000,0.049390,0.701380,0.000000,0.000000,0.178891,0.041249,0.078480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rise of the earth dragon dragon masters,0.000000,0.000000,0.000000,0.007551,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.006818,0.000000,0.000000,0.000000,0.993182
little birdie grows up,0.062891,0.000000,0.077415,0.000000,0.000000,0.000000,0.000000,0.194748,0.000000,0.000000,...,0.118886,0.112994,0.000000,0.236310,0.126594,0.094980,0.156581,0.077069,0.001493,0.543284
you read to me i ll read to you very short stories to read together,0.030905,0.000000,0.005144,0.031875,0.037356,0.000000,0.064255,0.031073,0.000000,0.000000,...,0.000000,0.000506,0.000000,0.000000,0.125043,0.058728,0.152830,0.222895,0.000000,0.440505
like pickle juice on a cookie eleanor,0.020743,0.012804,0.000000,0.487388,0.036339,0.005864,0.000000,0.000000,0.043381,0.000000,...,0.063792,0.103280,0.019642,0.000000,0.058148,0.183319,0.020972,0.352333,0.131176,0.254051
