In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns 

**This notebook presents the words and their categories present in two datasets from the Worldbank database. There are two datasets that present this information.**

# Dataset: Full Child-by-Word
- Language: English (American)
- Form: WS (Words and Sentences)
- Age: 16 - 30 (Months)

In [54]:
df = pd.read_csv('data_WS/wordbank_instrument_data_full_child_by_word_englishAmerican_WS.csv')

In [55]:
df.head()

Unnamed: 0,downloaded,data_id,item_kind,category,item_id,item_definition,english_gloss,uni_lemma,child_id,age,value
0,2023-09-07,245518,word,sounds,item_1,baa baa,baa baa,baa baa,1,28,produces
1,2023-09-07,245518,word,sounds,item_2,choo choo,choo choo,choo choo,1,28,
2,2023-09-07,245518,word,sounds,item_3,cockadoodledoo,cockadoodledoo,cockadoodledoo,1,28,
3,2023-09-07,245518,word,sounds,item_4,grrr,grrr,grrr,1,28,produces
4,2023-09-07,245518,word,sounds,item_5,meow,meow,meow,1,28,


In [56]:
df.shape

(6057997, 11)

In [57]:
#Number of words (item_definition) in the dataset:

df['item_definition'].nunique()

797

In [60]:
#Printed all unique values (all words) in the dataset:

df['item_definition'].unique()

array(['baa baa', 'choo choo', 'cockadoodledoo', 'grrr', 'meow', 'moo',
       'ouch', 'quack quack', 'uh oh', 'vroom', 'woof woof', 'yum yum',
       'alligator', 'animal', 'ant', 'bear', 'bee', 'bird', 'bug',
       'bunny', 'butterfly', 'cat', 'chicken (animal)', 'cow', 'deer',
       'dog', 'donkey', 'duck', 'elephant', 'fish (animal)', 'frog',
       'giraffe', 'goose', 'hen', 'horse', 'kitty', 'lamb', 'lion',
       'monkey', 'moose', 'mouse', 'owl', 'penguin', 'pig', 'pony',
       'puppy', 'rooster', 'sheep', 'squirrel', 'teddybear', 'tiger',
       'turkey', 'turtle', 'wolf', 'zebra', 'airplane', 'bicycle', 'boat',
       'bus', 'car', 'firetruck', 'helicopter', 'motorcycle', 'sled',
       'stroller', 'tractor', 'train', 'tricycle', 'truck', 'ball',
       'balloon', 'bat', 'block', 'book', 'bubbles', 'chalk', 'crayon',
       'doll', 'game', 'glue', 'pen', 'pencil', 'play dough', 'present',
       'puzzle', 'story', 'toy (object)', 'apple', 'applesauce', 'banana',
       'be

## New dataset with all words in each category:

In [46]:
df_words_by_category = df.groupby ('category')['item_definition'].agg(list).reset_index()

In [47]:
words_by_category = words_by_category.rename(columns={'item_definition': 'aggregated_words'})

In [49]:
#Print of each category and their words:

def show_content(row):
    print(row.category)
    for w in row.aggregated_words:
        print(f"- {w}")
    print()
words_by_category.apply(show_content, axis=1)

action_words
- bite
- blow
- break
- bring
- build
- bump
- buy
- carry
- catch
- chase
- clap
- clean (action)
- climb
- close
- cook
- cover
- cry
- cut
- dance
- draw
- drink (action)
- drive
- drop
- dry (action)
- dump
- eat
- fall
- feed
- find
- finish
- fit
- fix
- get
- give
- go
- hate
- have
- hear
- help
- hide
- hit
- hold
- hug
- hurry
- jump
- kick
- kiss
- knock
- lick
- like
- listen
- look
- love
- make
- open
- paint
- pick
- play
- pour
- pretend
- pull
- push
- put
- read
- ride
- rip
- run
- say
- see
- shake
- share
- show
- sing
- sit
- skate
- sleep
- slide (action)
- smile
- spill
- splash
- stand
- stay
- stop
- sweep
- swim
- swing (action)
- take
- talk
- taste
- tear
- think
- throw
- tickle
- touch
- wait
- wake
- walk
- wash
- watch (action)
- wipe
- wish
- work (action)
- write

animals
- alligator
- animal
- ant
- bear
- bee
- bird
- bug
- bunny
- butterfly
- cat
- chicken (animal)
- cow
- deer
- dog
- donkey
- duck
- elephant
- fish (animal)
- frog
- 

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
dtype: object

# Dataset: By Word Summary

- Language: English (American)
- Form: WS (Words and Sentences)
- Age: 16 - 30 (Months)

In [50]:
df2 = pd.read_csv('data_WS/wordbank_by_word_summary_data_englishAmerican_WS.csv')

In [51]:
df2.head(10)

Unnamed: 0,downloaded,item_id,item_definition,category,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
0,2023-08-30,1,baa baa,sounds,0.47,0.52,0.56,0.67,0.71,0.7,0.68,0.78,0.78,0.8,0.8,0.84,0.89,0.8,0.82
1,2023-08-30,2,choo choo,sounds,0.21,0.32,0.39,0.52,0.58,0.6,0.65,0.7,0.76,0.84,0.81,0.85,0.92,0.85,0.89
2,2023-08-30,3,cockadoodledoo,sounds,0.12,0.11,0.15,0.23,0.25,0.31,0.34,0.37,0.44,0.48,0.5,0.54,0.71,0.54,0.6
3,2023-08-30,4,grrr,sounds,0.48,0.49,0.55,0.64,0.65,0.67,0.65,0.74,0.77,0.78,0.76,0.8,0.9,0.79,0.82
4,2023-08-30,5,meow,sounds,0.41,0.48,0.56,0.68,0.71,0.78,0.77,0.81,0.86,0.89,0.88,0.92,0.96,0.91,0.93
5,2023-08-30,6,moo,sounds,0.5,0.52,0.62,0.75,0.76,0.78,0.79,0.85,0.88,0.88,0.88,0.92,0.95,0.92,0.93
6,2023-08-30,7,ouch,sounds,0.34,0.41,0.51,0.61,0.64,0.67,0.7,0.77,0.78,0.86,0.82,0.83,0.93,0.86,0.88
7,2023-08-30,8,quack quack,sounds,0.4,0.39,0.49,0.6,0.61,0.7,0.67,0.77,0.79,0.83,0.84,0.85,0.93,0.85,0.9
8,2023-08-30,9,uh oh,sounds,0.71,0.68,0.78,0.87,0.87,0.89,0.88,0.88,0.9,0.93,0.9,0.93,0.97,0.91,0.89
9,2023-08-30,10,vroom,sounds,0.38,0.4,0.51,0.57,0.57,0.61,0.6,0.69,0.7,0.75,0.71,0.7,0.86,0.72,0.74


In [52]:
#Number of words in the dataset: 

df2['item_definition'].nunique()

680

In [53]:
#Printed all unique values (all words) in the dataset:

df2['item_definition'].unique()

array(['baa baa', 'choo choo', 'cockadoodledoo', 'grrr', 'meow', 'moo',
       'ouch', 'quack quack', 'uh oh', 'vroom', 'woof woof', 'yum yum',
       'alligator', 'animal', 'ant', 'bear', 'bee', 'bird', 'bug',
       'bunny', 'butterfly', 'cat', 'chicken (animal)', 'cow', 'deer',
       'dog', 'donkey', 'duck', 'elephant', 'fish (animal)', 'frog',
       'giraffe', 'goose', 'hen', 'horse', 'kitty', 'lamb', 'lion',
       'monkey', 'moose', 'mouse', 'owl', 'penguin', 'pig', 'pony',
       'puppy', 'rooster', 'sheep', 'squirrel', 'teddybear', 'tiger',
       'turkey', 'turtle', 'wolf', 'zebra', 'airplane', 'bicycle', 'boat',
       'bus', 'car', 'firetruck', 'helicopter', 'motorcycle', 'sled',
       'stroller', 'tractor', 'train', 'tricycle', 'truck', 'ball',
       'balloon', 'bat', 'block', 'book', 'bubbles', 'chalk', 'crayon',
       'doll', 'game', 'glue', 'pen', 'pencil', 'play dough', 'present',
       'puzzle', 'story', 'toy (object)', 'apple', 'applesauce', 'banana',
       'be

## New dataset with all words in each category:

In [9]:
words_by_category = df2.groupby ('category')['item_definition'].agg(list).reset_index()

In [11]:
words_by_category = words_by_category.rename(columns={'item_definition': 'aggregated_words'})

In [17]:
words_by_category

Unnamed: 0,category,aggregated_words
0,action_words,"[bite, blow, break, bring, build, bump, buy, c..."
1,animals,"[alligator, animal, ant, bear, bee, bird, bug,..."
2,body_parts,"[ankle, arm, belly button, buttocks/bottom*, c..."
3,clothing,"[beads, belt, bib, boots, button, coat, diaper..."
4,connecting_words,"[and, because, but, if, so, then]"
5,descriptive_words,"[all gone, asleep, awake, bad, better, big, bl..."
6,food_drink,"[apple, applesauce, banana, beans, bread, butt..."
7,furniture_rooms,"[basement, bathroom, bathtub, bed, bedroom, be..."
8,games_routines,"[bath, breakfast, bye, call (on phone), dinner..."
9,helping_verbs,"[am, are, be, can (auxiliary), could, did/did ..."


In [20]:
cat_animals = words_by_category[words_by_category['category'] == 'animals']

In [58]:
#Printed all categories: 

column = 'category'
values = words_by_category[column]

for value in values:
    print(value)

action_words
animals
body_parts
clothing
connecting_words
descriptive_words
food_drink
furniture_rooms
games_routines
helping_verbs
household
locations
outside
people
places
pronouns
quantifiers
question_words
sounds
time_words
toys
vehicles


In [59]:
#Printed of each category and their words:


def show_content(row):
    print(row.category)
    for w in row.aggregated_words:
        print(f"- {w}")
    print()
words_by_category.apply(show_content, axis=1)

action_words
- bite
- blow
- break
- bring
- build
- bump
- buy
- carry
- catch
- chase
- clap
- clean (action)
- climb
- close
- cook
- cover
- cry
- cut
- dance
- draw
- drink (action)
- drive
- drop
- dry (action)
- dump
- eat
- fall
- feed
- find
- finish
- fit
- fix
- get
- give
- go
- hate
- have
- hear
- help
- hide
- hit
- hold
- hug
- hurry
- jump
- kick
- kiss
- knock
- lick
- like
- listen
- look
- love
- make
- open
- paint
- pick
- play
- pour
- pretend
- pull
- push
- put
- read
- ride
- rip
- run
- say
- see
- shake
- share
- show
- sing
- sit
- skate
- sleep
- slide (action)
- smile
- spill
- splash
- stand
- stay
- stop
- sweep
- swim
- swing (action)
- take
- talk
- taste
- tear
- think
- throw
- tickle
- touch
- wait
- wake
- walk
- wash
- watch (action)
- wipe
- wish
- work (action)
- write

animals
- alligator
- animal
- ant
- bear
- bee
- bird
- bug
- bunny
- butterfly
- cat
- chicken (animal)
- cow
- deer
- dog
- donkey
- duck
- elephant
- fish (animal)
- frog
- 

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
dtype: object