In [89]:
import pandas as pd
import numpy as np
import re

In [90]:
# Let pandas render up to 85 columns and 85 rows before truncating a displayed frame.
pd.options.display.max_columns = 85
pd.options.display.max_rows = 85

### Talkapalooza - TL BL SS 23

# Data preparation:

As a source of audio files for single word recordings, I downloaded the English language dataset from [Lingualibre's Languages Gallery](https://lingualibre.org/wiki/LinguaLibre:About). LinguaLibre is a project by Wikimédia France to collectively and collaboratively preserve and provide oral recordings of languages across the globe.

The English dataset consists of recordings provided by 109 unique speakers, amassing about 33 100 recordings covering 29 000 unique words. About 75% of recordings were provided by male speakers.

To get an overview of the pronunciations provided, and to find out which words have several recordings, some data preparation and analysis is needed.

## Exploring the dataset:

The dataset is downloaded as a folder with several subdirectories and files. Each subdirectory represents a user's recordings and file names describe the word recorded.

### You better walk that OS

In [91]:
import os

# Root folder of the downloaded LinguaLibre English dataset, relative to the
# notebook's working directory. Each subdirectory holds one user's recordings.
root = "../.source/lingualibre_en/"

In [92]:
# Empty frame with the two fields collected per audio file:
# 'path' (directory the file sits in) and 'name' (the file name).
recordings_df = pd.DataFrame(columns=['path', 'name'])
recordings_df.head()

Unnamed: 0,path,name


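**TODO:** decide how to treat nested subdirectories — `os.walk` descends into them, so they show up as separate paths.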

In [93]:
# Walk the dataset folder recursively and record one (path, name) row per file.
# The rows are gathered in a plain list and the DataFrame is built once at the
# end: calling pd.concat inside the loop re-copies every previously collected
# row on each iteration, which is quadratic in the number of files (~30k here).
file_records = [
    (path, name)
    for path, subdirs, files in os.walk(root)
    for name in files
]
recordings_df = pd.DataFrame(file_records, columns=['path', 'name'])

In [94]:
# Give the rows a clean 0..n-1 index and discard the previous index values.
recordings_df = recordings_df.reset_index(drop=True)

In [95]:
# (number of files found, number of columns)
recordings_df.shape

(31756, 2)

In [96]:
# Peek at every 50th row (at most 20 of them) rather than only the first few.
recordings_df[::50].head(20)

Unnamed: 0,path,name
0,../.source/lingualibre_en/Eric.LEWIN,TBD.ogg
50,../.source/lingualibre_en/Fredericknoronha,Rosary College of Commerce and Arts.ogg
100,../.source/lingualibre_en/Opsylac,they.ogg
150,../.source/lingualibre_en/I learned some phrases,chary.ogg
200,../.source/lingualibre_en/I learned some phrases,sear.ogg
250,../.source/lingualibre_en/I learned some phrases,rusts.ogg
300,../.source/lingualibre_en/I learned some phrases,obligatory.ogg
350,../.source/lingualibre_en/I learned some phrases,quintessential.ogg
400,../.source/lingualibre_en/I learned some phrases,philately.ogg
450,../.source/lingualibre_en/I learned some phrases,fresco.ogg


### Clean up

Use regex to clean up path and file names. 

In [97]:
# Raw strings avoid the invalid "\." escape sequence of the plain literal, and
# the "$" anchor strips only a trailing ".ogg" extension instead of any ".ogg"
# that happens to occur inside a file name.
pattern_path = re.compile(r".*/lingualibre_en/")
pattern_name = re.compile(r"\.ogg$")

# Keep only the part of the path after the dataset root (the user folder,
# possibly followed by nested subfolders).
recordings_df['path'] = (
    recordings_df['path']
    .astype(str)
    .str.replace(pattern_path, '', regex=True)
)

# Lower-case first so that an upper-case extension is stripped as well.
recordings_df['name'] = (
    recordings_df['name']
    .astype(str)
    .str.lower()
    .str.replace(pattern_name, '', regex=True)
)

In [98]:
# Constant helper column: every row stands for exactly one recording file.
recordings_df['count'] = 1
recordings_df.head()

Unnamed: 0,path,name,count
0,Eric.LEWIN,tbd,1
1,Fredericknoronha,central bank of india calangute branch,1
2,Fredericknoronha,"icici bank limited, calangute, goa branch",1
3,Fredericknoronha,curtorim vidhan sabha constituency,1
4,Fredericknoronha,vagator,1


In [99]:
# Order the rows by user folder first, then by recorded name.
recordings_df = recordings_df.sort_values(by=['path', 'name'])

In [100]:
# Renumber the rows to match the sorted order; the old index is discarded.
recordings_df = recordings_df.reset_index(drop=True)
recordings_df.head()

Unnamed: 0,path,name,count
0,Acadienenexil,dragon,1
1,Acadienenexil,tomorrow,1
2,Acadienenexil,winner,1
3,Accuratecy051,see,1
4,AcpoKrane,disproportion,1


### Insights

List the unique users and count the recordings per unique word. Ideally we want to work with terms that have multiple recordings available.

In [101]:
# Distinct 'path' values: one per user folder, plus any nested subfolders.
users = recordings_df['path'].unique()
users.shape

(104,)

Subdirectories are also present (in any case negligible data for us).

In [102]:
# Print all user folder names on one line, separated by " | ".
print(" | ".join(users))

Acadienenexil | Accuratecy051 | AcpoKrane | Adithyak1997 | Adélaïde Calais WMFr | Ajshul | Ali Farhi | AlpOktem | Amaranorah | Arlo Barnes | Atudu | BANFECE | BANFECE (Sarah Han) | Back ache | Back ache/Attribution | Back ache/singer | Benluks | Benoît Prieur | Berrely | Bibisuccess | Bluerasberry | Brazal.dang | Brian-emurse | Btrombley | CKali | Clifflandis | Cloud atlas | Commander Keane | Daberechi16 | Daniel Mietchen | Darmo117 | Eric.LEWIN | Exilexi | Fake estate | Foobarista2 | Frankincense Diala | Fredericknoronha | Gibraltar Rocks | Graycier | Greenman | Grendelkhan | I JethroBT | I JethroBT/42nd Precinct  | I learned some phrases | Izundu Okechukwu | Jjamesryan | Jmpmann | John Adams VI | John.d.new77 | Jshlanta | Julie Samothrace | Justinrleung | Kasyap | Kateregga1 | Lepticed7 | Lirazelf | Ltrlg | Lyokoï | Mathieu Kappler | Maxewan | Middle river exports (عُثمان) | Mélange a trois | Mélody Xu YANG WMFr | Nattes à chat | Oge ogu | Onuugu | Opsylac | Opsylac (Marie-Sarah) | P

In [103]:
# Distinct recorded names (single words as well as longer phrases).
recordings_df["name"].unique()

array(['dragon', 'tomorrow', 'winner', ..., 'ogbono soup',
       'asad ali palijo', 'اسد علي پليجو'], dtype=object)

In [104]:
# Number of recordings per distinct name; .shape then gives the number of distinct names.
words = recordings_df["name"].value_counts()
words.shape

(27549,)

In [105]:
# One row per distinct word with its number of recordings, most frequent first.
# The column names are set explicitly instead of renaming whatever
# reset_index() happens to produce: pandas 2.x names the value_counts() result
# "count" and its index "name", so rename({'index': 'word', 'name': 'count'})
# would leave two "count" columns there. Recomputing from recordings_df also
# keeps this cell safe to re-run.
words = (
    recordings_df['name']
    .value_counts()
    .rename_axis('word')
    .reset_index(name='count')
)

In [106]:
# Sample every 2500th row of the frequency table to see how the counts fall off.
words[::2500].head()

Unnamed: 0,word,count
0,thin,10
2500,quadratic,2
5000,hyphenate,1
7500,fabaceous,1
10000,sny,1


### Reshape

Reshape the data to one row per word, including the recording count and the usernames for each word.

In [107]:
# Start from an empty frame; its columns are filled in by the following cells.
words_df = pd.DataFrame()

In [108]:
# Per-word row counts. The grouped result has the columns (name, path, count),
# which are written to ('word', 'count', 'user') in that order, so 'count' and
# 'user' both hold the number of recordings here; 'user' is only a placeholder
# until it is overwritten with the actual user names.
words_df[['word', 'count', 'user']] = recordings_df.groupby('name', as_index=False).count()

In [109]:
# The 400 most frequently recorded words.
words_df.sort_values(by='count', ascending=False).head(400)

Unnamed: 0,word,count,user
24287,thin,10,10
26541,what,9,9
11569,he,9,9
24275,they,9,9
24317,this,9,9
...,...,...,...
8831,even,3,3
24551,to sit,3,3
16405,near,3,3
1286,an,3,3


In [110]:
# Clear the placeholder counts, then store the per-word groups of user folders.
# NOTE(review): a groupby object is assigned directly to a column here. As the
# displayed frame shows, each cell ends up holding a (word, users) pair, which
# still has to be flattened into a plain string afterwards.
words_df['user'] = ''
words_df['user'] = (
    recordings_df.groupby(
        ['name'],
        as_index=False
    )['path']
)

In [111]:
# Inspect the raw content now stored in the 'user' column.
words_df.head()

Unnamed: 0,word,count,user
0,town hall police station,1,"( town hall police station, [I JethroBT/42nd P..."
1,&,1,"(&, [Back ache])"
2,'an,1,"('an, [Arlo Barnes])"
3,'d,1,"('d, [She animates])"
4,'hara,1,"('hara, [Arlo Barnes])"


In [115]:
# Keep a handle on the 'user' column for the flattening step and peek at it.
user_vals = words_df['user']
user_vals[:5]

0    ( town hall police station, [I JethroBT/42nd P...
1                                     (&, [Back ache])
2                                 ('an, [Arlo Barnes])
3                                 ('d, [She animates])
4                               ('hara, [Arlo Barnes])
Name: user, dtype: object

In [116]:
# Flatten each (word, users) pair into one ';'-separated string of user names.
# A single column-wise map replaces the per-row .loc writes of the former loop
# (whose loop variable was unused); the result is aligned on the same index.
words_df['user'] = user_vals.map(lambda pair: ';'.join(pair[1]))

In [121]:
# Top 10 words by number of recordings, now with the joined user names.
words_df.sort_values(by='count', ascending=False).head(10)

Unnamed: 0,word,count,user
24287,thin,10,Back ache;Exilexi;Fake estate;Greenman;Lirazel...
26541,what,9,Back ache;Exilexi;Greenman;Lirazelf;Mathieu Ka...
11569,he,9,Back ache;Greenman;Lirazelf;Mathieu Kappler;Mé...
24275,they,9,Back ache;Exilexi;Greenman;Lirazelf;Nattes à c...
24317,this,9,Back ache;Exilexi;Greenman;Lirazelf;Mathieu Ka...
14592,long,9,Back ache;Fake estate;Greenman;Lirazelf;Nattes...
24195,that,9,Back ache;Greenman;Lirazelf;Mathieu Kappler;Na...
22600,some,8,Back ache;Exilexi;Greenman;Lirazelf;Nattes à c...
16765,not,8,Back ache;Greenman;Lirazelf;Mathieu Kappler;Na...
27041,you,8,Back ache;Greenman;Lirazelf;Mathieu Kappler;Mé...


In [120]:
# Full, untruncated user list of the row at position 24287 (the most recorded word, 'thin').
words_df.iloc[24287]['user']

'Back ache;Exilexi;Fake estate;Greenman;Lirazelf;Nattes à chat;Opsylac;Simplificationalizer;SpringProof;Vivekteraiyavt'

### Examine counts

In [122]:
# Boolean masks selecting the words that have at least 5, 4, 3 and 2 recordings.
filter_5, filter_4, filter_3, filter_2 = (
    words_df['count'].ge(minimum) for minimum in (5, 4, 3, 2)
)

In [127]:
# Size of the ">= 5 recordings" subset, then every 20th row of it (at most 10),
# ordered from most to least recorded.
print(words_df[filter_5].shape)
words_df[filter_5].sort_values(by='count', ascending=False)[::20].head(10)

(122, 3)


Unnamed: 0,word,count,user
24287,thin,10,Back ache;Exilexi;Fake estate;Greenman;Lirazel...
4603,child,8,Back ache;Greenman;Lepticed7;Lirazelf;Nattes à...
9433,few,7,Back ache;Exilexi;Greenman;Lirazelf;Nattes à c...
10007,four,6,Back ache;Greenman;Lirazelf;Nattes à chat;Spri...
26533,whale,5,Arlo Barnes;Back ache;Graycier;Lirazelf;Nattes...
9848,for,5,Back ache;Grendelkhan;Mathieu Kappler;Nattes à...
22233,skin,5,Back ache;Greenman;Lirazelf;Nattes à chat;Spri...


In [129]:
# Subset sizes for the ">= 4", ">= 3" and ">= 2" thresholds, followed by every
# 20th row (at most 10) of the ">= 2" subset, ordered from most to least recorded.
print(words_df[filter_4].shape)
print(words_df[filter_3].shape)
print(words_df[filter_2].shape)
words_df[filter_2].sort_values(by='count', ascending=False)[::20].head(10)

(225, 3)
(574, 3)
(3148, 3)


Unnamed: 0,word,count,user
24287,thin,10,Back ache;Exilexi;Fake estate;Greenman;Lirazel...
14953,man,8,Back ache;Greenman;Lirazelf;Nattes à chat;Opsy...
9433,few,7,Back ache;Exilexi;Greenman;Lirazelf;Nattes à c...
2904,bird,6,Back ache;Greenman;Lirazelf;Nattes à chat;Spri...
24405,thunder,5,Arlo Barnes;Back ache;Graycier;Lirazelf;Nattes...
6187,cruel,5,Arlo Barnes;Back ache;Graycier;Lirazelf;Nattes...
11396,hand,5,Back ache;Exilexi;Lirazelf;Nattes à chat;Sprin...
9250,fat,4,Back ache;Lirazelf;Nattes à chat;SpringProof
23853,tail,4,Back ache;Lirazelf;Nattes à chat;SpringProof
21972,should,4,Back ache;Nattes à chat;She animates;Vealhurl


## Cross Reference

### Read the csv used by Data Science team:

In [130]:
# Word-bank item data: the CSV used by the data science team.
wordbank_df = pd.read_csv('../.source/wordbank_item_data.csv')

In [135]:
# Dimensions and first rows of the word-bank table.
print(wordbank_df.shape)
wordbank_df.head()

(680, 19)


Unnamed: 0,downloaded,item_id,item_definition,category,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
0,2023-09-03,1,baa baa,sounds,0.47,0.52,0.56,0.67,0.71,0.7,0.68,0.78,0.78,0.8,0.8,0.84,0.89,0.8,0.82
1,2023-09-03,2,choo choo,sounds,0.21,0.32,0.39,0.52,0.58,0.6,0.65,0.7,0.76,0.84,0.81,0.85,0.92,0.85,0.89
2,2023-09-03,3,cockadoodledoo,sounds,0.12,0.11,0.15,0.23,0.25,0.31,0.34,0.37,0.44,0.48,0.5,0.54,0.71,0.54,0.6
3,2023-09-03,4,grrr,sounds,0.48,0.49,0.55,0.64,0.65,0.67,0.65,0.74,0.77,0.78,0.76,0.8,0.9,0.79,0.82
4,2023-09-03,5,meow,sounds,0.41,0.48,0.56,0.68,0.71,0.78,0.77,0.81,0.86,0.89,0.88,0.92,0.96,0.91,0.93


In [151]:
# Distinct item definitions from the word bank; print every 30th one as a sample.
wordbank_words = wordbank_df['item_definition'].unique()
print(wordbank_words[::30])

['baa baa' 'frog' 'firetruck' 'beans' 'juice' 'tuna' 'tights' 'basket'
 'penny' 'door' 'ladder' 'farm' 'grandma*' 'peekaboo' 'drink (action)'
 'listen' 'stand' 'blue' 'naughty' 'morning' 'you' 'over' 'does']


In [152]:
# Raw strings: '\*', '\s' and '\(' are invalid escape sequences in ordinary
# string literals and raise a warning on current Python versions.
pattern_asterisk = re.compile(r'\*')
pattern_indicator = re.compile(r'\s\(.*\)')

# Remove the '*' marker and any parenthesised qualifier together with its
# leading whitespace, e.g. 'grandma*' -> 'grandma', 'drink (action)' -> 'drink'.
# NOTE(review): entries that differ only in their qualifier collapse to the
# same string, so this list may contain duplicates - confirm whether that matters.
wordbank_words = [
    pattern_indicator.sub('', pattern_asterisk.sub('', str(item)))
    for item in wordbank_words
]

print(wordbank_words[::30])

['baa baa', 'frog', 'firetruck', 'beans', 'juice', 'tuna', 'tights', 'basket', 'penny', 'door', 'ladder', 'farm', 'grandma', 'peekaboo', 'drink', 'listen', 'stand', 'blue', 'naughty', 'morning', 'you', 'over', 'does']


In [161]:
# Mask of recorded words that also occur in the word bank, and its complement.
filter_wordbank = words_df['word'].isin(wordbank_words)
filter_wordbank_ext = ~filter_wordbank

In [173]:
# Number of recorded words found in the word bank, then the first few of them.
print(f"{filter_wordbank.sum()} words match in both")
words_df.loc[filter_wordbank].head()

477 words match in both


Unnamed: 0,word,count,user
188,a,4,Back ache;Lyokoï;Mathieu Kappler;Nattes à chat
319,about,3,Back ache;Nattes à chat;She animates
321,above,2,Back ache;Nattes à chat
826,after,4,Back ache;Nattes à chat;She animates;Simplific...
1077,all,7,Back ache;Greenman;Lirazelf;Nattes à chat;Opsy...


#### Export

In [214]:
# Export is disabled. Uncomment the line below to write the 'word' and 'count'
# columns of the cross-referenced words to ../output/wordbank_crossref.csv.
# words_df[filter_wordbank][['word', 'count']].to_csv('../output/wordbank_crossref.csv')

### Closer look at missing terms

In [180]:
# Word-bank entries for which no recording exists.
# The matched words are collected into a set once, instead of re-filtering
# words_df for every word-bank entry, and the list is built directly rather
# than through a comprehension that is run only for its append() side effect.
matched_words = set(words_df.loc[filter_wordbank, 'word'])
wordbank_missing = [w for w in wordbank_words if w not in matched_words]
print(len(wordbank_missing))

192


In [182]:
# Full list of word-bank entries without a matching recording.
print(wordbank_missing)

['baa baa', 'choo choo', 'cockadoodledoo', 'quack quack', 'uh oh', 'woof woof', 'yum yum', 'ant', 'bee', 'cow', 'donkey', 'frog', 'giraffe', 'goose', 'kitty', 'lamb', 'moose', 'mouse', 'owl', 'pony', 'puppy', 'rooster', 'teddybear', 'tiger', 'turtle', 'wolf', 'airplane', 'firetruck', 'helicopter', 'sled', 'stroller', 'tractor', 'tricycle', 'truck', 'balloon', 'bat', 'bubbles', 'chalk', 'doll', 'play dough', 'puzzle', 'toy', 'applesauce', 'beans', 'butter', 'candy', 'carrots', 'cereal', 'cheerios', 'coke', 'donut', 'green beans', 'gum', 'ice cream', 'jello', 'lollipop', 'muffin', 'noodles', 'nuts', 'peas', 'peanut butter', 'pickle', 'popsicle', 'potato chip', 'pretzel', 'raisin', 'sandwich', 'sauce', 'soda/pop', 'toast', 'tuna', 'vanilla', 'belt', 'bib', 'boots', 'button', 'diaper', 'gloves', 'mittens', 'pajamas', 'pants', 'slipper', 'sneaker', 'snowsuit', 'ankle', 'belly button', 'buttocks/bottom', 'cheek', 'chin', 'lips', 'owie/boo boo', 'tummy', 'vagina', 'blanket', 'broom', 'garbage

In [207]:
# Look up the entry for 'gotta' in the per-word table.
filter_gotta = words_df['word'].eq('gotta')

words_df.loc[filter_gotta]

Unnamed: 0,word,count,user
10922,gotta,2,Back ache;Vealhurl


In [209]:
# Look for recordings of the two-word form 'got to' among the individual files.
filter_gotto = recordings_df['name'].eq('got to')

recordings_df.loc[filter_gotto]

Unnamed: 0,path,name,count


In [210]:
# Look for recordings of the bare form 'got' among the individual files.
filter_got = recordings_df['name'].eq('got')

recordings_df.loc[filter_got]

Unnamed: 0,path,name,count
3823,Back ache,got,1
12593,Semperosculto,got,1
12828,She animates,got,1


#### Future notes

Should we make sure these terms are included, by splitting the word-bank strings at "/" and removing any trailing " to"?

If we want to have speaking exercises for these, that is - it is either slang ("gonna") or the bare infinitive form (vs. the more common/instinctual to-infinitive etc.).