In [44]:
import pandas as pd
import numpy as np
import re

In [45]:
pd.set_option("display.max_columns", 85)
pd.set_option("display.max_rows", 85)

### Talkapalooza - TL BL SS 23

# Data preparation:

As a source of audio files for single word recordings, I downloaded the English language dataset from [Lingualibre's Languages Gallery](https://lingualibre.org/wiki/LinguaLibre:About). LinguaLibre is a project by Wikimédia France to collectively and collaboratively preserve and provide oral recordings of languages across the globe.

The English dataset consists of recordings provided by 109 unique speakers, amassing about 33 100 recordings covering 29 000 unique words. About 75% of recordings were provided by male speakers.

To have an overview over the pronunciations provided, as well as to know for which there are several recordings, some data preparation and analysis has to be performed.

## Exploring the dataset:

The dataset is downloaded as a folder with several subdirectories and files. Each subdirectory represents a user's recordings and file names describe the word recorded.

### You better walk that OS

In [46]:
import os

root = "../.source/lingualibre_en/"

In [58]:
recordings_df = pd.DataFrame(columns=['path', 'name'])
recordings_df.head()

Unnamed: 0,path,name


Treat subdirs?

In [69]:
for path, subdirs, files in os.walk(root):
    for name in files:
        temp_df = pd.DataFrame([[path, name]], columns=['path', 'name'])
        recordings_df= pd.concat([recordings_df, temp_df])

In [93]:
recordings_df = recordings_df.reset_index()
recordings_df = recordings_df.drop('index', axis=1)

In [102]:
recordings_df.shape

(31756, 2)

In [103]:
recordings_df[::50].head(20)

Unnamed: 0,path,name
0,../.source/lingualibre_en/Eric.LEWIN,TBD.ogg
50,../.source/lingualibre_en/Fredericknoronha,Rosary College of Commerce and Arts.ogg
100,../.source/lingualibre_en/Opsylac,they.ogg
150,../.source/lingualibre_en/I learned some phrases,chary.ogg
200,../.source/lingualibre_en/I learned some phrases,sear.ogg
250,../.source/lingualibre_en/I learned some phrases,rusts.ogg
300,../.source/lingualibre_en/I learned some phrases,obligatory.ogg
350,../.source/lingualibre_en/I learned some phrases,quintessential.ogg
400,../.source/lingualibre_en/I learned some phrases,philately.ogg
450,../.source/lingualibre_en/I learned some phrases,fresco.ogg


### Clean up

Use regex to clean up path and file names. 

In [135]:
pattern_path = re.compile(".*/lingualibre_en/")
pattern_name = re.compile("\.ogg")

recordings_df['path'] =  [re.sub(pattern_path, '', str(x)) for x in recordings_df['path']]
recordings_df['name'] =  [x.lower() for x in recordings_df['name']]
recordings_df['name'] =  [re.sub(pattern_name, '', str(x)) for x in recordings_df['name']]

In [136]:
recordings_df.head()

Unnamed: 0,path,name
22125,Acadienenexil,dragon
22124,Acadienenexil,tomorrow
22126,Acadienenexil,winner
24542,Accuratecy051,see
30945,AcpoKrane,navarrese


In [137]:
recordings_df = recordings_df.sort_values(by=['path', 'name'])

In [142]:
recordings_df.head()

Unnamed: 0,path,name
22125,Acadienenexil,dragon
22124,Acadienenexil,tomorrow
22126,Acadienenexil,winner
24542,Accuratecy051,see
30940,AcpoKrane,disproportion


In [143]:
recordings_df = recordings_df.reset_index().drop('index', axis=1)
recordings_df.head()

Unnamed: 0,path,name
0,Acadienenexil,dragon
1,Acadienenexil,tomorrow
2,Acadienenexil,winner
3,Accuratecy051,see
4,AcpoKrane,disproportion


### Insights

Unique users, find counts of unique words. Ideally we want to work with terms that have multiple recordings available.

In [144]:
users = recordings_df["path"].unique()
users.shape

(104,)

Subdirectories are also present (in any case negligible data for us).

In [148]:
print(*users, sep=" | ")

Acadienenexil | Accuratecy051 | AcpoKrane | Adithyak1997 | Adélaïde Calais WMFr | Ajshul | Ali Farhi | AlpOktem | Amaranorah | Arlo Barnes | Atudu | BANFECE | BANFECE (Sarah Han) | Back ache | Back ache/Attribution | Back ache/singer | Benluks | Benoît Prieur | Berrely | Bibisuccess | Bluerasberry | Brazal.dang | Brian-emurse | Btrombley | CKali | Clifflandis | Cloud atlas | Commander Keane | Daberechi16 | Daniel Mietchen | Darmo117 | Eric.LEWIN | Exilexi | Fake estate | Foobarista2 | Frankincense Diala | Fredericknoronha | Gibraltar Rocks | Graycier | Greenman | Grendelkhan | I JethroBT | I JethroBT/42nd Precinct  | I learned some phrases | Izundu Okechukwu | Jjamesryan | Jmpmann | John Adams VI | John.d.new77 | Jshlanta | Julie Samothrace | Justinrleung | Kasyap | Kateregga1 | Lepticed7 | Lirazelf | Ltrlg | Lyokoï | Mathieu Kappler | Maxewan | Middle river exports (عُثمان) | Mélange a trois | Mélody Xu YANG WMFr | Nattes à chat | Oge ogu | Onuugu | Opsylac | Opsylac (Marie-Sarah) | P

In [162]:
recordings_df["name"].unique()

array(['dragon', 'tomorrow', 'winner', ..., 'ogbono soup',
       'asad ali palijo', 'اسد علي پليجو'], dtype=object)

In [163]:
words = recordings_df["name"].value_counts()
words.shape

(27549,)

In [184]:
words = pd.DataFrame(words).reset_index().rename(columns={'index': 'word', 'name': 'count'})

In [185]:
words[::2500].head()

Unnamed: 0,word,count
0,thin,10
2500,quadratic,2
5000,hyphenate,1
7500,fabaceous,1
10000,sny,1


### Reshape

Reshape by looking at words, include count and usernames per word.

In [241]:
words_df = pd.DataFrame(columns=['word', 'count', 'user'])
words_df.head()

Unnamed: 0,word,count,user


In [242]:
words_df[['word', 'count']] = words
print(words_df.shape)
words_df.head()

(27549, 3)


Unnamed: 0,word,count,user
0,thin,10,
1,long,9,
2,he,9,
3,what,9,
4,that,9,


In [213]:
# words_df['user'] = recordings_df['path'].where(words_df['word'] == recordings_df['name'], words_df['user'] + recordings_df['path'])
# filter = (words_df['word'] == recordings_df['name'])
# recordings_df[filter]
# words_df.merge(recordings_df, left_on='user', right_on='path', how='left')

In [245]:
for word in words_df['word']:
    filter = (recordings_df['name'] == word)
    users = recordings_df[filter]['path']
    filter = (words_df['word'] == word)
    words_df[filter].loc['user'] = ';'.join(list(users))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words_df[filter].loc['user'] = ';'.join(list(users))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words_df[filter].loc['user'] = ';'.join(list(users))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words_df[filter].loc['user'] = ';'.join(list(users))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words_df[

KeyboardInterrupt: 

In [244]:
words_df.head()

Unnamed: 0,word,count,user
0,thin,10,
1,long,9,
2,he,9,
3,what,9,
4,that,9,


### Examine counts

In [195]:
filter_5 = (words_df['count'] >= 5)
filter_4 = (words_df['count'] >= 4)
filter_3 = (words_df['count'] >= 3)
filter_2 = (words_df['count'] >= 2)

In [198]:
print(words_df[filter_5].shape)
words_df[filter_5][::20].head(10)


(122, 3)


Unnamed: 0,word,count,user
0,thin,10,
20,other,8,
40,many,7,
60,root,6,
80,with,5,
100,tongue,5,
120,drooling,5,


In [197]:
print(words_df[filter_4].shape)
print(words_df[filter_3].shape)
print(words_df[filter_2].shape)
words_df[filter_2][::20].head(10)

(225, 3)
(574, 3)
(3148, 3)


Unnamed: 0,word,count,user
0,thin,10,
20,other,8,
40,many,7,
60,root,6,
80,with,5,
100,tongue,5,
120,drooling,5,
140,black,4,
160,to smell,4,
180,to eat,4,
