<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Packages" data-toc-modified-id="Packages-0.1"><span class="toc-item-num">0.1&nbsp;&nbsp;</span>Packages</a></span></li><li><span><a href="#Load-data-set" data-toc-modified-id="Load-data-set-0.2"><span class="toc-item-num">0.2&nbsp;&nbsp;</span>Load data set</a></span></li></ul></li><li><span><a href="#Consonant-Stats" data-toc-modified-id="Consonant-Stats-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Consonant Stats</a></span><ul class="toc-item"><li><span><a href="#Total-consonant-stats" data-toc-modified-id="Total-consonant-stats-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Total consonant stats</a></span></li><li><span><a href="#Words-with-consonant-stats" data-toc-modified-id="Words-with-consonant-stats-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Words with consonant stats</a></span></li><li><span><a href="#Treemap-graphs" data-toc-modified-id="Treemap-graphs-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Treemap graphs</a></span></li></ul></li><li><span><a href="#CV-Stats" data-toc-modified-id="CV-Stats-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>CV Stats</a></span><ul class="toc-item"><li><span><a href="#Total-syllable-stats" data-toc-modified-id="Total-syllable-stats-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Total syllable stats</a></span></li><li><span><a href="#Words-with-syllable-stats" data-toc-modified-id="Words-with-syllable-stats-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Words with syllable stats</a></span></li><li><span><a href="#Treemap-graphs" data-toc-modified-id="Treemap-graphs-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Treemap graphs</a></span></li></ul></li></ul></div>

## Packages

In [1]:
import pandas as pd
import re
from collections import Counter
import matplotlib.pyplot as plt
import squarify

## Load data set

In [2]:
# Location of the CSV that holds the word list; resolved relative to the
# notebook's working directory.
file_name = "PtLanka.csv"

# Read the whole table into a DataFrame and show it as the cell output.
word_df = pd.read_csv(filepath_or_buffer=file_name)
word_df

FileNotFoundError: [Errno 2] No such file or directory: 'PtLanka.csv'

In [None]:
# Build one string per word in which every character of the transcription is
# separated by a single space, e.g. "abc" -> "a b c".
#
# Missing transcriptions are dropped first: str() of a missing value is the
# text "nan", which would otherwise be split into "n a n" and inflate the
# counts of real segments further down.
transcriptions = word_df["phonetic_transcription"].dropna()
word_list = [" ".join(str(entry)) for entry in transcriptions]
word_list

# Consonant Stats

In [None]:
# The character-by-character split above also separates modifier characters
# from the symbol they belong to. The substitutions below are applied in order
# to every word:
#   1-2. glue "ː" and "ʲ" back onto the preceding symbol,
#   3-4. rebuild the two affricates whose parts were split apart,
#   5-6. pad each word with one leading and one trailing space so that a
#        search for " X " can also match the first and last symbol.
# NOTE(review): the first lookbehind class "[aeiout]" also contains "t" —
# confirm that a long "tː" is intended here.
_substitutions = [
    ("(?<=[aeiout]) ː", "ː"),
    ("(?<=[nl]) ʲ", "ʲ"),
    ("d ̠ ʒ", "d̠ʒ"),
    ("t ̠ ʃ", "t̠ʃ"),
    ("^", " "),
    ("$", " "),
]
for _pattern, _replacement in _substitutions:
    word_list = [re.sub(_pattern, _replacement, entry) for entry in word_list]

word_list[0:10]

In [None]:
# Symbols that are not consonants: the vowels (short and long), schwa and the
# hyphen. They are excluded from the consonant statistics.
vowel_punct = ["a","aː","e","eː","i","iː","o","oː","u","uː","ə","-"]

# Flatten all words into one list of space-separated tokens, skipping the
# empty strings produced by the padding and everything listed in vowel_punct.
word_list_split = [
    token
    for entry in word_list
    for token in entry.split(" ")
    if token and token not in vowel_punct
]

## Total consonant stats

In [None]:
# Tally every consonant token and turn the tally into a two-column table
# ("IPA", "count"), ordered from most to least frequent with a fresh index.
counts = Counter(word_list_split)
df = (
    pd.DataFrame.from_dict(counts, orient="index")
    .reset_index()
    .set_axis(["IPA", "count"], axis="columns")
    .sort_values(by="count", ascending=False)
    .reset_index(drop=True)
)
df

## Words with consonant stats

In [None]:
# For each consonant, count the words that contain it at least once. Words in
# word_list are space-padded, so searching for " X " matches X only as a whole
# token, never as part of a longer one.
IPA_list = list(df["IPA"])
needles = [" " + symbol + " " for symbol in IPA_list]
in_word_list = [
    sum(needle in padded_word for padded_word in word_list)
    for needle in needles
]

df["word_count"] = in_word_list
# Ready-made "symbol : number" captions for the treemaps.
df["IPA_count_label"] = df["IPA"] + " : " + df["count"].astype(str)
df["IPA_word_count_label"] = df["IPA"] + " : " + df["word_count"].astype(str)
df

## Treemap graphs

In [None]:
# Treemap of consonant frequencies: tile area and caption both come from the
# total number of occurrences ("count").
labels = list(df["IPA_count_label"])
sizes = list(df["count"])

# Create the figure and its single axes first and hand that axes to squarify.
# Adding another subplot after plotting would put a second, empty axes on top
# of the treemap, and turning the axis off would then hit the wrong one.
fig, ax = plt.subplots(figsize=(16, 16))
squarify.plot(sizes, label=labels, alpha=0.6, pad=True, ax=ax)
ax.axis('off')
plt.show()

In [None]:
# Treemap of the number of words containing each consonant. Tile area and
# caption are both taken from "word_count" so the picture matches the numbers
# it displays. The rows are re-ranked by that column because squarify lays
# tiles out best when the sizes arrive in descending order.
ranked = df.sort_values(by="word_count", ascending=False)
labels = list(ranked["IPA_word_count_label"])
sizes = list(ranked["word_count"])

# Create the figure and its single axes first and hand that axes to squarify.
# Adding another subplot after plotting would put a second, empty axes on top
# of the treemap, and turning the axis off would then hit the wrong one.
fig, ax = plt.subplots(figsize=(16, 16))
squarify.plot(sizes, label=labels, alpha=0.6, pad=True, ax=ax)
ax.axis('off')
plt.show()

# CV Stats

## Load dataset

In [None]:
# Location of the CSV with the CV (consonant/vowel) skeletons; resolved
# relative to the notebook's working directory. From here on file_name and
# word_df refer to this second data set.
file_name = "crioulo_IPA_CV.csv"

# Read the whole table into a DataFrame and show it as the cell output.
word_df = pd.read_csv(filepath_or_buffer=file_name)
word_df

In [None]:
# One CV string per row of the table.
word_list_CV = list(word_df["phono_CV"])

# Break every entry on "." and then on spaces, and collect all non-empty
# pieces into a single flat list of syllable shapes.
word_list_split = [
    piece
    for entry in word_list_CV
    for chunk in entry.split(".")
    for piece in chunk.split(" ")
    if piece
]

## Total syllable stats

In [None]:
# Tally every syllable shape and turn the tally into a two-column table
# ("CV", "count"), ordered from most to least frequent with a fresh index.
counts = Counter(word_list_split)
df = (
    pd.DataFrame.from_dict(counts, orient="index")
    .reset_index()
    .set_axis(["CV", "count"], axis="columns")
    .sort_values(by="count", ascending=False)
    .reset_index(drop=True)
)
df

## Words with syllable stats

In [None]:
# Prepare the CV strings for whole-token searching: turn every "." into a
# space, then pad each entry with one leading and one trailing space so that
# " X " can also match the first and last token.
# The dot is replaced literally with str.replace; writing it as the regex
# "\." in a normal (non-raw) string is an invalid escape sequence that newer
# Python versions warn about.
word_list_CV = [entry.replace(".", " ") for entry in word_list_CV]
word_list_CV = [re.sub("^", " ", entry) for entry in word_list_CV]
word_list_CV = [re.sub("$", " ", entry) for entry in word_list_CV]

In [None]:
# For each syllable shape, count the entries that contain it at least once.
# Entries in word_list_CV are space-padded, so searching for " X " matches X
# only as a whole token, never as part of a longer one.
CV_list = list(df["CV"])
in_word_list = []
for shape in CV_list:
    # Build the search string once per shape rather than once per entry.
    needle = " " + shape + " "
    in_word_list.append(sum(needle in entry for entry in word_list_CV))

df["word_count"] = in_word_list
# Ready-made "shape : number" captions for the treemaps.
df["CV_count_label"] = df["CV"] + " : " + df["count"].astype(str)
df["CV_word_count_label"] = df["CV"] + " : " + df["word_count"].astype(str)
df

## Treemap graphs

In [None]:
# Treemap of syllable-shape frequencies: tile area and caption both come from
# the total number of occurrences ("count").
labels = list(df["CV_count_label"])
sizes = list(df["count"])

# Create the figure and its single axes first and hand that axes to squarify.
# Adding another subplot after plotting would put a second, empty axes on top
# of the treemap, and turning the axis off would then hit the wrong one.
fig, ax = plt.subplots(figsize=(16, 16))
squarify.plot(sizes, label=labels, alpha=0.6, pad=True, ax=ax)
ax.axis('off')
plt.show()

In [None]:
# Treemap of the number of entries containing each syllable shape. Tile area
# and caption are both taken from "word_count" so the picture matches the
# numbers it displays. The rows are re-ranked by that column because squarify
# lays tiles out best when the sizes arrive in descending order.
ranked = df.sort_values(by="word_count", ascending=False)
labels = list(ranked["CV_word_count_label"])
sizes = list(ranked["word_count"])

# Create the figure and its single axes first and hand that axes to squarify.
# Adding another subplot after plotting would put a second, empty axes on top
# of the treemap, and turning the axis off would then hit the wrong one.
fig, ax = plt.subplots(figsize=(16, 16))
squarify.plot(sizes, label=labels, alpha=0.6, pad=True, ax=ax)
ax.axis('off')
plt.show()

In [None]:
# Show the rows whose CV string contains the sequence "CVCVC".
# NOTE(review): the search runs on the raw column text, so any "." separators
# in phono_CV are part of what is matched — confirm this is the intended scope.
has_cvcvc = word_df["phono_CV"].str.contains("CVCVC")
word_df[has_cvcvc]