In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the segment-level analysis table into `df`.
# The path is relative, so it resolves against the kernel's working directory.
segment_csv = "../data/segment_level_analysis.csv"
df = pd.read_csv(segment_csv)

In [None]:
# Sanity check: row count, number of distinct speakers, then the first rows
print(
    f"Total segments: {len(df)}",
    f"Unique speakers: {df['speaker_id'].nunique()}",
    sep="\n",
)
# Keep the frame as the cell's last expression so it renders as a rich table
df.head(n=5)

In [None]:
# Segment count per country, largest first
segment_counts = (
    df["country"]
    .value_counts()
    .sort_values(ascending=False)
)

# Bar chart drawn on an explicit Axes rather than the implicit pyplot state
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=segment_counts.index, y=segment_counts.values, ax=ax)
ax.set_title("Number of Segments per Country")
ax.set_xlabel("Country")
ax.set_ylabel("Segment Count")
ax.tick_params(axis="x", labelrotation=45)  # slanted labels for readability
fig.tight_layout()
plt.show()

In [1]:
# Average words per second (WPS) by country, fastest first
wps_by_country = df.groupby("country")["wps"].mean().sort_values(ascending=False)

plt.figure(figsize=(12, 6))
# seaborn >= 0.13 deprecates `palette` without `hue` (FutureWarning, removal
# planned for 0.14). Mapping `hue` to the same categories as `x` keeps one
# viridis colour per bar; `dodge=False` keeps each bar full-width and centred
# on its tick on seaborn releases that would otherwise dodge by hue.
ax = sns.barplot(
    x=wps_by_country.index,
    y=wps_by_country.values,
    hue=wps_by_country.index,
    palette="viridis",
    dodge=False,
)
# Some seaborn versions add a legend for the hue mapping; here it would only
# repeat the x tick labels, so drop it when present.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

plt.title("Average Words Per Second by Country")
plt.ylabel("WPS")
plt.xlabel("Country")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of pause_before per country, restricted to rows where it is > 0
has_pause_before = df["pause_before"].gt(0.0)
df_nonzero = df.loc[has_pause_before]

fig, ax = plt.subplots(figsize=(14, 6))
# showfliers=False hides outlier markers so the boxes themselves stay legible
sns.boxplot(
    data=df_nonzero,
    x="country",
    y="pause_before",
    showfliers=False,
    ax=ax,
)
ax.set(
    title="Pause Before (seconds > 0) by Country",
    xlabel="Country",
    ylabel="Pause Duration",
)
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Percent of segments with pause_before > 0, per country
# The mean of a boolean mask is the fraction of True values, so a vectorised
# groupby mean replaces the per-group Python lambda with identical results.
has_pause = df["pause_before"] > 0
pause_rate = has_pause.groupby(df["country"]).mean() * 100

ax = pause_rate.sort_values(ascending=False).plot(
    kind="bar",
    figsize=(12, 6),
    rot=45,  # match the 45-degree tick labels of the other per-country charts
    title="Percent of Segments with Pauses > 0",
)
ax.set_ylabel("% of Segments with Pauses")
ax.set_xlabel("Country")
plt.tight_layout()
plt.show()

In [None]:
# Per-country inputs for the fluency comparison:
#   avg_wps   - mean WPS over all of a country's segments
#   avg_pause - mean pause_before over only the segments where it is > 0
avg_wps = df.groupby("country")["wps"].mean()

paused_segments = df.loc[df["pause_before"] > 0]
avg_pause = paused_segments.groupby("country")["pause_before"].mean()

In [None]:
# Join the two per-country series side by side, one row per country.
# Countries missing either value (e.g. no segments with a pause) are dropped.
fluency_df = pd.concat(
    [avg_wps.rename("avg_wps"), avg_pause.rename("avg_pause")],
    axis=1,
).dropna()

In [None]:
# Scatter of average WPS against average pause, one labelled point per country
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=fluency_df,
    x="avg_wps",
    y="avg_pause",
    hue=fluency_df.index,
    s=100,
    ax=ax,
)

# Offset each label by 0.01 (data units) on both axes so it sits beside its marker
label_points = zip(fluency_df.index, fluency_df["avg_wps"], fluency_df["avg_pause"])
for country, wps, pause in label_points:
    ax.text(wps + 0.01, pause + 0.01, country, fontsize=9)

ax.set_title("Average Words Per Second vs Pause Duration (by Country)")
ax.set_xlabel("Average Words Per Second")
ax.set_ylabel("Average Pause Before (seconds)")
ax.grid(True)
fig.tight_layout()
plt.show()