In [1]:
import os
import glob
import math
import chardet, csv
import pandas as pd                      
import matplotlib.pyplot as plt         
import seaborn as sns                   
from scipy.stats import ( 
    kstest,
    shapiro,      # Shapiro–Wilk normaaljaotuse test
    levene,       # Levene’i test dispersioonide võrdlemiseks
    ttest_ind,    # kahe sõltumatu valimi t-test
    mannwhitneyu,  # Mann–Whitney U-test (mitteparameetriline)
    chisquare
)
from statsmodels.stats.multitest import multipletests
import pingouin as pg
import numpy as np
import matplotlib.pyplot as plt


In [2]:
def loe_csv_automaatselt(path):
    """Read a CSV file, auto-detecting its text encoding and field delimiter.

    The encoding is guessed by ``chardet`` from the first 10000 bytes of the
    file and the delimiter by ``csv.Sniffer`` from the first 2048 characters.

    Args:
        path: Path of the CSV file to load.

    Returns:
        pandas.DataFrame parsed with the detected encoding and delimiter.
    """
    with open(path, 'rb') as f:
        enc = chardet.detect(f.read(10000))['encoding']

    # Only a prefix of the file is inspected, so a file whose first non-ASCII
    # character appears later is reported as "ascii" and would then fail to
    # decode as a whole. ASCII is a strict subset of UTF-8, so upgrading is
    # safe. The same fallback is used when no encoding was reported (None).
    if enc is None or enc.lower() == 'ascii':
        enc = 'utf-8'

    with open(path, encoding=enc) as f:
        sample = f.read(2048)

    try:
        sep = csv.Sniffer().sniff(sample).delimiter
    except csv.Error:
        # No delimiter could be inferred (e.g. a single-column file);
        # fall back to a comma, as plot_gender_boxplots does.
        sep = ','

    return pd.read_csv(path, encoding=enc, sep=sep)


In [3]:
def analüüsi_fail(failitee, raw, norm):
    """Compare one feature between genders ("M" vs "F") for a single CSV file.

    The normalized-frequency column is tested for a gender difference with a
    t-test / Welch test when both groups pass the normality check, otherwise
    with a two-sided Mann–Whitney U test. The raw-count column is only summed
    per gender so that the caller can run a χ² test on the totals.

    Args:
        failitee: Path of the CSV file; it must contain a "gender" column.
        raw: Positional (0-based) index of the absolute-frequency column.
        norm: Positional (0-based) index of the normalized-frequency column.

    Returns:
        dict with keys "failinimi" (file base name), "test_norm" (name of the
        test used), "p_norm" (its p-value), "sum_m" / "sum_n" (raw totals for
        men / women) and "r_rb" (rank-biserial correlation, positive when the
        men's values tend to be larger).

    Raises:
        ValueError: if either gender has no non-missing normalized values.
    """
    df = loe_csv_automaatselt(failitee)
    on_mees = df["gender"] == "M"
    on_naine = df["gender"] == "F"

    # normalized frequency
    mehed_norm = df[on_mees].iloc[:, norm].dropna()
    naised_norm = df[on_naine].iloc[:, norm].dropna()
    if mehed_norm.empty or naised_norm.empty:
        raise ValueError(
            f"'{os.path.basename(failitee)}': no normalized values for one of the genders"
        )

    # KS test for normality on the z-scored samples.
    # NOTE(review): mean and SD are estimated from the same sample, which is
    # the Lilliefors setting; the plain KS p-value is conservative there.
    # Consider a Lilliefors or Shapiro–Wilk test - confirm with the analysis plan.
    z_m = (mehed_norm - mehed_norm.mean()) / mehed_norm.std(ddof=1)
    z_n = (naised_norm - naised_norm.mean()) / naised_norm.std(ddof=1)
    p_m, p_n = kstest(z_m, 'norm')[1], kstest(z_n, 'norm')[1]

    # U is needed for the effect size whichever test is reported, so it is
    # computed once up front.
    U, p_mwu = mannwhitneyu(mehed_norm, naised_norm, alternative='two-sided')

    # choose the test
    if p_m > 0.05 and p_n > 0.05:
        eq_var = (levene(mehed_norm, naised_norm)[1] > 0.05)
        _, p_norm = ttest_ind(mehed_norm, naised_norm, equal_var=eq_var)
        test_norm = "t-test" if eq_var else "Welch"
    else:
        p_norm = p_mwu
        test_norm = "Mann–Whitney U"

    # Rank-biserial correlation. U is the statistic of the first sample (men),
    # so U / (n1 * n2) is the common-language effect size and
    # r_rb = 2 * CLES - 1. The same sign convention is now used for every
    # test; previously the t-test/Welch branch reported the opposite sign.
    n1, n2 = len(mehed_norm), len(naised_norm)
    r_rb = 2 * U / (n1 * n2) - 1

    # absolute frequencies, kept for the χ² test
    sum_m = df[on_mees].iloc[:, raw].dropna().sum()
    sum_n = df[on_naine].iloc[:, raw].dropna().sum()

    return {
        "failinimi": os.path.basename(failitee),
        "test_norm": test_norm,
        "p_norm":    p_norm,
        "sum_m":     sum_m,
        "sum_n":     sum_n,
        "r_rb":      r_rb,
    }

In [4]:
# All results: run the per-file analysis on every CSV of this family.
# sorted() keeps the row order of the summary independent of the order in
# which the file system happens to list the files.
failimuster = os.path.join("pos_sections_csv/pos_sections_counter", "*.csv")
csv_failid = sorted(glob.glob(failimuster))
if not csv_failid:
    raise FileNotFoundError(f"No CSV files match '{failimuster}'")
tulemused = [analüüsi_fail(f, 5, 6) for f in csv_failid]
kokkuvõte_df = pd.DataFrame(tulemused)

# Holm–Bonferroni correction across all files of this family
rej, p_holm, _, _ = multipletests(kokkuvõte_df["p_norm"], alpha=0.05, method='holm')
kokkuvõte_df["p_norm_holm"]      = p_holm
kokkuvõte_df["signif_norm_holm"] = rej

# χ² + Cramér's V, only where signif_norm_holm is True.
# NOTE(review): chisquare() without expected frequencies tests the two raw
# totals against an equal split - this presumes comparable corpus sizes for
# both genders; confirm.
chi2_list = []
p_chi2_list = []
cramer_v_list = []
signif_chi2_list = []

for _, row in kokkuvõte_df.iterrows():
    if row["signif_norm_holm"]:
        chi2_stat, p_chi2 = chisquare([row["sum_m"], row["sum_n"]])
        N = row["sum_m"] + row["sum_n"]
        cramer_v = math.sqrt(chi2_stat / N) if N > 0 else None
        signif = (p_chi2 < 0.05)
    else:
        chi2_stat = p_chi2 = cramer_v = None
        signif = False

    chi2_list.append(chi2_stat)
    p_chi2_list.append(p_chi2)
    cramer_v_list.append(cramer_v)
    signif_chi2_list.append(signif)

kokkuvõte_df["chi2_stat"]    = chi2_list
kokkuvõte_df["p_chi2"]       = p_chi2_list
kokkuvõte_df["cramer_v"]     = cramer_v_list
kokkuvõte_df["signif_chi2"]  = signif_chi2_list

# The results folder may not exist on a fresh checkout.
os.makedirs("tulemused", exist_ok=True)
kokkuvõte_df.to_csv("tulemused/tulemused_pos.csv", index=False)
print(kokkuvõte_df)


                    failinimi       test_norm        p_norm  sum_m  sum_n  \
0    kaassõnad_by_section.csv  Mann–Whitney U  1.908414e-13   5319   4364   
1  omadussõnad_by_section.csv  Mann–Whitney U  4.409210e-16  13729  12520   
2       verbid_by_section.csv  Mann–Whitney U  1.369080e-17  69310  58287   
3     adverbid_by_section.csv  Mann–Whitney U  8.050419e-18  63572  54396   
4    nimisõnad_by_section.csv  Mann–Whitney U  5.818230e-17  61116  50246   
5     asesõnad_by_section.csv  Mann–Whitney U  3.103897e-16  65097  54149   

       r_rb   p_norm_holm  signif_norm_holm    chi2_stat         p_chi2  \
0 -0.410206  1.908414e-13              True    94.188268   2.868961e-22   
1 -0.435259  9.311691e-16              True    55.685207   8.505592e-14   
2 -0.448915  6.845398e-17              True   952.267914  4.263394e-209   
3 -0.456702  4.830251e-17              True   713.744202  3.068868e-157   
4 -0.439466  2.327292e-16              True  1061.016325  9.815288e-233   
5 -0.43150

In [5]:
def plot_gender_boxplots(
    directory: str,
    output_directory: str,
    norm_col: str = "freq_per_100_lemmas",
    pattern: str = "*.csv"
) -> None:
    """Save one male-vs-female boxplot PNG for every matching CSV file.

    Each file is read as UTF-8 (delimiter sniffed among ',' and ';', default
    ','), split on the "gender" column into "M" and "F", and the values of
    ``norm_col`` are drawn as two boxes whose whiskers span min..max. The
    median, minimum and maximum are written next to each box.

    Args:
        directory: Folder that is searched for input files.
        output_directory: Folder for the PNG files; created if missing.
        norm_col: Name of the column to plot.
        pattern: Glob pattern, relative to ``directory``, selecting the files.

    Raises:
        ValueError: if a file lacks the "gender" column or ``norm_col``.
    """
    os.makedirs(output_directory, exist_ok=True)
    file_pattern = os.path.join(directory, pattern)
    # sorted(): processing and log order do not depend on the file system.
    csv_files = sorted(glob.glob(file_pattern))
    if not csv_files:
        # A mistyped directory would otherwise pass silently with no plots.
        print(f"No files matching '{file_pattern}' - nothing to plot.")
        return

    for filepath in csv_files:
        fname = os.path.basename(filepath)

        with open(filepath, 'r', encoding='utf-8') as f:
            sample = f.read(2048)
        try:
            sep = csv.Sniffer().sniff(sample, delimiters=[',', ';']).delimiter
        except csv.Error:
            sep = ','

        df = pd.read_csv(filepath, sep=sep)

        if 'gender' not in df.columns or norm_col not in df.columns:
            raise ValueError(f"File '{fname}' missing required column 'gender' or '{norm_col}'")

        data_m = df.loc[df['gender'] == 'M', norm_col].dropna()
        data_f = df.loc[df['gender'] == 'F', norm_col].dropna()
        if data_m.empty or data_f.empty:
            print(f"Skipping '{fname}': no data for one of genders.")
            continue

        # boxplot; whis=(0, 100) stretches the whiskers to min and max
        fig, ax = plt.subplots()
        ax.boxplot(
            [data_m, data_f],
            whis=(0, 100),
            showfliers=True
        )
        # Labels are set on the axis rather than through boxplot(), because
        # the keyword was renamed (labels -> tick_labels) in Matplotlib 3.9.
        ax.set_xticklabels(['M', 'F'])
        ax.set_title(f"Распределение '{norm_col}' — {fname}")
        ax.set_ylabel(norm_col)

        # annotate min, median, max (boxes sit at x = 1 and x = 2)
        for i, series in enumerate((data_m, data_f), start=1):
            q0, q2, q4 = np.percentile(series, [0, 50, 100])
            ax.text(i, q2, f"{q2:.2f}", ha='center', va='bottom')
            ax.text(i - 0.1, q0, f"min:{q0:.2f}", ha='right', va='bottom', fontsize=8)
            ax.text(i + 0.1, q4, f"max:{q4:.2f}", ha='left', va='bottom', fontsize=8)

        fig.tight_layout()
        out_file = os.path.join(
            output_directory,
            f"{os.path.splitext(fname)[0]}_boxplot.png"
        )
        fig.savefig(out_file)
        plt.close(fig)  # release the figure; many are created in this loop
        print(f"Saved '{out_file}'")

In [6]:
# Save one M/F boxplot PNG per CSV of this directory into plots_boxplots_pos/.
plot_gender_boxplots("pos_sections_csv/pos_sections_counter", "plots_boxplots_pos")

Saved 'plots_boxplots_pos/kaassõnad_by_section_boxplot.png'
Saved 'plots_boxplots_pos/omadussõnad_by_section_boxplot.png'
Saved 'plots_boxplots_pos/verbid_by_section_boxplot.png'
Saved 'plots_boxplots_pos/adverbid_by_section_boxplot.png'
Saved 'plots_boxplots_pos/nimisõnad_by_section_boxplot.png'
Saved 'plots_boxplots_pos/asesõnad_by_section_boxplot.png'


In [7]:
# All results: run the per-file analysis on every CSV of this family.
# sorted() keeps the row order of the summary independent of the order in
# which the file system happens to list the files.
failimuster = os.path.join("tunnused_sections_csv", "*.csv")
csv_failid = sorted(glob.glob(failimuster))
if not csv_failid:
    raise FileNotFoundError(f"No CSV files match '{failimuster}'")
tulemused = [analüüsi_fail(f, 4, 5) for f in csv_failid]
kokkuvõte_df = pd.DataFrame(tulemused)

# Holm–Bonferroni correction across all files of this family
rej, p_holm, _, _ = multipletests(kokkuvõte_df["p_norm"], alpha=0.05, method='holm')
kokkuvõte_df["p_norm_holm"]      = p_holm
kokkuvõte_df["signif_norm_holm"] = rej

# χ² + Cramér's V, only where signif_norm_holm is True.
# NOTE(review): chisquare() without expected frequencies tests the two raw
# totals against an equal split - this presumes comparable corpus sizes for
# both genders; confirm.
chi2_list = []
p_chi2_list = []
cramer_v_list = []
signif_chi2_list = []

for _, row in kokkuvõte_df.iterrows():
    if row["signif_norm_holm"]:
        chi2_stat, p_chi2 = chisquare([row["sum_m"], row["sum_n"]])
        N = row["sum_m"] + row["sum_n"]
        cramer_v = math.sqrt(chi2_stat / N) if N > 0 else None
        signif = (p_chi2 < 0.05)
    else:
        chi2_stat = p_chi2 = cramer_v = None
        signif = False

    chi2_list.append(chi2_stat)
    p_chi2_list.append(p_chi2)
    cramer_v_list.append(cramer_v)
    signif_chi2_list.append(signif)

kokkuvõte_df["chi2_stat"]    = chi2_list
kokkuvõte_df["p_chi2"]       = p_chi2_list
kokkuvõte_df["cramer_v"]     = cramer_v_list
kokkuvõte_df["signif_chi2"]  = signif_chi2_list

# The results folder may not exist on a fresh checkout.
os.makedirs("tulemused", exist_ok=True)
kokkuvõte_df.to_csv("tulemused/tulemused_tunnused.csv", index=False)
print(kokkuvõte_df)


                                       failinimi       test_norm  \
0                            tingiv_sections.csv  Mann–Whitney U   
1        intensiivistajad_verbidega_sections.csv  Mann–Whitney U   
2                      umbisikuline_sections.csv  Mann–Whitney U   
3      intensiivistajad_adverbidega_sections.csv  Mann–Whitney U   
4                           eitused_sections.csv  Mann–Whitney U   
5   intensiivistajad_omadussonadega_sections.csv  Mann–Whitney U   
6             esimese_ja_teise_isik_sections.csv  Mann–Whitney U   
7                            kaskiv_sections.csv  Mann–Whitney U   
8                         partikkel_sections.csv  Mann–Whitney U   
9                       kp_markerid_sections.csv  Mann–Whitney U   
10              verbid_partiklitena_sections.csv  Mann–Whitney U   

          p_norm  sum_m  sum_n      r_rb   p_norm_holm  signif_norm_holm  \
0   1.581218e-15   2508   2456 -0.472353  1.264975e-14              True   
1   1.617264e-16   1432   1366 

In [8]:
# Save one M/F boxplot PNG per CSV of this directory into plots_boxplots_pos/.
# NOTE(review): the analysis cell for this family reads "tunnused_sections_csv",
# not "tunnused_sections_uus_csv", and the saved run of this cell printed
# nothing - confirm that this is the intended directory.
plot_gender_boxplots("tunnused_sections_uus_csv", "plots_boxplots_pos")

In [9]:
# All results: run the per-file analysis on every CSV of this family.
# sorted() keeps the row order of the summary independent of the order in
# which the file system happens to list the files.
failimuster = os.path.join("particles_per_value/sections_particles_per_value", "*.csv")
csv_failid = sorted(glob.glob(failimuster))
if not csv_failid:
    raise FileNotFoundError(f"No CSV files match '{failimuster}'")
tulemused = [analüüsi_fail(f, 4, 5) for f in csv_failid]
kokkuvõte_df = pd.DataFrame(tulemused)

# Holm–Bonferroni correction across all files of this family
rej, p_holm, _, _ = multipletests(kokkuvõte_df["p_norm"], alpha=0.05, method='holm')
kokkuvõte_df["p_norm_holm"]      = p_holm
kokkuvõte_df["signif_norm_holm"] = rej

# χ² + Cramér's V, only where signif_norm_holm is True.
# NOTE(review): chisquare() without expected frequencies tests the two raw
# totals against an equal split - this presumes comparable corpus sizes for
# both genders; confirm.
chi2_list = []
p_chi2_list = []
cramer_v_list = []
signif_chi2_list = []

for _, row in kokkuvõte_df.iterrows():
    if row["signif_norm_holm"]:
        chi2_stat, p_chi2 = chisquare([row["sum_m"], row["sum_n"]])
        N = row["sum_m"] + row["sum_n"]
        cramer_v = math.sqrt(chi2_stat / N) if N > 0 else None
        signif = (p_chi2 < 0.05)
    else:
        chi2_stat = p_chi2 = cramer_v = None
        signif = False

    chi2_list.append(chi2_stat)
    p_chi2_list.append(p_chi2)
    cramer_v_list.append(cramer_v)
    signif_chi2_list.append(signif)

kokkuvõte_df["chi2_stat"]    = chi2_list
kokkuvõte_df["p_chi2"]       = p_chi2_list
kokkuvõte_df["cramer_v"]     = cramer_v_list
kokkuvõte_df["signif_chi2"]  = signif_chi2_list

# The results folder may not exist on a fresh checkout.
os.makedirs("tulemused", exist_ok=True)
kokkuvõte_df.to_csv("tulemused/tulemused_partiklid.csv", index=False)
print(kokkuvõte_df)


            failinimi       test_norm        p_norm  sum_m  sum_n      r_rb  \
0     no_sections.csv  Mann–Whitney U  2.149289e-03    474    195 -0.260274   
1     oh_sections.csv  Mann–Whitney U  3.711261e-05     39     19 -0.727047   
2    vot_sections.csv  Mann–Whitney U  2.598828e-04     74     59 -0.495895   
3   nagu_sections.csv  Mann–Whitney U  3.559777e-04   7225   3732 -0.207371   
4   okei_sections.csv  Mann–Whitney U  6.804291e-05    225    124 -0.404635   
5    jah_sections.csv  Mann–Whitney U  1.251070e-05     67     44 -0.535802   
6  aitäh_sections.csv  Mann–Whitney U  1.283180e-07     47     74 -0.681633   
7    noh_sections.csv  Mann–Whitney U  2.505540e-07    345    230 -0.445445   

   p_norm_holm  signif_norm_holm    chi2_stat         p_chi2  cramer_v  \
0     0.002149              True   116.354260   3.975455e-27  0.417040   
1     0.000186              True     6.896552   8.636217e-03  0.344828   
2     0.000780              True     1.691729   1.933732e-01  0.11

In [10]:
# Save one M/F boxplot PNG per CSV of this directory; the PNGs go into the
# same plots_boxplots_pos/ folder that the other plotting cells write to.
plot_gender_boxplots("particles_per_value/sections_particles_per_value", "plots_boxplots_pos")

Saved 'plots_boxplots_pos/no_sections_boxplot.png'
Saved 'plots_boxplots_pos/oh_sections_boxplot.png'
Saved 'plots_boxplots_pos/vot_sections_boxplot.png'
Saved 'plots_boxplots_pos/nagu_sections_boxplot.png'
Saved 'plots_boxplots_pos/okei_sections_boxplot.png'
Saved 'plots_boxplots_pos/jah_sections_boxplot.png'
Saved 'plots_boxplots_pos/aitäh_sections_boxplot.png'
Saved 'plots_boxplots_pos/noh_sections_boxplot.png'


In [11]:
# All results: run the per-file analysis on every CSV of this family.
# sorted() keeps the row order of the summary independent of the order in
# which the file system happens to list the files.
failimuster = os.path.join("kp_markerid_per_value/sections_kp_markerid_per_value", "*.csv")
csv_failid = sorted(glob.glob(failimuster))
if not csv_failid:
    raise FileNotFoundError(f"No CSV files match '{failimuster}'")
tulemused = [analüüsi_fail(f, 4, 5) for f in csv_failid]
kokkuvõte_df = pd.DataFrame(tulemused)

# Holm–Bonferroni correction across all files of this family
rej, p_holm, _, _ = multipletests(kokkuvõte_df["p_norm"], alpha=0.05, method='holm')
kokkuvõte_df["p_norm_holm"]      = p_holm
kokkuvõte_df["signif_norm_holm"] = rej

# χ² + Cramér's V, only where signif_norm_holm is True.
# NOTE(review): chisquare() without expected frequencies tests the two raw
# totals against an equal split - this presumes comparable corpus sizes for
# both genders; confirm.
chi2_list = []
p_chi2_list = []
cramer_v_list = []
signif_chi2_list = []

for _, row in kokkuvõte_df.iterrows():
    if row["signif_norm_holm"]:
        chi2_stat, p_chi2 = chisquare([row["sum_m"], row["sum_n"]])
        N = row["sum_m"] + row["sum_n"]
        cramer_v = math.sqrt(chi2_stat / N) if N > 0 else None
        signif = (p_chi2 < 0.05)
    else:
        chi2_stat = p_chi2 = cramer_v = None
        signif = False

    chi2_list.append(chi2_stat)
    p_chi2_list.append(p_chi2)
    cramer_v_list.append(cramer_v)
    signif_chi2_list.append(signif)

kokkuvõte_df["chi2_stat"]    = chi2_list
kokkuvõte_df["p_chi2"]       = p_chi2_list
kokkuvõte_df["cramer_v"]     = cramer_v_list
kokkuvõte_df["signif_chi2"]  = signif_chi2_list

# The results folder may not exist on a fresh checkout.
os.makedirs("tulemused", exist_ok=True)
kokkuvõte_df.to_csv("tulemused/tulemused_kp.csv", index=False)
print(kokkuvõte_df)


             failinimi       test_norm        p_norm  sum_m  sum_n      r_rb  \
0  loodan_sections.csv  Mann–Whitney U  1.262790e-05     56     41 -0.582843   
1  tundub_sections.csv  Mann–Whitney U  7.603629e-07    166    125 -0.462736   
2   arvan_sections.csv  Mann–Whitney U  3.008416e-04    642    362 -0.271650   
3  ütleme_sections.csv  Mann–Whitney U  2.663921e-06    570    360 -0.363119   
4    usun_sections.csv  Mann–Whitney U  2.571712e-09     71     71 -0.694340   

    p_norm_holm  signif_norm_holm  chi2_stat        p_chi2  cramer_v  \
0  2.525579e-05              True   2.319588  1.277539e-01  0.154639   
1  3.041452e-06              True   5.776632  1.624063e-02  0.140893   
2  3.008416e-04              True  78.087649  9.856832e-19  0.278884   
3  7.991764e-06              True  47.419355  5.731398e-12  0.225806   
4  1.285856e-08              True   0.000000  1.000000e+00  0.000000   

   signif_chi2  
0        False  
1         True  
2         True  
3         True  
4

In [12]:
# Save one M/F boxplot PNG per CSV of this directory; the PNGs go into the
# same plots_boxplots_pos/ folder that the other plotting cells write to.
plot_gender_boxplots("kp_markerid_per_value/sections_kp_markerid_per_value", "plots_boxplots_pos")

Saved 'plots_boxplots_pos/loodan_sections_boxplot.png'
Saved 'plots_boxplots_pos/tundub_sections_boxplot.png'
Saved 'plots_boxplots_pos/arvan_sections_boxplot.png'
Saved 'plots_boxplots_pos/ütleme_sections_boxplot.png'
Saved 'plots_boxplots_pos/usun_sections_boxplot.png'


In [13]:
# All results: run the per-file analysis on every CSV of this family.
# sorted() keeps the row order of the summary independent of the order in
# which the file system happens to list the files.
failimuster = os.path.join("intensiivistajad_per_value/sections_intensiivistajad_per_value", "*.csv")
csv_failid = sorted(glob.glob(failimuster))
if not csv_failid:
    raise FileNotFoundError(f"No CSV files match '{failimuster}'")
tulemused = [analüüsi_fail(f, 4, 5) for f in csv_failid]
kokkuvõte_df = pd.DataFrame(tulemused)

# Holm–Bonferroni correction across all files of this family
rej, p_holm, _, _ = multipletests(kokkuvõte_df["p_norm"], alpha=0.05, method='holm')
kokkuvõte_df["p_norm_holm"]      = p_holm
kokkuvõte_df["signif_norm_holm"] = rej

# χ² + Cramér's V, only where signif_norm_holm is True.
# NOTE(review): chisquare() without expected frequencies tests the two raw
# totals against an equal split - this presumes comparable corpus sizes for
# both genders; confirm.
chi2_list = []
p_chi2_list = []
cramer_v_list = []
signif_chi2_list = []

for _, row in kokkuvõte_df.iterrows():
    if row["signif_norm_holm"]:
        chi2_stat, p_chi2 = chisquare([row["sum_m"], row["sum_n"]])
        N = row["sum_m"] + row["sum_n"]
        cramer_v = math.sqrt(chi2_stat / N) if N > 0 else None
        signif = (p_chi2 < 0.05)
    else:
        chi2_stat = p_chi2 = cramer_v = None
        signif = False

    chi2_list.append(chi2_stat)
    p_chi2_list.append(p_chi2)
    cramer_v_list.append(cramer_v)
    signif_chi2_list.append(signif)

kokkuvõte_df["chi2_stat"]    = chi2_list
kokkuvõte_df["p_chi2"]       = p_chi2_list
kokkuvõte_df["cramer_v"]     = cramer_v_list
kokkuvõte_df["signif_chi2"]  = signif_chi2_list

# The results folder may not exist on a fresh checkout.
os.makedirs("tulemused", exist_ok=True)
kokkuvõte_df.to_csv("tulemused/tulemused_intensiivistajad.csv", index=False)
print(kokkuvõte_df)

                    failinimi       test_norm        p_norm  sum_m  sum_n  \
0            nii_sections.csv  Mann–Whitney U  1.996544e-15   1033    982   
1          üldse_sections.csv  Mann–Whitney U  1.246169e-11    227    269   
2         tõesti_sections.csv  Mann–Whitney U  8.280287e-11    126    202   
3            kui_sections.csv  Mann–Whitney U  3.670859e-17   1043    981   
4          palju_sections.csv  Mann–Whitney U  3.043449e-10    318    424   
5          eriti_sections.csv  Mann–Whitney U  1.524186e-08     81     43   
6   absoluutselt_sections.csv  Mann–Whitney U  1.317769e-03     56     30   
7         natuke_sections.csv  Mann–Whitney U  1.202041e-06    174    130   
8          päris_sections.csv  Mann–Whitney U  2.187499e-08    312    248   
9         täitsa_sections.csv  Mann–Whitney U  1.344232e-04     72     39   
10       täiesti_sections.csv  Mann–Whitney U  7.004241e-08    149    171   
11          üsna_sections.csv  Mann–Whitney U  4.309849e-03     86     55   

In [14]:
# Save one M/F boxplot PNG per CSV of this directory; the PNGs go into the
# same plots_boxplots_pos/ folder that the other plotting cells write to.
plot_gender_boxplots("intensiivistajad_per_value/sections_intensiivistajad_per_value", "plots_boxplots_pos")

Saved 'plots_boxplots_pos/nii_sections_boxplot.png'
Saved 'plots_boxplots_pos/üldse_sections_boxplot.png'
Saved 'plots_boxplots_pos/tõesti_sections_boxplot.png'
Saved 'plots_boxplots_pos/kui_sections_boxplot.png'
Saved 'plots_boxplots_pos/palju_sections_boxplot.png'
Saved 'plots_boxplots_pos/eriti_sections_boxplot.png'
Saved 'plots_boxplots_pos/absoluutselt_sections_boxplot.png'
Saved 'plots_boxplots_pos/natuke_sections_boxplot.png'
Saved 'plots_boxplots_pos/päris_sections_boxplot.png'
Saved 'plots_boxplots_pos/täitsa_sections_boxplot.png'
Saved 'plots_boxplots_pos/täiesti_sections_boxplot.png'
Saved 'plots_boxplots_pos/üsna_sections_boxplot.png'
Saved 'plots_boxplots_pos/hästi_sections_boxplot.png'
Saved 'plots_boxplots_pos/suht_sections_boxplot.png'
Saved 'plots_boxplots_pos/natukene_sections_boxplot.png'
Saved 'plots_boxplots_pos/liiga_sections_boxplot.png'
Saved 'plots_boxplots_pos/suhteliselt_sections_boxplot.png'
Saved 'plots_boxplots_pos/väga_sections_boxplot.png'
