In [14]:
# generar_graficos_matriculados.py
import pandas as pd
import matplotlib.pyplot as plt

# Raw latencies in seconds, exactly as transcribed from the measurements
first = [152,109,135,103,100,129,130,102,86,127]
reuse =  [76,60,82,63,26,74,63,57,57,66]

# Long-format table: one row per (run type, tab) measurement,
# "first" rows followed by "reuse" rows, tab ids restarting at 1 for each run.
df = pd.concat(
    [
        pd.DataFrame({
            "run_type": run_name,
            "response_time_s": times,
            "pestana_id": range(1, len(times) + 1),
        })
        for run_name, times in (("first", first), ("reuse", reuse))
    ],
    ignore_index=True,
)

# Persist the table next to the figures (optional artefact)
df.to_csv("matriculados_results.csv", index=False)

# Column slices reused by both figures
first_times = df.loc[df['run_type'] == 'first', 'response_time_s']
reuse_times = df.loc[df['run_type'] == 'reuse', 'response_time_s']

# 1) Boxplot (First vs Reuse)
data_to_plot = [first_times.values, reuse_times.values]
fig, ax = plt.subplots(figsize=(6,4))
ax.boxplot(data_to_plot, labels=['First','Reuse'])
ax.set_ylabel('Tiempo (s)')
ax.set_title('Latencias - template "matriculados" (First vs Reuse)')
fig.tight_layout()
fig.savefig("matriculados_boxplot.png")
plt.close(fig)  # close so the inline backend does not render the figure as well

# 2) Overlaid histograms of both runs
fig, ax = plt.subplots(figsize=(6,4))
for times, run_label in ((first_times, 'First'), (reuse_times, 'Reuse')):
    ax.hist(times, bins=8, alpha=0.6, label=run_label)
ax.set_xlabel('Tiempo (s)')
ax.set_ylabel('Frecuencia')
ax.set_title('Histograma de latencias - "matriculados"')
ax.legend()
fig.tight_layout()
fig.savefig("matriculados_hist.png")
plt.close(fig)

print("Guardados: matriculados_boxplot.png, matriculados_hist.png, matriculados_results.csv")


Guardados: matriculados_boxplot.png, matriculados_hist.png, matriculados_results.csv


In [3]:
# generar_graficos_tasas.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- Data (seconds) as measured ---
fac_first = np.array([152,104,94,105,223])
fac_reuse = np.array([129,35,81,44,73])

tit_first = np.array([193,130,116,50,131])
tit_reuse = np.array([69,31,76,46,134])

# --- DataFrame ---
# One record per (depth, tab, run); for every tab the "first" record comes
# immediately before its "reuse" record, and all "facultad" rows precede "titulacion".
measurements = (
    ('facultad', fac_first, fac_reuse),
    ('titulacion', tit_first, tit_reuse),
)
rows = [
    {'depth': depth, 'pestana': tab, 'run': run, 'time_s': int(seconds)}
    for depth, first_times, reuse_times in measurements
    for tab, pair in enumerate(zip(first_times, reuse_times), start=1)
    for run, seconds in zip(('first', 'reuse'), pair)
]
df = pd.DataFrame(rows)
df.to_csv("tasas_results.csv", index=False)

# --- Funciones estadísticas simples (paired bootstrap) ---
def paired_bootstrap_pvalue(first, reuse, B=10000, seed=12345):
    """Bootstrap the mean of the paired differences ``first - reuse``.

    Parameters
    ----------
    first, reuse : array-like of numbers
        Paired measurements; element ``i`` of both must refer to the same unit.
        Lists and arrays are both accepted.
    B : int
        Number of bootstrap resamples.
    seed : int
        Seed for ``numpy.random.default_rng`` so results are reproducible.

    Returns
    -------
    dict with keys
        ``mean_diff``   mean of the differences,
        ``sd_diff``     sample standard deviation (ddof=1; NaN for fewer than 2 pairs),
        ``ci``          (2.5th, 97.5th) percentiles of the bootstrap means,
        ``p_bootstrap`` two-sided percentile p-value, capped at 1.0,
        ``boots``       array with the ``B`` bootstrap means.
        With no pairs at all every statistic is NaN.

    Raises
    ------
    ValueError
        If ``first`` and ``reuse`` do not have the same shape.
    """
    first = np.asarray(first, dtype=float)
    reuse = np.asarray(reuse, dtype=float)
    if first.shape != reuse.shape:
        # Broadcasting a length-1 array against a longer one would silently
        # produce "pairs" that were never measured together.
        raise ValueError(
            f"first and reuse must have the same shape, got {first.shape} and {reuse.shape}"
        )

    dif = first - reuse
    n = len(dif)
    if n == 0:
        nan = float('nan')
        return {'mean_diff': nan, 'sd_diff': nan, 'ci': (nan, nan),
                'p_bootstrap': nan, 'boots': np.full(B, np.nan)}

    rng = np.random.default_rng(seed)
    # One rng.choice call per resample: keeps the random stream (and therefore
    # previously reported numbers) unchanged for a given seed.
    boots = np.array([np.mean(rng.choice(dif, size=n, replace=True)) for _ in range(B)])
    ci_low, ci_high = np.percentile(boots, [2.5, 97.5])

    # Bootstrap means equal to 0 are counted in both tails, so the doubled
    # minimum can exceed 1 (e.g. identical samples give 2.0); cap it.
    # A value of 0.0 means "no resample crossed zero", i.e. p < 1/B.
    p_two = min(1.0, 2 * min(np.mean(boots <= 0), np.mean(boots >= 0)))

    sd_diff = np.std(dif, ddof=1) if n > 1 else np.nan
    return {'mean_diff': np.mean(dif), 'sd_diff': sd_diff,
            'ci':(ci_low, ci_high), 'p_bootstrap': p_two, 'boots':boots}

# Paired bootstrap statistics for each depth level, echoed to the cell output
fac_stats = paired_bootstrap_pvalue(fac_first, fac_reuse)
tit_stats = paired_bootstrap_pvalue(tit_first, tit_reuse)

for depth_name, depth_stats in (("Facultad", fac_stats), ("Titulacion", tit_stats)):
    print(f"{depth_name} stats:", depth_stats)

# Series shown in both figures, in plotting order
data = [fac_first, fac_reuse, tit_first, tit_reuse]
labels = ['Fac-First','Fac-Reuse','Tit-First','Tit-Reuse']

# --- 1) Combined boxplot ---
fig, ax = plt.subplots(figsize=(8,4))
ax.boxplot(data, labels=labels)
ax.set_ylabel('Tiempo (s)')
ax.set_title('Latencias - template tasas (Facultad / Titulación)')
fig.tight_layout()
fig.savefig('tasas_boxplot.png')
plt.close(fig)

# --- 2) Bar chart: mean with sample standard deviation (ddof=1) as error bar ---
means = [series.mean() for series in data]
stds = [series.std(ddof=1) for series in data]
x = np.arange(len(means))
fig, ax = plt.subplots(figsize=(8,4))
ax.bar(x, means, yerr=stds, capsize=6)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_ylabel('Media tiempo (s)')
ax.set_title('Media y desviación (tasas)')
fig.tight_layout()
fig.savefig('tasas_means.png')
plt.close(fig)

# --- 3) and 4) Overlaid First/Reuse histograms, one figure per depth level ---
hist_specs = (
    # (legend prefix, first-run times, reuse-run times, title, output file)
    ('Fac', fac_first, fac_reuse, 'Histograma - Facultad', 'tasas_hist_facultad.png'),
    ('Tit', tit_first, tit_reuse, 'Histograma - Titulación', 'tasas_hist_titulacion.png'),
)
for prefix, first_times, reuse_times, title, out_png in hist_specs:
    fig, ax = plt.subplots(figsize=(7,4))
    ax.hist(first_times, bins=6, alpha=0.6, label=f'{prefix}-First')
    ax.hist(reuse_times, bins=6, alpha=0.6, label=f'{prefix}-Reuse')
    ax.set_xlabel('Tiempo (s)')
    ax.set_ylabel('Frecuencia')
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_png)
    plt.close(fig)

# --- 5) and 6) Paired plots: First vs Reuse time for each tab, one figure per depth ---
paired_specs = (
    # (first-run times, reuse-run times, title, output file)
    (fac_first, fac_reuse, 'Paired times - Facultad', 'tasas_paired_facultad.png'),
    (tit_first, tit_reuse, 'Paired times - Titulación', 'tasas_paired_titulacion.png'),
)
for first_times, reuse_times, title, out_png in paired_specs:
    p = np.arange(1, len(first_times)+1)  # 1-based tab ids on the x axis
    fig, ax = plt.subplots(figsize=(6,4))
    ax.plot(p, first_times, 'o-', label='First')
    ax.plot(p, reuse_times, 's--', label='Reuse')
    ax.set_xlabel('Pestaña ID')
    ax.set_ylabel('Tiempo (s)')
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_png)
    plt.close(fig)

print("Saved plots and CSV. Inspect 'tasas_results.csv' and the PNG files.")


Facultad stats: {'mean_diff': 63.2, 'sd_diff': 54.10360431616363, 'ci': (26.2, 108.4), 'p_bootstrap': 0.0, 'boots': array([65.8, 80.4, 62.8, ..., 35.8, 88.6, 35.8])}
Titulacion stats: {'mean_diff': 52.8, 'sd_diff': 56.70714240728411, 'ci': (8.4, 97.2), 'p_bootstrap': 0.0146, 'boots': array([61. , 21.6, 55. , ..., 61.4, 27.4, 61.4])}
Saved plots and CSV. Inspect 'tasas_results.csv' and the PNG files.


In [11]:
# generar_graficos_sin_template.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data (seconds)
simple_first = np.array([28,33,26,22,29])
simple_reuse = np.array([23,15,17,16,33])
multiple_first = np.array([73,67,47,50,37])
multiple_reuse = np.array([31,34,18,49,20])
doble_first = np.array([64,96,70,61,48])
doble_reuse = np.array([25,18,47,32,26])

# Series and tick labels shared by the two summary figures, in plotting order
data = [simple_first, simple_reuse, multiple_first, multiple_reuse, doble_first, doble_reuse]
labels = ['Simple-First','Simple-Reuse','Multiple-First','Multiple-Reuse','Doble-First','Doble-Reuse']

# 1) Combined boxplot
fig, ax = plt.subplots(figsize=(10,5))
ax.boxplot(data, labels=labels)
ax.set_ylabel('Tiempo (s)')
ax.set_title('Latencias - consultas sin template (First vs Reuse)')
plt.setp(ax.get_xticklabels(), rotation=30)  # tilt the existing tick labels
fig.tight_layout()
fig.savefig('sin_template_boxplot.png')
plt.close(fig)

# 2) Bar chart: mean with sample standard deviation (ddof=1) as error bar
means = [series.mean() for series in data]
stds = [series.std(ddof=1) for series in data]
x = range(len(means))
fig, ax = plt.subplots(figsize=(10,4))
ax.bar(x, means, yerr=stds, capsize=6)
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=30)
ax.set_ylabel('Media tiempo (s)')
ax.set_title('Media y desviación estándar - consultas sin template')
fig.tight_layout()
fig.savefig('sin_template_means.png')
plt.close(fig)

# 3) Paired plots por tipo
def paired_plot(first, reuse, name):
    """Plot First and Reuse times against a 1-based tab id and save the figure.

    ``first`` and ``reuse`` are sequences of times in seconds; ``name`` is used
    in the title and in the output file name ``sin_paired_<name>.png``.
    The figure is closed after saving.
    """
    tab_ids = np.arange(1, len(first)+1)
    fig, ax = plt.subplots(figsize=(6,3))
    for times, fmt, run_label in ((first, 'o-', 'First'), (reuse, 's--', 'Reuse')):
        ax.plot(tab_ids, times, fmt, label=run_label)
    ax.set_xlabel('Pestaña ID')
    ax.set_ylabel('Tiempo (s)')
    ax.set_title(f'Tiempos Pareados - {name}')
    ax.legend()
    fig.tight_layout()
    fig.savefig(f'sin_paired_{name}.png')
    plt.close(fig)

# One paired figure per query type
for query_type, first_times, reuse_times in (
    ('simple', simple_first, simple_reuse),
    ('multiple', multiple_first, multiple_reuse),
    ('doble', doble_first, doble_reuse),
):
    paired_plot(first_times, reuse_times, query_type)

# 4) Histograms por tipo
def hist_pair(first, reuse, name):
    """Overlay 6-bin histograms of First and Reuse times and save the figure.

    ``first`` and ``reuse`` are sequences of times in seconds; ``name`` is used
    in the title and in the output file name ``sin_hist_<name>.png``.
    The figure is closed after saving.
    """
    fig, ax = plt.subplots(figsize=(6,3))
    for times, run_label in ((first, 'First'), (reuse, 'Reuse')):
        ax.hist(times, bins=6, alpha=0.6, label=run_label)
    ax.set_xlabel('Tiempo (s)')
    ax.set_ylabel('Frecuencia')
    ax.set_title(f'Histograma - {name}')
    ax.legend()
    fig.tight_layout()
    fig.savefig(f'sin_hist_{name}.png')
    plt.close(fig)

# One histogram figure per query type
for query_type, first_times, reuse_times in (
    ('simple', simple_first, simple_reuse),
    ('multiple', multiple_first, multiple_reuse),
    ('doble', doble_first, doble_reuse),
):
    hist_pair(first_times, reuse_times, query_type)

print("Saved PNGs: sin_template_boxplot.png, sin_template_means.png, sin_paired_*.png, sin_hist_*.png")


Saved PNGs: sin_template_boxplot.png, sin_template_means.png, sin_paired_*.png, sin_hist_*.png


In [10]:
# generar_graficos_comparativa.py
"""
Genera resumen estadístico y gráficos comparativos (boxplot, bar chart con error bars
y paired plots) a partir de un CSV con columnas:
  group,pestana,run,time_s

Ejecutar:
  python generar_graficos_comparativa.py
Ajusta CSV_PATH si tu fichero tiene otro nombre/ruta.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

CSV_PATH = Path("all_results_combined.csv")   # ADJUST if the CSV lives elsewhere
OUT_DIR  = Path(".")                          # output folder (adjust as needed)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Read CSV ---
df = pd.read_csv(CSV_PATH)

# Normalise names (optional)
# df['group'] = df['group'].astype(str)

# --- Statistics per (group, run) ---
# The same grouped view feeds both the basic aggregates and the percentiles,
# so the percentile rows line up with the aggregate rows.
times_by_group_run = df.groupby(['group','run'])['time_s']
summary = times_by_group_run.agg(['count','mean','median','std','min','max']).reset_index()
for column, quantile in (('p90', 0.90), ('p95', 0.95)):
    summary[column] = times_by_group_run.quantile(quantile).values
summary.to_csv(OUT_DIR/"all_groups_summary.csv", index=False)
print("Saved: all_groups_summary.csv")

# --- Speedups: mean "first" time divided by mean "reuse" time, per group ---
groups = sorted(df['group'].unique())
speedups = []
for g in groups:
    in_group = df['group'] == g
    mean_first = df.loc[in_group & (df['run'] == 'first'), 'time_s'].mean()
    mean_reuse = df.loc[in_group & (df['run'] == 'reuse'), 'time_s'].mean()
    # Skip groups missing either run (mean is NaN) or with a non-positive reuse mean
    if np.isnan(mean_first) or np.isnan(mean_reuse) or mean_reuse <= 0:
        continue
    speedups.append(dict(group=g, mean_first=mean_first, mean_reuse=mean_reuse,
                         speedup=mean_first / mean_reuse))
speed_df = pd.DataFrame(speedups)
speed_df.to_csv(OUT_DIR/"speedups.csv", index=False)
print("Saved: speedups.csv")
print(speed_df)

# --- Function: paired Cohen's d and bootstrap CI for mean diff ---
def paired_stats(first, reuse, B=10000, seed=12345):
    """Paired effect size and bootstrap inference for ``first - reuse``.

    Parameters
    ----------
    first, reuse : array-like of numbers
        Paired measurements; element ``i`` of both must refer to the same unit.
    B : int
        Number of bootstrap resamples of the differences.
    seed : int
        Seed for ``numpy.random.default_rng`` so results are reproducible.

    Returns
    -------
    dict with keys
        ``mean_diff`` mean of the differences,
        ``sd_diff``   sample standard deviation (ddof=1; NaN for fewer than 2 pairs),
        ``cohens_d``  ``mean_diff / sd_diff`` (NaN when ``sd_diff`` is 0 or NaN),
        ``ci95``      (2.5th, 97.5th) percentiles of the bootstrap means,
        ``p_boot``    two-sided percentile p-value, capped at 1.0.
        With no pairs at all every statistic is NaN.

    Raises
    ------
    ValueError
        If ``first`` and ``reuse`` do not have the same shape.
    """
    first = np.asarray(first, dtype=float)
    reuse = np.asarray(reuse, dtype=float)
    if first.shape != reuse.shape:
        # Broadcasting a length-1 array against a longer one would silently
        # produce "pairs" that were never measured together.
        raise ValueError(
            f"first and reuse must have the same shape, got {first.shape} and {reuse.shape}"
        )

    dif = first - reuse
    n = len(dif)
    if n == 0:
        nan = float('nan')
        return {'mean_diff': nan, 'sd_diff': nan, 'cohens_d': nan,
                'ci95': (nan, nan), 'p_boot': nan}

    mean_diff = np.mean(dif)
    # ddof=1 is undefined for a single pair; report NaN without a RuntimeWarning
    sd_diff = np.std(dif, ddof=1) if n > 1 else np.nan
    cohens_d = mean_diff / sd_diff if sd_diff != 0 else np.nan

    rng = np.random.default_rng(seed)
    # One rng.choice call per resample: keeps the random stream (and therefore
    # previously reported numbers) unchanged for a given seed.
    boots = np.array([np.mean(rng.choice(dif, size=n, replace=True)) for _ in range(B)])
    ci_low, ci_high = np.percentile(boots, [2.5, 97.5])

    # Bootstrap means equal to 0 are counted in both tails, so the doubled
    # minimum can exceed 1 (e.g. identical samples give 2.0); cap it.
    # A value of 0.0 means "no resample crossed zero", i.e. p < 1/B.
    p_two = min(1.0, 2 * min(np.mean(boots <= 0), np.mean(boots >= 0)))
    return {'mean_diff':mean_diff, 'sd_diff':sd_diff, 'cohens_d':cohens_d, 'ci95':(ci_low,ci_high), 'p_boot':p_two}

# Detailed statistics per group (one output row per group)
detailed = []
for g in groups:
    group_rows = df[df['group']==g]
    # Sort both runs by tab id so that element i of each array belongs to the
    # same tab. The paired statistics (sd_diff, Cohen's d, CI, p) depend on this
    # alignment; without the sort they would silently depend on CSV row order.
    # The per-run means/medians/percentiles are unaffected by the ordering.
    arr_first = group_rows[group_rows['run']=='first'].sort_values('pestana', kind='stable')['time_s'].values
    arr_reuse = group_rows[group_rows['run']=='reuse'].sort_values('pestana', kind='stable')['time_s'].values
    if len(arr_first)>0 and len(arr_first)==len(arr_reuse):
        ps = paired_stats(arr_first, arr_reuse)
    else:
        # Pairing is impossible (missing run or unequal counts): report NaN
        ps = {'mean_diff': np.nan, 'sd_diff': np.nan, 'cohens_d': np.nan, 'ci95':(np.nan,np.nan), 'p_boot':np.nan}
    detailed.append({'group':g,
                     'n_first': len(arr_first),
                     'n_reuse': len(arr_reuse),
                     'mean_first': float(np.mean(arr_first)) if len(arr_first)>0 else np.nan,
                     'mean_reuse': float(np.mean(arr_reuse)) if len(arr_reuse)>0 else np.nan,
                     'median_first': float(np.median(arr_first)) if len(arr_first)>0 else np.nan,
                     'median_reuse': float(np.median(arr_reuse)) if len(arr_reuse)>0 else np.nan,
                     # sample std (ddof=1) needs at least two observations
                     'std_first': float(np.std(arr_first, ddof=1)) if len(arr_first)>1 else np.nan,
                     'std_reuse': float(np.std(arr_reuse, ddof=1)) if len(arr_reuse)>1 else np.nan,
                     'p90_first': float(np.percentile(arr_first,90)) if len(arr_first)>0 else np.nan,
                     'p95_first': float(np.percentile(arr_first,95)) if len(arr_first)>0 else np.nan,
                     'mean_diff': ps['mean_diff'],
                     'sd_diff': ps['sd_diff'],
                     'cohens_d_paired': ps['cohens_d'],
                     'ci95_low': ps['ci95'][0],
                     'ci95_high': ps['ci95'][1],
                     'p_boot': ps['p_boot']})
detailed_df = pd.DataFrame(detailed)
detailed_df.to_csv(OUT_DIR/"detailed_stats.csv", index=False)
print("Saved: detailed_stats.csv")

# --- Figure 1: comparative boxplot (First vs Reuse for every group) ---
# Candidate series in plotting order: for each group, "first" then "reuse".
# Each label is "<group>\n<run>"; combinations with no rows are dropped.
candidate_series = [
    (f"{g}\n{r}", df[(df['group']==g)&(df['run']==r)]['time_s'].values)
    for g in groups
    for r in ['first','reuse']
]
non_empty_series = [(label, vals) for label, vals in candidate_series if len(vals)>0]
labels = [label for label, _ in non_empty_series]
data_for_plot = [vals for _, vals in non_empty_series]

fig, ax = plt.subplots(figsize=(14,6))
ax.boxplot(data_for_plot, labels=labels)
ax.set_ylabel("Tiempo (s)")
ax.set_title("Comparativa de latencias por grupo (First vs Reuse)")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')  # tilt the existing tick labels
fig.tight_layout()
fig.savefig(OUT_DIR/"all_groups_boxplot.png", dpi=150)
plt.close(fig)
print("Saved: all_groups_boxplot.png")

# --- Figure 2: bar chart of means ± std with a speedup annotation per group ---
means = [np.mean(vals) for vals in data_for_plot]
# Sample std (ddof=1) is undefined for a single observation: use NaN (no error bar)
stds  = [np.std(vals, ddof=1) if len(vals)>1 else np.nan for vals in data_for_plot]
x = np.arange(len(means))
plt.figure(figsize=(14,5))
bars = plt.bar(x, means, yerr=stds, capsize=6)
plt.xticks(x, labels, rotation=45, ha='right')
plt.ylabel("Media tiempo (s)")
plt.title("Media de latencias por grupo y condición (First vs Reuse)")

# Annotate the speedup above each (first, reuse) pair of bars.
# Bars are located through their "<group>\n<run>" label rather than assuming
# positions 2*i and 2*i+1: a group with only one run contributes a single bar,
# which would otherwise shift every later annotation onto the wrong bars.
bar_index = {label: idx for idx, label in enumerate(labels)}
for g in groups:
    idx_first = bar_index.get(f"{g}\nfirst")
    idx_reuse = bar_index.get(f"{g}\nreuse")
    if idx_first is None or idx_reuse is None or not means[idx_reuse] > 0:
        continue
    sp = means[idx_first] / means[idx_reuse]
    # place annotation between the two bars
    x_pos = (x[idx_first] + x[idx_reuse]) / 2
    # A NaN std (single sample) counts as 0 so the text position stays finite
    err = max(np.nan_to_num(stds[idx_first]), np.nan_to_num(stds[idx_reuse]))
    y_pos = max(means[idx_first], means[idx_reuse]) + err + 5
    plt.text(x_pos, y_pos, f"speedup {sp:.2f}×", ha='center', va='bottom', fontsize=9)
plt.tight_layout()
plt.savefig(OUT_DIR/"all_groups_means.png", dpi=150)
plt.close()
print("Saved: all_groups_means.png")

# --- Figure 3: paired plot per group (optional) ---
for g in groups:
    # Order each run by tab id so that position i refers to the same tab in both
    arr_first = df.loc[(df['group']==g)&(df['run']=='first')].sort_values('pestana')['time_s'].to_numpy()
    arr_reuse = df.loc[(df['group']==g)&(df['run']=='reuse')].sort_values('pestana')['time_s'].to_numpy()
    # Only groups with a non-empty, equally sized pair of runs can be drawn
    if len(arr_first)==0 or len(arr_first)!=len(arr_reuse):
        continue
    p = np.arange(1, len(arr_first)+1)
    fig, ax = plt.subplots(figsize=(6,3))
    ax.plot(p, arr_first, 'o-', label='First')
    ax.plot(p, arr_reuse, 's--', label='Reuse')
    ax.set_xlabel('Pestaña ID')
    ax.set_ylabel('Tiempo (s)')
    ax.set_title(f'Paired times - {g}')
    ax.legend()
    fig.tight_layout()
    fig.savefig(OUT_DIR/f"paired_{g}.png", dpi=150)
    plt.close(fig)
    print(f"Saved: paired_{g}.png")

print("FIN. Revisa los CSVs y PNGs generados en la carpeta de salida.")


Saved: all_groups_summary.csv
Saved: speedups.csv
              group  mean_first  mean_reuse   speedup
0      matriculados       117.3        62.4  1.879808
1        sin_double        67.8        29.6  2.290541
2      sin_multiple        54.8        30.4  1.802632
3        sin_simple        27.6        20.8  1.326923
4    tasas_facultad       135.6        72.4  1.872928
5  tasas_titulacion       124.0        71.2  1.741573
Saved: detailed_stats.csv
Saved: all_groups_boxplot.png
Saved: all_groups_means.png
Saved: paired_matriculados.png
Saved: paired_sin_double.png
Saved: paired_sin_multiple.png
Saved: paired_sin_simple.png
Saved: paired_tasas_facultad.png
Saved: paired_tasas_titulacion.png
FIN. Revisa los CSVs y PNGs generados en la carpeta de salida.
