In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Standard relative path to the raw-data folder. The trailing slash matters:
# file names are appended to this string directly.
path = "../data/raw/"

try:
    # 1. Load the four raw files.
    df_demo = pd.read_csv(f"{path}df_final_demo.txt")
    df_experiment_clients = pd.read_csv(f"{path}df_final_experiment_clients.txt")
    df_web_data_pt_1 = pd.read_csv(f"{path}df_final_web_data_pt_1.txt")
    df_web_data_pt_2 = pd.read_csv(f"{path}df_final_web_data_pt_2.txt")

    # 2. Stack both web-data parts. ignore_index=True rebuilds a unique
    #    0..n-1 row index instead of repeating each part's own labels,
    #    matching the other concatenations of these two files in this notebook.
    df_web = pd.concat([df_web_data_pt_1, df_web_data_pt_2], ignore_index=True)

    print("✅ Conexión establecida: Los archivos se leen correctamente desde data/raw/")
    print(f"Total de registros cargados en web_data: {len(df_web)}")

except FileNotFoundError as e:
    print(f"❌ Error: No se encontraron los archivos en {path}")
    print("Asegúrate de haber creado la carpeta data/raw dentro de lannister_project_github")
    # Report which file was actually missing instead of discarding the exception.
    print(f"Detalle: {e}")

In [None]:
# Print the column index of every loaded table, in load order.
for frame in (df_demo, df_experiment_clients, df_web_data_pt_1, df_web_data_pt_2):
    print(frame.columns)

In [None]:
# Column names of the demographics table.
print(df_demo.columns)

# First rows. Printed explicitly: a bare expression that is not the last
# statement of a cell is evaluated and then discarded, so nothing was shown.
print(df_demo.head(10))

# Basic information: dtypes and non-null counts (info() prints its own report).
df_demo.info()


In [None]:
import pandas as pd

# Reload the demographics table, stating the comma delimiter explicitly.
demo_file = f"{path}df_final_demo.txt"
df_demo = pd.read_csv(demo_file, sep=",")

# List every column name as a plain Python list.
print("Columnas disponibles en df_demo:")
print(list(df_demo.columns))


In [None]:
# Remove the column cap so wide frames are rendered without truncation.
pd.options.display.max_columns = None
# First ten rows of the demographics table (cell output).
df_demo.head(n=10)


In [None]:
# Dtypes and null counts (info() prints its own report).
df_demo.info()

# Summary statistics of the numeric columns. Printed explicitly: a bare
# expression in the middle of a cell is evaluated and then discarded.
print(df_demo.describe())

# Ten most frequent values (missing values included) of each object column.
categorical_cols = df_demo.select_dtypes(include='object').columns
for col in categorical_cols:
    print(f"\nUnique values in {col}:")
    print(df_demo[col].value_counts(dropna=False).head(10))


In [None]:
# Show the columns of both web-data parts. Only the last expression of a cell
# is displayed automatically, so the first one has to be printed explicitly.
print(df_web_data_pt_1.columns)
df_web_data_pt_2.columns


In [None]:
# Stack the two web-data parts row-wise and renumber the rows 0..n-1.
web_parts = [df_web_data_pt_1, df_web_data_pt_2]
df_web = pd.concat(web_parts, ignore_index=True)


In [None]:
# Number of missing values in each column of the combined web data.
df_web.isnull().sum(axis=0)

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrence not counted).
df_web.duplicated(keep='first').sum()

In [None]:
# Keep the first occurrence of every fully duplicated row.
# NOTE(review): the cells visible after this one keep reading df_web rather
# than df_web_clean - confirm which frame the analysis is meant to use.
df_web_clean = df_web[~df_web.duplicated()]

In [None]:
# Number of logged web events per client, most active clients first.
client_counts = df_web.client_id.value_counts()

# Ten clients with the most events.
print("Top 10 clientes con más interacciones:")
print(client_counts.iloc[:10])


In [None]:
# One row per client that appears in the web logs.
df_clients_web = df_web.loc[:, ['client_id']].drop_duplicates()

# Attach the demographic variables. A left join keeps every web client, even
# when no demographic row matches (those columns are then NaN).
df_clients = pd.merge(df_clients_web, df_demo, how='left', on='client_id')

# Preview the first rows (cell output).
df_clients.head(5)


In [None]:
# Mean age and age distribution.
print("Mean Age:", df_clients['clnt_age'].mean())

# Age histogram.
plt.figure(figsize=(10,5))
sns.histplot(df_clients['clnt_age'], bins=20, kde=True)
plt.title("Distribution by client's ages")
plt.xlabel("Age")
plt.ylabel("Num of clientes")
plt.show()

# Age groups. right=False makes every bin left-closed, [low, high), so the
# edges agree with the labels: a 25-year-old falls in '25-34', not in '<25'.
# The last edge is open-ended so clients older than 100 still land in '65+'
# instead of becoming NaN.
bins = [0, 25, 35, 50, 65, np.inf]
labels = ['<25', '25-34', '35-49', '50-64', '65+']
df_clients['age_group'] = pd.cut(df_clients['clnt_age'], bins=bins, labels=labels, right=False)

# Number of clients per age group.
print(df_clients['age_group'].value_counts())


In [None]:
def _customer_type(tenure_years):
    """Label a tenure in years as 'Nuevo' (< 1) or 'Recurrente'; keep missing as NaN."""
    # NaN < 1 evaluates to False, so without this guard a client with unknown
    # tenure would be silently counted as 'Recurrente'.
    if pd.isna(tenure_years):
        return np.nan
    return 'Nuevo' if tenure_years < 1 else 'Recurrente'


df_clients['customer_type'] = df_clients['clnt_tenure_yr'].apply(_customer_type)
# value_counts() skips NaN, so clients with unknown tenure are not counted here.
print(df_clients['customer_type'].value_counts())

# Histogram of tenure in years.
plt.figure(figsize=(10,5))
sns.histplot(df_clients['clnt_tenure_yr'], bins=20)
plt.title("Customer tenure (years)")
plt.xlabel("Years since registration")
plt.ylabel("Number of clients")
plt.show()



In [None]:
# Frequency of every gender code among the web clients.
gender_counts = df_clients['gendr'].value_counts()
print(gender_counts)

# Bar chart of the same counts.
ax = sns.countplot(x='gendr', data=df_clients)
ax.set_title("Customer gender distribution")
plt.show()


In [None]:
# One histogram per activity metric: (column, number of bins, title, x-axis label).
histogram_specs = [
    ('num_accts', 10, "Number of accounts per customer", "Number of accounts"),
    ('logons_6_mnth', 20, "Logons last 6 months", "Number of logons"),
]

for column, n_bins, title, x_label in histogram_specs:
    sns.histplot(df_clients[column], bins=n_bins)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel("Number of clients")
    plt.show()


In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import chi2_contingency

# Team's official loading function, copied here to fix the Import Error.
def load_and_concat_data(demo_path, web_pt1_path, web_pt2_path, exp_path):
    """Read the four comma-separated raw files and stack the two web-data parts.

    Parameters
    ----------
    demo_path, web_pt1_path, web_pt2_path, exp_path
        Locations of the demographics file, the two web-data parts and the
        experiment-assignment file, passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(demographics, web, experiment)`` DataFrames; ``web`` holds the rows
        of part 1 followed by those of part 2 under a fresh 0..n-1 index.
    """
    def read_table(csv_path):
        return pd.read_csv(csv_path, sep=",")

    demographics = read_table(demo_path)
    web_parts = [read_table(web_pt1_path), read_table(web_pt2_path)]
    experiment = read_table(exp_path)

    web = pd.concat(web_parts, ignore_index=True)
    return demographics, web, experiment

print("Functions loaded locally. You are ready to go!")

In [None]:
import os
import pandas as pd
from scipy import stats
from scipy.stats import chi2_contingency

# 1. Raw-data folder. The absolute path only exists on the original author's
#    machine, so fall back to the repository-relative folder (the same
#    "../data/raw" location the first loading cell reads from) anywhere else.
raw_data_path = "C:/users/marta/ironhack/week5/lannister_project_github/vanguard-ab-test/data/raw/"
if not os.path.isdir(raw_data_path):
    raw_data_path = os.path.join("..", "data", "raw")

# 2. Full path of every file.
path_demo = os.path.join(raw_data_path, "df_final_demo.txt")
path_web1 = os.path.join(raw_data_path, "df_final_web_data_pt_1.txt")
path_web2 = os.path.join(raw_data_path, "df_final_web_data_pt_2.txt")
path_exp  = os.path.join(raw_data_path, "df_final_experiment_clients.txt")

# 3. Load the data with the function defined earlier. Only loading problems
#    are caught here: an error raised by the statistical tests below is not a
#    loading error and must surface with its own traceback.
try:
    df_demo, df_web, df_exp = load_and_concat_data(path_demo, path_web1, path_web2, path_exp)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    print(f"Error al cargar: {e}")
else:
    print(f"¡Archivos cargados con éxito desde {raw_data_path}!")

    # Join the demographics with the experiment arm of each client.
    combined_df_2 = df_demo.merge(df_exp[['client_id', 'Variation']], on='client_id', how='inner')

    # --- TEST 1: MEAN AGE (t-test) ---
    age_control = combined_df_2[combined_df_2['Variation'] == 'Control']['clnt_age'].dropna()
    age_test = combined_df_2[combined_df_2['Variation'] == 'Test']['clnt_age'].dropna()
    t_stat, p_val_age = stats.ttest_ind(age_control, age_test)

    print(f"\n--- Análisis de Edad ---")
    print(f"Media Control: {age_control.mean():.2f}")
    print(f"Media Test: {age_test.mean():.2f}")
    print(f"P-valor: {p_val_age:.4f}")

    # --- TEST 2: GENDER (chi-square test of independence) ---
    gender_tab = pd.crosstab(combined_df_2['gendr'], combined_df_2['Variation'])
    chi2, p_val_gender, dof, ex = chi2_contingency(gender_tab)

    print(f"\n--- Análisis de Género ---")
    print(gender_tab)
    print(f"P-valor: {p_val_gender:.4f}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# 1. Age samples of each experiment arm (missing ages dropped).
age_control = combined_df_2[combined_df_2['Variation'] == 'Control']['clnt_age'].dropna()
age_test = combined_df_2[combined_df_2['Variation'] == 'Test']['clnt_age'].dropna()

# 2. Independent two-sample t-test on mean age.
t_stat, p_val_age = stats.ttest_ind(age_control, age_test)

# 3. Box plot for the presentation.
plt.figure(figsize=(10, 6))
sns.boxplot(x='Variation', y='clnt_age', data=combined_df_2, palette='Set2')
plt.title('Distribution of Age by Group', fontsize=15)
plt.xlabel('Group (Variation)', fontsize=12)
plt.ylabel('Client Age', fontsize=12)

# Annotate the plot with the test result.
plt.text(0.5, 90, f'P-value: {p_val_age:.4f}', ha='center', fontsize=12, color='red', fontweight='bold')
plt.show()

# 4. Written conclusion.
print(f"--- Client Age Hypothesis Test ---")
print(f"Control Average Age: {age_control.mean():.2f}")
print(f"Test Average Age: {age_test.mean():.2f}")
print(f"P-value: {p_val_age:.4f}")

if p_val_age < 0.05:
    print("\nCONCLUSION: There IS a statistically significant difference in age between the groups.")
    # Name the younger arm from the data instead of assuming it is the Test group.
    younger_group = "Test" if age_test.mean() < age_control.mean() else "Control"
    print(f"The {younger_group} group is younger on average.")
else:
    print("\nCONCLUSION: There is NO significant difference in age. The groups are well-balanced.")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# 1. Hypotheses
# H0: mean client age is the same in both groups (Control and Test).
# H1: mean client age differs between the two groups.

# 2. Age samples of each experiment arm (missing ages dropped).
age_control = combined_df_2[combined_df_2['Variation'] == 'Control']['clnt_age'].dropna()
age_test = combined_df_2[combined_df_2['Variation'] == 'Test']['clnt_age'].dropna()

# 3. Independent two-sample t-test.
t_stat, p_val_age = stats.ttest_ind(age_control, age_test)
lannister_colors = ["#fbdc6a", "#930416"]
# 4. Box plot.
plt.figure(figsize=(10, 6))
sns.boxplot(x='Variation', y='clnt_age', data=combined_df_2, palette=lannister_colors)
plt.title('Comparison of Client Age by Variation Group', fontsize=14)
plt.xlabel('Group', fontsize=12)
plt.ylabel('Age', fontsize=12)

# Annotate the plot with the p-value, just below the oldest client.
plt.text(0.5, combined_df_2['clnt_age'].max() - 5, f'P-value: {p_val_age:.4f}', 
         horizontalalignment='center', size='large', color='red', weight='semibold')
plt.savefig('grafico_edad_lannister.png', dpi=300, bbox_inches='tight')
plt.show()

# 5. Detailed results.
mean_age_control = age_control.mean()
mean_age_test = age_test.mean()
print(f"--- RESULTS FOR AGE HYPOTHESIS ---")
print(f"Mean Age (Control): {mean_age_control:.2f}")
print(f"Mean Age (Test): {mean_age_test:.2f}")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_val_age:.4f}")

if p_val_age < 0.05:
    # Every figure quoted in the conclusion comes from the values computed
    # above, so the text cannot drift out of sync with the data. The judgement
    # that the gap is negligible is still the analyst's and should be
    # revisited if the reported gap grows.
    age_gap_months = abs(mean_age_control - mean_age_test) * 12
    print(
        f"\nCONCLUSION:The T-test shows a statistically significant difference in age (p={p_val_age:.4f}). "
        f"However, the effect size is negligible ({mean_age_control:.2f} vs {mean_age_test:.2f} years). "
        "With such a large sample size, the test becomes highly sensitive to minor deviations. "
        "For the purpose of this A/B test, we consider the groups to be practically balanced, "
        f"as a {age_gap_months:.0f}-month age difference is unlikely to influence how users interact with the interface."
    )
else:
    print("\nCONCLUSION: Fail to reject H0. There is no significant difference in age.")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# 1. Define "engagement" (success): a client completed the process when at
#    least one of their web events is the 'confirm' step.
confirm_rows = df_web.loc[df_web['process_step'] == 'confirm', 'client_id']
completed_clients = confirm_rows.unique()
combined_df_2['completed'] = combined_df_2['client_id'].isin(completed_clients)

# Drop gender code 'X' up front so the chart and the tests use the same rows.
combined_df_filtered = combined_df_2.loc[combined_df_2['gendr'].ne('X')]

# 2. Completion rate (in %) per gender and experiment arm.
completion_by_gender = (
    combined_df_filtered
    .groupby(['gendr', 'Variation'])['completed']
    .mean()
    .reset_index()
)
completion_by_gender['completion_rate'] = completion_by_gender['completed'].mul(100)

# 3. Bar chart in the Lannister palette (yellow first, then garnet).
lannister_colors = ["#fbdc6a", "#930416"]
sns.set_theme(style="white")
plt.figure(figsize=(12, 6))

ax = sns.barplot(
    data=completion_by_gender,
    x='gendr',
    y='completion_rate',
    hue='Variation',
    palette=lannister_colors,
    edgecolor='#930416',
)

ax.set_title('Completion Rate by Gender and Variation', fontsize=15, fontweight='bold', color='#930416')
ax.set_ylabel('Completion Rate (%)', color='#930416')
ax.set_xlabel('Gender', color='#930416')
ax.set_ylim(0, 100)
ax.legend(title='Group')

# Garnet frame around the plot area.
for spine in ax.spines.values():
    spine.set_edgecolor('#930416')

# Save before showing the figure.
plt.savefig('grafico_genero_lannister.png', dpi=300, bbox_inches='tight')
plt.show()

# 4. Chi-square test of gender vs completion inside each arm ('X' excluded).
test_group = combined_df_filtered.loc[combined_df_filtered['Variation'] == 'Test']
ct_test = pd.crosstab(test_group['gendr'], test_group['completed'])
chi2, p_val_test, _, _ = chi2_contingency(ct_test)

control_group = combined_df_filtered.loc[combined_df_filtered['Variation'] == 'Control']
ct_control = pd.crosstab(control_group['gendr'], control_group['completed'])
chi2_c, p_val_control, _, _ = chi2_contingency(ct_control)

print(f"--- Engagement Analysis (Gender vs Completion) ---")
print(f"P-value for Gender in CONTROL group: {p_val_control:.4f}")
print(f"P-value for Gender in TEST group: {p_val_test:.4f}")

# The written conclusion is driven by the Test-arm p-value only.
gender_matters_in_test = p_val_test < 0.05
print(
    "\nCONCLUSION: Gender DOES affect how users engage with the NEW process."
    if gender_matters_in_test
    else "\nCONCLUSION: Gender does NOT significantly affect engagement."
)

In [None]:
# 1. Make sure the timestamps are real datetimes.
df_web['date_time'] = pd.to_datetime(df_web['date_time'])

# 2. Join the web data with the experiment table to get the 'Variation' column.
df_merged = pd.merge(df_web, df_exp, on="client_id", how="inner")

# 3. Duration of each visit: time between its first and last logged event.
df_durations = df_merged.groupby(['visit_id', 'Variation'])['date_time'].agg(['min', 'max']).reset_index()
df_durations['total_duration'] = (df_durations['max'] - df_durations['min']).dt.total_seconds()

# 4. Keep plausible durations only: above 0 and up to the 95th percentile.
#    The upper bound is inclusive (<=) so the plotted sample is the same one
#    the duration hypothesis-test cell of this notebook keeps.
upper_limit = df_durations['total_duration'].quantile(0.95)
df_final = df_durations[(df_durations['total_duration'] > 0) & (df_durations['total_duration'] <= upper_limit)]

print("✅ 'df_final' ha sido creado con éxito.")
print(f"Columnas disponibles: {df_final.columns.tolist()}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Lannister palette (yellow, garnet).
lannister_colors = ["#fbdc6a","#930416"]

# 2. Plain white background and a wide canvas.
sns.set_theme(style="white")
plt.figure(figsize=(12, 6))

# 3. Density of visit duration per experiment arm; common_norm=False
#    normalises each arm's curve on its own.
ax = sns.kdeplot(
    x='total_duration',
    hue='Variation',
    data=df_final,
    palette=lannister_colors,
    common_norm=False,
    fill=True,
    alpha=0.7,
)

# 4. Title and axis labels in garnet.
ax.set_title('Distribution of Visit Duration: Test vs Control', fontsize=15, fontweight='bold', color='#930416')
ax.set_xlabel('Total Duration (seconds)', fontsize=12, color='#930416')
ax.set_ylabel('Density', fontsize=12, color='#930416')

# 5. Optional: garnet, slightly thicker frame around the plot area.
for spine in ax.spines.values():
    spine.set_linewidth(1.5)
    spine.set_edgecolor('#930416')

# 6. Save the figure. transparent=False keeps the white background so the
#    chart does not blend into the slide.
plt.savefig('grafico_duracion_lannister_final.png', dpi=300, bbox_inches='tight', transparent=False)

plt.show()

In [None]:
from scipy import stats

# 1. Duration of each visit (Hypothesis 1): time between its first and last event.
df_h1 = df_merged.groupby(['visit_id', 'Variation'])['date_time'].agg(['min', 'max']).reset_index()
df_h1['duration_sec'] = (df_h1['max'] - df_h1['min']).dt.total_seconds()

# 2. Clean outliers: drop zero-length visits and anything above the 95th percentile.
limit = df_h1['duration_sec'].quantile(0.95)
df_h1_clean = df_h1[(df_h1['duration_sec'] > 0) & (df_h1['duration_sec'] <= limit)]

# 3. Separate the groups.
control_duration = df_h1_clean[df_h1_clean['Variation'] == 'Control']['duration_sec']
test_duration = df_h1_clean[df_h1_clean['Variation'] == 'Test']['duration_sec']

# 4. Independent two-sample t-test (Welch: equal variances are not assumed).
t_stat, p_value = stats.ttest_ind(control_duration, test_duration, equal_var=False)

# Effect size. These names were used below without ever being defined, which
# made the cell fail with NameError. The difference is signed (Test minus
# Control) and the percentage is expressed relative to the Control mean.
mean_control = control_duration.mean()
mean_test = test_duration.mean()
absolute_diff = mean_test - mean_control
percent_diff = (absolute_diff / mean_control) * 100

# 5. Output results.
print("--- Hypothesis 1: Time Duration Test Results ---")
print(f"Control Mean Duration: {mean_control:.2f} seconds")
print(f"Test Mean Duration: {mean_test:.2f} seconds")
print(f"Difference: {absolute_diff:.2f} seconds ({percent_diff:.2f}%)")
print(f"P-value: {p_value:.4f}")

# 6. Decision at the 5% significance level.
alpha = 0.05
if p_value < alpha:
    print("\nRESULT: Reject H0. The difference is statistically significant.")
else:
    print("\nRESULT: Fail to reject H0. No significant difference found.")