In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import os

In [3]:
dataset_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(''))), 'datasets')
llamados_v2= pd.read_excel(os.path.join(dataset_dir, 'xlsx/llamados_v2.xlsx'), parse_dates=['llamado_fecha_hora'])

In [38]:
# Identify all categorical variables
categorical_variables = llamados_v2.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove the variable you want to exclude
categorical_variables.remove('victima_convive_agresor')

# Initialize a list to store contingency tables
contingency_tables = []

# Iterate through each categorical variable
for variable in categorical_variables:
    # Create a contingency table
    contingency_table = pd.crosstab(index=llamados_v2['victima_convive_agresor'], columns=llamados_v2[variable])
    
    # Append the contingency table to the list
    contingency_tables.append(contingency_table)

# Combine all contingency tables into a single table
combined_contingency_table = pd.concat(contingency_tables, axis=1)

# Calculate expected frequencies
expected_frequencies = chi2_contingency(combined_contingency_table)[3]

# Check condition 5: No expected frequencies should be 0
if (expected_frequencies > 1).all():
    print("Condition 5: All expected frequencies are greater than 1.")
else:
    print("Condition 5: Not all expected frequencies are greater than 1.")

# Check condition 6: Proportion of cells with expected frequencies less than 5
cells_less_than_5 = (expected_frequencies < 5).sum()
total_cells = combined_contingency_table.size
proportion_less_than_5 = cells_less_than_5 / total_cells

if proportion_less_than_5 <= 0.2:
    print("Condition 6: Proportion of cells with expected frequencies less than 5 is acceptable.")
else:
    print("Condition 6: Proportion of cells with expected frequencies less than 5 is not acceptable.")


Condition 5: Not all expected frequencies are greater than 1.
Condition 6: Proportion of cells with expected frequencies less than 5 is acceptable.


In [None]:
# Initialize an empty list to store results
results = []

# remove categorical varaibles where conditions for chi squared are not met


#categorical_variables = [e for e in categorical_variables if e not in ('vs_Intento_violación_tercera_persona',
#                                   'vs_explotacion_sexual_viajes_turismo','ofv_intento_quemar',
#                                   'ofv_intento_ahorcar','ofv_intento_ahogar','ofv_uso_arma_fuego',
#                                   'ofv_intento_matar','ofv_uso_animal_victimizar')]


# Perform chi-squared test for each variable combination
for variable in categorical_variables:
    # Create a contingency table
    contingency_table = pd.crosstab(index=llamados_v2['victima_convive_agresor'], columns=llamados_v2[variable])
    
    # Perform chi-squared test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    
    # Round p-value to 4 decimal places
    p_rounded = round(p, 4)

    # Append results to the list
    results.append({'Variable': variable, 'Chi-Squared Statistic': chi2, 'P-value': p_rounded})

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Print the resulting table
results_df





In [39]:
llamados_v5= pd.read_excel(os.path.join(dataset_dir, 'xlsx/llamados_v5.xlsx'), parse_dates=['llamado_fecha_hora'])

In [41]:
# Identify all categorical variables
categorical_variables = llamados_v5.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove the variable you want to exclude
categorical_variables.remove('victima_convive_agresor')

# Initialize a list to store contingency tables
contingency_tables = []

# Iterate through each categorical variable
for variable in categorical_variables:
    # Create a contingency table
    contingency_table = pd.crosstab(index=llamados_v5['victima_convive_agresor'], columns=llamados_v5[variable])
    
    # Append the contingency table to the list
    contingency_tables.append(contingency_table)

# Combine all contingency tables into a single table
combined_contingency_table = pd.concat(contingency_tables, axis=1)

# Calculate expected frequencies
expected_frequencies = chi2_contingency(combined_contingency_table)[3]

# Check condition 5: No expected frequencies should be 0
if (expected_frequencies > 1).all():
    print("Condition 5: All expected frequencies are greater than 1.")
else:
    print("Condition 5: Not all expected frequencies are greater than 1.")

# Check condition 6: Proportion of cells with expected frequencies less than 5
cells_less_than_5 = (expected_frequencies < 5).sum()
total_cells = combined_contingency_table.size
proportion_less_than_5 = cells_less_than_5 / total_cells

if proportion_less_than_5 <= 0.2:
    print("Condition 6: Proportion of cells with expected frequencies less than 5 is acceptable.")
else:
    print("Condition 6: Proportion of cells with expected frequencies less than 5 is not acceptable.")


Condition 5: All expected frequencies are greater than 1.
Condition 6: Proportion of cells with expected frequencies less than 5 is acceptable.


In [42]:
# Initialize an empty list to store results
results = []

# remove categorical varaibles where conditions for chi squared are not met


#categorical_variables = [e for e in categorical_variables if e not in ('vs_Intento_violación_tercera_persona',
#                                   'vs_explotacion_sexual_viajes_turismo','ofv_intento_quemar',
#                                   'ofv_intento_ahorcar','ofv_intento_ahogar','ofv_uso_arma_fuego',
#                                   'ofv_intento_matar','ofv_uso_animal_victimizar')]


# Perform chi-squared test for each variable combination
for variable in categorical_variables:
    # Create a contingency table
    contingency_table = pd.crosstab(index=llamados_v5['victima_convive_agresor'], columns=llamados_v5[variable])
    
    # Perform chi-squared test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    
    # Round p-value to 4 decimal places
    p_rounded = round(p, 4)

    # Append results to the list
    results.append({'Variable': variable, 'Chi-Squared Statistic': chi2, 'P-value': p_rounded})

# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Print the resulting table
results_df

Unnamed: 0,Variable,Chi-Squared Statistic,P-value
0,llamado_provincia,3012.363696,0.0
1,llamante_genero,2269.788629,0.0
2,llamante_vinculo,5343.959685,0.0
3,caso_judicializado,1103.734558,0.0
4,hecho_lugar,5292.774147,0.0
5,victima_a_resguardo,2854.132584,0.0
6,victima_genero,1739.86127,0.0
7,victima_nacionalidad,2215.928989,0.0
8,victima_vinculo_agresor,10112.13988,0.0
9,victima_discapacidad,2713.794896,0.0


In [44]:
llamados_v4= pd.read_excel(os.path.join(dataset_dir, 'xlsx/llamados_v4.xlsx'), parse_dates=['llamado_fecha_hora'])

In [45]:
# Identify all categorical variables
categorical_variables = llamados_v4.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove the variable you want to exclude
categorical_variables.remove('victima_convive_agresor')

# Initialize a list to store contingency tables
contingency_tables = []

# Iterate through each categorical variable
for variable in categorical_variables:
    # Create a contingency table
    contingency_table = pd.crosstab(index=llamados_v4['victima_convive_agresor'], columns=llamados_v4[variable])
    
    # Append the contingency table to the list
    contingency_tables.append(contingency_table)

# Combine all contingency tables into a single table
combined_contingency_table = pd.concat(contingency_tables, axis=1)

# Calculate expected frequencies
expected_frequencies = chi2_contingency(combined_contingency_table)[3]

# Check condition 5: No expected frequencies should be 0
if (expected_frequencies > 1).all():
    print("Condition 5: All expected frequencies are greater than 1.")
else:
    print("Condition 5: Not all expected frequencies are greater than 1.")

# Check condition 6: Proportion of cells with expected frequencies less than 5
cells_less_than_5 = (expected_frequencies < 5).sum()
total_cells = combined_contingency_table.size
proportion_less_than_5 = cells_less_than_5 / total_cells

if proportion_less_than_5 <= 0.2:
    print("Condition 6: Proportion of cells with expected frequencies less than 5 is acceptable.")
else:
    print("Condition 6: Proportion of cells with expected frequencies less than 5 is not acceptable.")

Condition 5: Not all expected frequencies are greater than 1.
Condition 6: Proportion of cells with expected frequencies less than 5 is acceptable.


In [None]:
# Identify all categorical variables
numerical_variables = llamados_v2.select_dtypes(include=['integer', 'float', 'timestamp']).columns.tolist()


In [None]:
llamados_v_dummy=pd.concat([llamados_v2, pd.llamados_v2(llamados_v2['victima_convive_agresor'],prefix='victima_convive_agresor')],axis=1)
