In [1]:
import pandas as pd
import numpy as np

# Load the final cleaned dataset
file_path = 'Translated_Journal.csv'
df = pd.read_csv(file_path)

print("--- Descriptive Statistics for All Numerical Columns ---")
# Select only numerical columns for the describe() function
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(df[numerical_cols].describe())

print("\n--- Value Counts for Key Categorical Columns ---")
print("\nSex:")
# Replace inconsistent values to ensure a clean count
df['Sex'] = df['Sex'].replace({'M': 'Male', 'F': 'Female'})
print(df['Sex'].value_counts())

print("\nDiagnosis Stage:")
print(df['Diagnosis Stage'].value_counts())

print("\nRoute of transmission:")
print(df['Route of transmission'].value_counts())

print("\nCo_infection_Status_Combined:")
print(df['Co_infection_Status_Combined'].value_counts())


--- Descriptive Statistics for All Numerical Columns ---
               Age   Latest VN result (copies/ml)  \
count  7182.000000                   7.155000e+03   
mean     39.887218                   5.572308e+04   
std      12.691100                   5.959061e+05   
min       0.000000                   1.000000e+00   
25%      32.000000                   4.000000e+01   
50%      40.000000                   4.000000e+01   
75%      48.000000                   4.000000e+01   
max      87.000000                   2.370000e+07   

       Since when has been using drugs   First CD4 result (cells/ml)  \
count                       725.000000                   6753.000000   
mean                       2003.291034                    363.664594   
std                           7.183429                    306.762324   
min                        1980.000000                      0.000000   
25%                        1999.000000                    177.000000   
50%                        2003.0

KeyError: 'Diagnosis Stage'

In [3]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Load the final cleaned dataset
file_path = 'Translated_Journal_Cleaned.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    df = None

if df is not None:
    # --- Critical Fix: Standardize column names after loading ---
    # This step will fix any lingering whitespace or invisible characters
    df.columns = df.columns.str.strip()
    # Also, we will explicitly rename the columns that were causing the KeyError
    df = df.rename(columns={
        'Sex': 'Sex',  # Making sure 'Sex' is correct
        'Diagnosis Stage': 'Diagnosis Stage',  # Fixing any potential issue with this name
        'Route of transmission': 'Route of transmission',  # Fixing any potential issue with this name
        'Co_infection_Status_Combined': 'Co_infection_Status_Combined'  # Fixing any potential issue with this name
    }, errors='ignore')

    # --- Step 1: Descriptive Statistics for All Numerical Columns ---
    print("--- Descriptive Statistics for All Numerical Columns ---")
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    print(df[numerical_cols].describe())

    # --- Step 2: Value Counts for Key Categorical Columns ---
    print("\n--- Value Counts for Key Categorical Columns ---")
    print("\nSex:")
    # Standardize 'M' and 'F' to 'Male' and 'Female' for a clean count
    df['Sex'] = df['Sex'].replace({'M': 'Male', 'F': 'Female'})
    print(df['Sex'].value_counts())

    print("\nDiagnosis Stage:")
    print(df['Diagnosis Stage'].value_counts())

    print("\nRoute of transmission:")
    print(df['Route of transmission'].value_counts())

    print("\nCo_infection_Status_Combined:")
    print(df['Co_infection_Status_Combined'].value_counts())

    # --- Step 3: Chi-Squared Test for Association (Sex vs. Diagnosis Stage) ---
    print("\n\n--- Chi-Squared Test for Sex vs. Diagnosis Stage ---")
    contingency_table_sex_diag = pd.crosstab(df['Sex'], df['Diagnosis Stage'])
    print("Contingency Table:")
    print(contingency_table_sex_diag)
    chi2, p, dof, expected = chi2_contingency(contingency_table_sex_diag)
    print(f"\nChi-squared statistic: {chi2}")
    print(f"p-value: {p}")
    alpha = 0.05
    if p < alpha:
        print("\nConclusion: The association between Sex and Diagnosis Stage is statistically significant (p < 0.05).")
    else:
        print("\nConclusion: There is no statistically significant association between Sex and Diagnosis Stage (p >= 0.05).")

    # --- Step 4: Chi-Squared Test for Association (Route of Transmission vs. Co-infection Status) ---
    print("\n\n--- Chi-Squared Test for Route of Transmission vs. Co-infection Status ---")
    contingency_table_route_coinfect = pd.crosstab(df['Route of transmission'], df['Co_infection_Status_Combined'])
    print("Contingency Table:")
    print(contingency_table_route_coinfect)
    chi2, p, dof, expected = chi2_contingency(contingency_table_route_coinfect)
    print(f"\nChi-squared statistic: {chi2}")
    print(f"p-value: {p}")
    alpha = 0.05
    if p < alpha:
        print("\nConclusion: The association between Route of transmission and Co-infection Status is statistically significant (p < 0.05).")
    else:
        print("\nConclusion: There is no statistically significant association between Route of transmission and Co-infection Status (p >= 0.05).")



ModuleNotFoundError: No module named 'scipy'