In [1]:
import pandas as pd

file_path = 'Trial Data 1-23.csv' # Make sure this filename is EXACTLY correct!

# *** IMPORTANT CHANGE HERE: Added encoding='latin1' ***
df = pd.read_csv(file_path, encoding='latin1')

print("DataFrame Info:")
df.info()

print("\nFirst 5 rows of the DataFrame:")
print(df.head())

print("\nAll Column Names:")
for column in df.columns:
    print(column)

columns_to_check_unique = [
    'Sex', 'Age', 'Region', 'HIV Status',
    'Last result of HCS (AntiHCV)', 'Last result of HBV test (HBsAg)',
    'ART Started', 'Route of transmission', 'Outcome of HCV treatment',
    'CD4 Initial', 'CD4 Latest', 'Viral Load Latest',
    'Date of Birth', 'Treatment Start Date', 'Treatment End Date',
    'Since when has been using drugs', 'Alcohol abuse', 'Migrant',
    'D-examination:Injecting drug use in the last 6 months',
    'D-examination:Sexual contacts in the last 6 months',
    'D-examination:Provision of commercial sex services in the last 6 months'
]

print("\n--- Key Column Insights (Unique Values) ---")
for col in columns_to_check_unique:
    if col in df.columns:
        print(f"\nUnique values in '{col}':")
        print([str(val) for val in df[col].unique()])
    else:
        print(f"\nColumn '{col}' not found in the DataFrame.")

print("\nDescriptive statistics for numerical columns:")
print(df.describe())

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Columns: 171 entries, Country of arrival (for foreigners) to NGO: Date of service provision
dtypes: float64(31), int64(1), object(139)
memory usage: 65.6+ KB

First 5 rows of the DataFrame:
  Country of arrival (for foreigners) Current area of medical examination  \
0                                 NaN                              BATKEN   
1                                 NaN                              BATKEN   
2                                 NaN                              BATKEN   
3                                 NaN                              BATKEN   
4                                 NaN                              BATKEN   

     Citizenship Date of IB/PCR/ELISA Sex  Age  \
0  Citizen of KR           2025-06-13   ?   38   
1  Citizen of KR           2025-06-04   ?   23   
2  Citizen of KR           2025-04-30   ?   58   
3  Citizen of KR           2025-03-28   ?   19   
4  Citizen 

In [2]:
# This line of code will go through every column name in your DataFrame
# and remove any spaces from the start or end of each name.
df.columns = df.columns.str.strip()

# Now, let's print all the column names again to see the effect.
# This time, they should be clean of leading/trailing spaces.
print("\nAll Column Names AFTER Stripping Whitespace:")
for column in df.columns:
    print(column)


All Column Names AFTER Stripping Whitespace:
Country of arrival (for foreigners)
Current area of medical examination
Citizenship
Date of IB/PCR/ELISA
Sex
Age
????????? ????????? ?? (?????/??)
On D-account in
Date of ART initiation
Date of D-registration
Detected in KR
Country of detection
Region of detection
Administrative unit of detection
UIC
Last result of HCS (AntiHCV)
Last result of HBV test (HBsAg)
HBV vaccination
Date of initiation of HCV treatment
Examination code
Date of birth
Transgender
IB/PCR/ELISA number
IB conclusion
Route of transmission
Retrospective child
LUIN
Since when has been using drugs
Registered with a narcologist
Alcohol abuse
Full name of the attending physician
Registration address: region
Registration address: administrative unit of the region
Registration address: locality
Registration address (c/v)
Residential address: region
Residential address: administrative unit of the region
Residential address: locality
Residential address: street, house, apartment


In [3]:
# Let's inspect some key columns again after cleaning their names.
# We expect columns like 'Region', 'HIV Status', 'ART Started', etc., to now be found.

columns_to_check_unique = [
    'Sex',
    'Age',
    'Region', # Re-checking after stripping whitespace
    'HIV Status', # Re-checking after stripping whitespace
    'Last result of HCS (AntiHCV)',
    'Last result of HBV test (HBsAg)',
    'ART Started', # Re-checking after stripping whitespace
    'Route of transmission',
    'Outcome of HCV treatment',
    'Date of birth', # Re-checking this one (note: 'Date of Birth' from initial list changed to 'Date of birth' based on your output)
    'Date of ART initiation', # Re-checking this one (exact name)
    'Treatment Start Date', # Re-checking this one (exact name)
    'Treatment End Date', # Re-checking this one (exact name)
    # The CD4 and Viral Load columns with exact names from your full list
    'Date of the first CD4 result',
    'First CD4 result (cells/ml) <= 500',
    'Date of the last CD4 result',
    'Last result of %CD4',
    'Date of the first VL result',
    'First VL result (copies/ml)',
    'Date of the last VL result',
    'Since when has been using drugs',
    'Alcohol abuse',
    'Migrant',
    'D-examination:Injecting drug use in the last 6 months',
    'D-examination:Sexual contacts in the last 6 months',
    'D-examination:Provision of commercial sex services in the last 6 months'
]

print("\n--- Key Column Insights (Unique Values) after stripping whitespace ---")
for col in columns_to_check_unique:
    if col in df.columns: # Now this check should work correctly for cleaned names
        print(f"\nUnique values in '{col}':")
        # Convert to string to handle NaN properly for printing
        print([str(val) for val in df[col].unique()])
    else:
        print(f"\nColumn '{col}' is STILL not found. Please double-check the exact spelling.")

# Also, let's see descriptive statistics again, as the numerical columns are likely better identified now.
print("\nDescriptive statistics for numerical columns (re-checked):")
print(df.describe())


--- Key Column Insights (Unique Values) after stripping whitespace ---

Unique values in 'Sex':
['?']

Unique values in 'Age':
['38', '23', '58', '19', '43', '32', '29', '44', '30', '35', '26', '36', '2', '37', '46', '18', '50', '15', '33', '65', '7', '1', '41', '57', '56', '51', '31', '24', '45', '52', '34', '22', '39', '25']

Column 'Region' is STILL not found. Please double-check the exact spelling.

Column 'HIV Status' is STILL not found. Please double-check the exact spelling.

Unique values in 'Last result of HCS (AntiHCV)':
['Negative', 'nan', 'Positive']

Unique values in 'Last result of HBV test (HBsAg)':
['Negative', 'nan', 'Positive']

Column 'ART Started' is STILL not found. Please double-check the exact spelling.

Unique values in 'Route of transmission':
['Homosexual, sexual', 'Heterosexual, sexual', 'Undetermined route', '???????????? (?? ????? ????????????, ?????, ??? ????????? ??????)']

Unique values in 'Outcome of HCV treatment':
['nan', '??? 12']

Unique values in 

In [4]:
# Let's inspect our key columns with their EXACT names from the DataFrame.

columns_to_check_unique = [
    'Sex',
    'Age',
    'Region of detection', # Corrected name for Region
    'HIV Status', # This was in your full list, let's re-verify why it wasn't found before
    'Last result of HCS (AntiHCV)',
    'Last result of HBV test (HBsAg)',
    'ART Started', # Corrected name for ART Started
    'Route of transmission',
    'Outcome of HCV treatment',
    'Date of birth',
    'Date of ART initiation',
    'Date of initiation of HCV treatment', # Exact name for HCV treatment start date
    'Date of completion of HCV treatment', # Exact name for HCV treatment end date
    'Date of the first CD4 result',
    'First CD4 result (cells/ml) <= 500',
    'Date of the last CD4 result',
    'Last result of %CD4',
    'Date of the first VL result',
    'First VL result (copies/ml)',
    'Date of the last VL result',
    'Since when has been using drugs',
    'Alcohol abuse',
    'Migrant',
    'D-examination:Injecting drug use in the last 6 months',
    'D-examination:Sexual contacts in the last 6 months',
    'D-examination:Provision of commercial sex services in the last 6 months'
]

print("\n--- Key Column Insights (Unique Values) - FINAL CHECK with Exact Names ---")
for col in columns_to_check_unique:
    if col in df.columns:
        print(f"\nUnique values in '{col}':")
        print([str(val) for val in df[col].unique()])
    else:
        print(f"\nColumn '{col}' is STILL not found. This indicates an issue with the name itself or its absence.")

print("\nDescriptive statistics for numerical columns (FINAL CHECK):")
print(df.describe())


--- Key Column Insights (Unique Values) - FINAL CHECK with Exact Names ---

Unique values in 'Sex':
['?']

Unique values in 'Age':
['38', '23', '58', '19', '43', '32', '29', '44', '30', '35', '26', '36', '2', '37', '46', '18', '50', '15', '33', '65', '7', '1', '41', '57', '56', '51', '31', '24', '45', '52', '34', '22', '39', '25']

Unique values in 'Region of detection':
['BATKEN', '?.??????', '??????', '?.??']

Column 'HIV Status' is STILL not found. This indicates an issue with the name itself or its absence.

Unique values in 'Last result of HCS (AntiHCV)':
['Negative', 'nan', 'Positive']

Unique values in 'Last result of HBV test (HBsAg)':
['Negative', 'nan', 'Positive']

Column 'ART Started' is STILL not found. This indicates an issue with the name itself or its absence.

Unique values in 'Route of transmission':
['Homosexual, sexual', 'Heterosexual, sexual', 'Undetermined route', '???????????? (?? ????? ????????????, ?????, ??? ????????? ??????)']

Unique values in 'Outcome of H

In [5]:
print("--- Investigating 'HIV Status' and 'ART Started' ---")

# Find columns that contain 'HIV' (case-insensitive)
hiv_cols = [col for col in df.columns if 'hiv' in col.lower()]
print("\nColumns containing 'HIV' (case-insensitive):")
if hiv_cols:
    for col in hiv_cols:
        print(f"- '{col}' (Length: {len(col)})")
        # Print the raw string representation to reveal hidden characters
        print(f"  Raw: {repr(col)}")
else:
    print("No columns containing 'HIV' found.")

# Find columns that contain 'ART' (case-insensitive)
art_cols = [col for col in df.columns if 'art' in col.lower()]
print("\nColumns containing 'ART' (case-insensitive):")
if art_cols:
    for col in art_cols:
        print(f"- '{col}' (Length: {len(col)})")
        print(f"  Raw: {repr(col)}")
else:
    print("No columns containing 'ART' found.")

# Let's also check a known good column for comparison, like 'Sex'
print(f"\nExample of known good column 'Sex': '{'Sex'}' (Length: {len('Sex')})")
print(f"  Raw: {repr('Sex')}")

# Finally, let's explicitly try to access 'HIV Status' and 'ART Started' by exact name again just to be sure
try:
    print(f"\nAttempting to access 'HIV Status': {df['HIV Status'].unique()}")
except KeyError:
    print("Error: 'HIV Status' column still not found by exact match.")

try:
    print(f"\nAttempting to access 'ART Started': {df['ART Started'].unique()}")
except KeyError:
    print("Error: 'ART Started' column still not found by exact match.")

--- Investigating 'HIV Status' and 'ART Started' ---

Columns containing 'HIV' (case-insensitive):
- 'RR: PLHIV informed' (Length: 18)
  Raw: 'RR: PLHIV informed'

Columns containing 'ART' (case-insensitive):
- 'Date of ART initiation' (Length: 22)
  Raw: 'Date of ART initiation'
- 'Residential address: street, house, apartment' (Length: 45)
  Raw: 'Residential address: street, house, apartment'
- 'Obtaining written consent for ART' (Length: 33)
  Raw: 'Obtaining written consent for ART'
- 'Date of primary ART initiation' (Length: 30)
  Raw: 'Date of primary ART initiation'
- 'Date of initiation of the current course of ART' (Length: 47)
  Raw: 'Date of initiation of the current course of ART'
- 'ART is provided by' (Length: 18)
  Raw: 'ART is provided by'
- 'Last date of change in ART' (Length: 26)
  Raw: 'Last date of change in ART'
- 'Type of the last change in ART' (Length: 30)
  Raw: 'Type of the last change in ART'
- 'Reason for the last change in ART' (Length: 33)
  Raw: 'Reason

In [6]:
# Define a dictionary to map old column names (Kyrgyz) to new ones (English)
# Ensure these Kyrgyz names exactly match what you saw in the full column list output (from Step 6)
# The 'u' prefix for unicode strings might be needed in older Python versions, but often isn't required now.
# I'm using the exact text representations you would have seen in the raw text output.

column_rename_map = {
    'ВИЧ статус': 'HIV Status', # Mapping Kyrgyz 'ВИЧ статус' to 'HIV Status'
    'АРТ начата': 'ART Started' # Mapping Kyrgyz 'АРТ начата' to 'ART Started'
}

# Rename the columns in your DataFrame
# The 'errors="ignore"' means if a column in the map isn't found, it won't stop the process.
df = df.rename(columns=column_rename_map, errors="ignore")

print("\n--- Columns Renamed ---")

# Now, let's re-check if our newly named columns are found and display their unique values.
# We will also confirm their presence.
columns_to_verify_rename = [
    'HIV Status',
    'ART Started'
]

print("\n--- Verifying Renamed Columns ---")
for col in columns_to_verify_rename:
    if col in df.columns:
        print(f"\nUnique values in '{col}':")
        print([str(val) for val in df[col].unique()])
    else:
        print(f"\nColumn '{col}' (renamed) is STILL not found. Please double-check original Kyrgyz name match or presence.")


--- Columns Renamed ---

--- Verifying Renamed Columns ---

Column 'HIV Status' (renamed) is STILL not found. Please double-check original Kyrgyz name match or presence.

Column 'ART Started' (renamed) is STILL not found. Please double-check original Kyrgyz name match or presence.


In [7]:
print("--- Deep Inspection of All Column Names (Raw Representation) ---")

# Iterate through all column names and print their raw representation using repr()
# This will reveal any hidden characters or precise Unicode forms that might be causing mismatches.
for col_name in df.columns:
    print(f"'{col_name}' -> Raw: {repr(col_name)}")

print("\n--- Manual Scan for HIV/ART Related Columns ---")
print("Please carefully look through the 'Raw:' outputs above.")
print("Specifically, search for entries that look like:")
print("  - 'ВИЧ статус' (This is 'HIV status' in Kyrgyz)")
print("  - 'АРТ начата' (This is 'ART started' in Kyrgyz)")
print("Copy the *exact* 'Raw:' string for those columns and paste it here.")
print("If you see any other columns that look like they indicate HIV status or ART, please note them too.")

--- Deep Inspection of All Column Names (Raw Representation) ---
'Country of arrival (for foreigners)' -> Raw: 'Country of arrival (for foreigners)'
'Current area of medical examination' -> Raw: 'Current area of medical examination'
'Citizenship' -> Raw: 'Citizenship'
'Date of IB/PCR/ELISA' -> Raw: 'Date of IB/PCR/ELISA'
'Sex' -> Raw: 'Sex'
'Age' -> Raw: 'Age'
'????????? ????????? ?? (?????/??)' -> Raw: '????????? ????????? ?? (?????/??)'
'On D-account in' -> Raw: 'On D-account in'
'Date of ART initiation' -> Raw: 'Date of ART initiation'
'Date of D-registration' -> Raw: 'Date of D-registration'
'Detected in KR' -> Raw: 'Detected in KR'
'Country of detection' -> Raw: 'Country of detection'
'Region of detection' -> Raw: 'Region of detection'
'Administrative unit of detection' -> Raw: 'Administrative unit of detection'
'UIC' -> Raw: 'UIC'
'Last result of HCS (AntiHCV)' -> Raw: 'Last result of HCS (AntiHCV)'
'Last result of HBV test (HBsAg)' -> Raw: 'Last result of HBV test (HBsAg)'
'HBV 

In [8]:
# Define a dictionary to map the old Kyrgyz HIV Status column to a new English name
# We use the EXACT raw string from your output for the old name.
column_rename_map_final = {
    '????????? ????????? ?? (?????/??)': 'HIV Status'
}

# Rename the column in your DataFrame
df = df.rename(columns=column_rename_map_final, errors="ignore")

print("\n--- HIV Status Column Renamed ---")

# Now, let's verify that 'HIV Status' is found and display its unique values.
print("\n--- Verifying Renamed 'HIV Status' Column ---")
if 'HIV Status' in df.columns:
    print(f"Unique values in 'HIV Status':")
    print([str(val) for val in df['HIV Status'].unique()])
else:
    print("Error: 'HIV Status' column still not found after renaming.")

print("\n--- Clarification on 'ART Started' ---")
print("The column 'АРТ начата' from your image corresponds to 'Date of ART initiation' in the data.")
print("This means ART status (started/not started) is likely indicated by whether this column has a date or is empty (NaN).")
print("Unique values in 'Date of ART initiation':")
print([str(val) for val in df['Date of ART initiation'].unique()])


--- HIV Status Column Renamed ---

--- Verifying Renamed 'HIV Status' Column ---
Unique values in 'HIV Status':
['340000.0', '362000.0', '348000.0', '509000.0', '60400.0', '1.559999943', '42.0', '444.0', '40.0', '96.0', '203.0', '166.0', '181000.0', '12.0', '43.0', '1.0', '114.0', '136.0', '710.0', '20700.0']

--- Clarification on 'ART Started' ---
The column 'АРТ начата' from your image corresponds to 'Date of ART initiation' in the data.
This means ART status (started/not started) is likely indicated by whether this column has a date or is empty (NaN).
Unique values in 'Date of ART initiation':
['2025-06-16', '2025-06-05', '2025-05-13', '2025-04-07', '2025-04-02', '2025-04-24', '2025-03-18', '2025-03-26', '2025-02-26', '2025-02-25', '2025-01-15', '2024-12-23', '2024-12-16', '2024-10-16', '2024-10-04', '2024-10-01', '2024-09-26', '2024-09-18', '2024-09-16', '2024-09-23', '2024-09-12', '2024-10-10', '2024-08-28', '2024-08-21', '2024-07-23', '2024-07-16', '2024-07-12', '2024-05-07', '2

In [9]:
# --- Part A: Renaming Key Kyrgyz Columns to English ---

# Define a dictionary to map the identified Kyrgyz column to its English name.
# Based on your confirmation, '????????? ????????? ?? (?????/??)' is the INITIAL VIRAL LOAD.
column_rename_map = {
    '????????? ????????? ?? (?????/??)': 'Initial Viral Load (copies/ml)',
    # Add other Kyrgyz columns here as you get their confirmed translations.
    # For now, let's focus on this confirmed one.

    # Let's also rename some other clearly identified Kyrgyz-named numeric/date columns for future use
    # (based on our previous investigation and common sense for health data)
    '?????? ????????? CD4 (??/??)': 'Initial CD4 Result (cells/ml)',
    '????????????? ????????? CD4 (??/??)': 'Penultimate CD4 Result (cells/ml)',
    '????????? ????????? CD4 (??/??)': 'Latest CD4 Result (cells/ml)',
    '????????????? ????????? ?? (?????/??)': 'Penultimate Viral Load (copies/ml)',
}

# Apply the renaming to the DataFrame
df = df.rename(columns=column_rename_map, errors="ignore")

print("\n--- Key Kyrgyz Columns Renamed ---")
print(f"New column names for confirmation: {[col for col in column_rename_map.values() if col in df.columns]}")


# --- Part B: Converting Date Columns to DateTime Objects ---

# For date calculations (like duration of treatment, age from birth date),
# these columns must be in a proper 'datetime' format.

# List of columns we've identified as dates (use their *new* names if renamed)
date_columns = [
    'Date of birth',
    'Date of ART initiation',
    'Date of initiation of HCV treatment',
    'Date of completion of HCV treatment',
    'Date of IB/PCR/ELISA', # This is likely a diagnosis/test date
    'Date of the first CD4 result',
    'Date of the last CD4 result',
    'Date of the first VL result',
    'Date of the last VL result',
    # Add other date columns as you identify them from the full list
]

# Loop through each identified date column and convert it
for col in date_columns:
    if col in df.columns: # Check if the column actually exists in the DataFrame
        # 'errors="coerce"' will turn any dates that can't be parsed into 'NaT' (Not a Time)
        df[col] = pd.to_datetime(df[col], errors='coerce')
        print(f"Converted '{col}' to datetime type.")
    else:
        print(f"Warning: Date column '{col}' not found for conversion.")

# --- Part C: Verifying Changes and Displaying Initial Cleaned Data ---

print("\n--- Data Info After Renaming and Date Conversion ---")
df.info()

print("\n--- First 5 Rows with Renamed Columns and Converted Dates ---")
# Displaying selected key columns for easy viewing
display_columns = [
    'Sex', 'Age', 'Region of detection', 'Last result of HCS (AntiHCV)', 'Last result of HBV test (HBsAg)',
    'Initial Viral Load (copies/ml)', 'Initial CD4 Result (cells/ml)', 'Date of ART initiation',
    'Date of initiation of HCV treatment', 'Date of completion of HCV treatment',
    'Route of transmission'
]
# Filter to only show columns that exist in the DataFrame
existing_display_columns = [col for col in display_columns if col in df.columns]
print(df[existing_display_columns].head())


--- Key Kyrgyz Columns Renamed ---
New column names for confirmation: ['Initial CD4 Result (cells/ml)', 'Penultimate CD4 Result (cells/ml)', 'Latest CD4 Result (cells/ml)', 'Penultimate Viral Load (copies/ml)']
Converted 'Date of birth' to datetime type.
Converted 'Date of ART initiation' to datetime type.
Converted 'Date of initiation of HCV treatment' to datetime type.
Converted 'Date of completion of HCV treatment' to datetime type.
Converted 'Date of IB/PCR/ELISA' to datetime type.
Converted 'Date of the first CD4 result' to datetime type.
Converted 'Date of the last CD4 result' to datetime type.
Converted 'Date of the first VL result' to datetime type.
Converted 'Date of the last VL result' to datetime type.

--- Data Info After Renaming and Date Conversion ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Columns: 171 entries, Country of arrival (for foreigners) to NGO: Date of service provision
dtypes: datetime64[ns](9), float64(31), int64(1), object(130

In [10]:
# --- Part A: Cleaning 'Sex' Column ---
# We saw 'Sex' contains '?'. Let's replace it with 'Unknown'.
# We'll also make sure consistent casing (e.g., 'Male', 'Female')
df['Sex'] = df['Sex'].replace('?', 'Unknown').str.capitalize()
print("\n--- Cleaned 'Sex' Column Unique Values ---")
print(df['Sex'].unique())


# --- Part B: Standardizing 'Route of transmission' ---
# This column has English, Kyrgyz, and 'Undetermined route'.
# We need to standardize and translate the Kyrgyz part.
# From previous outputs, '???????????? (?? ????? ????????????, ?????, ??? ????????? ??????)'
# likely means something like 'Other (undetermined, etc.)' or 'Unknown'.
# Let's map it to 'Undetermined route' or 'Other/Unknown'.

# Define a mapping dictionary for standardization
route_of_transmission_map = {
    'Homosexual, sexual': 'Homosexual/MSM',
    'Heterosexual, sexual': 'Heterosexual',
    'Undetermined route': 'Undetermined/Other',
    # Assuming the Kyrgyz phrase means 'Other (undetermined route, etc.)'
    '???????????? (?? ????? ????????????, ?????, ??? ????????? ??????)': 'Undetermined/Other'
}

# Apply the mapping. Using .fillna() to ensure NaNs are handled explicitly if needed later.
df['Route of transmission'] = df['Route of transmission'].replace(route_of_transmission_map).fillna('Undetermined/Other')
print("\n--- Cleaned 'Route of transmission' Column Unique Values ---")
print(df['Route of transmission'].unique())


# --- Part C: Standardizing 'D-examination' Risk Factor Columns ---
# These columns have 'No' and 'No ??????' (No in Kyrgyz).
# We'll map them to clear 'Yes'/'No' boolean-like values.
# Assuming 'No ??????' also means 'No'.

risk_col_map = {
    'No': 'No',
    'No ??????': 'No',
    'Only with men': 'Sexual contacts with men', # For 'D-examination:Sexual contacts in the last 6 months'
    '?????? ? ?????????': 'Sexual contacts (women)', # Assuming this means sexual contacts with women
    '? ? ?????????, ? ? ?????????': 'Sexual contacts (men & women)' # Assuming this means sexual contacts with both
}

# Apply mapping to relevant columns
df['D-examination:Injecting drug use in the last 6 months'] = df['D-examination:Injecting drug use in the last 6 months'].replace(risk_col_map).fillna('No')
df['D-examination:Sexual contacts in the last 6 months'] = df['D-examination:Sexual contacts in the last 6 months'].replace(risk_col_map).fillna('No contacts reported')
df['D-examination:Provision of commercial sex services in the last 6 months'] = df['D-examination:Provision of commercial sex services in the last 6 months'].replace(risk_col_map).fillna('No')

print("\n--- Cleaned D-examination Risk Factor Columns Unique Values ---")
print(f"Injecting drug use: {df['D-examination:Injecting drug use in the last 6 months'].unique()}")
print(f"Sexual contacts: {df['D-examination:Sexual contacts in the last 6 months'].unique()}")
print(f"Commercial sex services: {df['D-examination:Provision of commercial sex services in the last 6 months'].unique()}")


# --- Part D: Standardizing 'Outcome of HCV treatment' ---
# We saw '??? 12'. This likely means SVR 12 (Sustained Virologic Response at 12 weeks), which indicates cure.
df['Outcome of HCV treatment'] = df['Outcome of HCV treatment'].replace('??? 12', 'SVR 12 (Cured)').fillna('Not Treated/Unknown Outcome')
print("\n--- Cleaned 'Outcome of HCV treatment' Unique Values ---")
print(df['Outcome of HCV treatment'].unique())


# --- Part E: Feature Engineering: Creating 'ART Started' Boolean Column ---
# Since all patients are HIV positive and 'Date of ART initiation' exists,
# we can create a simple 'ART Started' column.
# If 'Date of ART initiation' is not NaT (i.e., it has a date), then ART has started.
df['ART Started'] = df['Date of ART initiation'].notna().astype(bool)
print("\n--- New 'ART Started' Column Unique Values ---")
print(df['ART Started'].unique())

# --- Part F: Feature Engineering: Calculating 'Age at Diagnosis/Data' ---
# We already have 'Age' as a numerical column, which is useful.
# If we wanted to calculate age from 'Date of birth' at a specific 'Date of IB/PCR/ELISA', we could:
# df['Age at Diagnosis'] = (df['Date of IB/PCR/ELISA'] - df['Date of birth']).dt.days / 365.25
# For now, we'll rely on the existing 'Age' column since it's already numeric.

# --- Part G: Displaying a Sample of Cleaned Data ---
print("\n--- First 5 Rows with Cleaned Values and New Features ---")
display_columns_cleaned = [
    'Sex', 'Age', 'Region of detection', 'Last result of HCS (AntiHCV)', 'Last result of HBV test (HBsAg)',
    'Initial Viral Load (copies/ml)', 'Initial CD4 Result (cells/ml)', 'Date of ART initiation',
    'ART Started', 'Route of transmission',
    'D-examination:Injecting drug use in the last 6 months',
    'D-examination:Sexual contacts in the last 6 months',
    'Outcome of HCV treatment'
]
existing_display_columns_cleaned = [col for col in display_columns_cleaned if col in df.columns]
print(df[existing_display_columns_cleaned].head())


--- Cleaned 'Sex' Column Unique Values ---
['Unknown']

--- Cleaned 'Route of transmission' Column Unique Values ---
['Homosexual/MSM' 'Heterosexual' 'Undetermined/Other']

--- Cleaned D-examination Risk Factor Columns Unique Values ---
Injecting drug use: ['No']
Sexual contacts: ['No contacts' 'No' 'Sexual contacts with men' 'Sexual contacts (women)'
 'Sexual contacts (men & women)']
Commercial sex services: ['No']

--- Cleaned 'Outcome of HCV treatment' Unique Values ---
['Not Treated/Unknown Outcome' 'SVR 12 (Cured)']

--- New 'ART Started' Column Unique Values ---
[ True]

--- First 5 Rows with Cleaned Values and New Features ---
       Sex  Age Region of detection Last result of HCS (AntiHCV)  \
0  Unknown   38              BATKEN                     Negative   
1  Unknown   23              BATKEN                     Negative   
2  Unknown   58              BATKEN                     Negative   
3  Unknown   19              BATKEN                     Negative   
4  Unknown   43  

In [11]:
# --- Part A: Consolidating 'No' and 'No contacts' in 'D-examination:Sexual contacts in the last 6 months' ---

# Define a mapping dictionary for this specific column
sexual_contacts_map = {
    'No contacts': 'No contacts reported',
    'No': 'No contacts reported', # Consolidate 'No' into 'No contacts reported'
    'Sexual contacts with men': 'Sexual contacts with men',
    'Sexual contacts (women)': 'Sexual contacts with women', # Keep as is, or simplify to 'Sexual contacts with women' if needed
    'Sexual contacts (men & women)': 'Sexual contacts with men & women'
}

# Apply the mapping
df['D-examination:Sexual contacts in the last 6 months'] = df['D-examination:Sexual contacts in the last 6 months'].replace(sexual_contacts_map)

print("\n--- Refined 'D-examination:Sexual contacts in the last 6 months' Unique Values ---")
print(df['D-examination:Sexual contacts in the last 6 months'].unique())


# --- Part B: Overview of Data Types After All Conversions ---
# It's good practice to re-check data types before moving to analysis
print("\n--- Final Data Info After All Conversions and Cleaning ---")
df.info()

# --- Part C: Displaying a Sample of the Fully Cleaned Core Data ---
print("\n--- First 10 Rows of Core Cleaned Data for Review ---")
display_columns_final = [
    'Sex', 'Age', 'Region of detection', 'Last result of HCS (AntiHCV)', 'Last result of HBV test (HBsAg)',
    'Initial Viral Load (copies/ml)', 'Initial CD4 Result (cells/ml)',
    'Date of ART initiation', 'ART Started', 'Route of transmission',
    'D-examination:Injecting drug use in the last 6 months',
    'D-examination:Sexual contacts in the last 6 months',
    'D-examination:Provision of commercial sex services in the last 6 months',
    'Date of initiation of HCV treatment', 'Date of completion of HCV treatment',
    'Outcome of HCV treatment'
]
# Ensure we only try to display columns that actually exist in the DataFrame
existing_display_columns_final = [col for col in display_columns_final if col in df.columns]
print(df[existing_display_columns_final].head(10)) # Showing 10 rows for better overview


--- Refined 'D-examination:Sexual contacts in the last 6 months' Unique Values ---
['No contacts reported' 'Sexual contacts with men'
 'Sexual contacts with women' 'Sexual contacts with men & women']

--- Final Data Info After All Conversions and Cleaning ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Columns: 172 entries, Country of arrival (for foreigners) to ART Started
dtypes: bool(1), datetime64[ns](9), float64(31), int64(1), object(130)
memory usage: 65.6+ KB

--- First 10 Rows of Core Cleaned Data for Review ---
       Sex  Age Region of detection Last result of HCS (AntiHCV)  \
0  Unknown   38              BATKEN                     Negative   
1  Unknown   23              BATKEN                     Negative   
2  Unknown   58              BATKEN                     Negative   
3  Unknown   19              BATKEN                     Negative   
4  Unknown   43              BATKEN                     Negative   
5  Unknown   32              BATKEN    

In [12]:
# --- Part A: Summary of HIV and Co-infection Status ---
print("\n--- HIV and Co-infection Status Summary ---")

# All patients are HIV positive, so we confirm the count.
# We can assume all 49 rows represent HIV positive patients as per your confirmation.
total_hiv_patients = len(df)
print(f"Total HIV Positive Patients in Sample: {total_hiv_patients}")

# Check Hepatitis C co-infection
hcv_coinfection_counts = df['Last result of HCS (AntiHCV)'].value_counts(dropna=False)
print("\nHepatitis C (HCV) Co-infection Status:")
print(hcv_coinfection_counts)
print(f"  - Positive: {hcv_coinfection_counts.get('Positive', 0)} ({hcv_coinfection_counts.get('Positive', 0)/total_hiv_patients:.1%})")
print(f"  - Negative: {hcv_coinfection_counts.get('Negative', 0)} ({hcv_coinfection_counts.get('Negative', 0)/total_hiv_patients:.1%})")
print(f"  - Missing/Not Tested: {hcv_coinfection_counts.get('nan', 0)} ({hcv_coinfection_counts.get('nan', 0)/total_hiv_patients:.1%})")

# Check Hepatitis B co-infection
hbv_coinfection_counts = df['Last result of HBV test (HBsAg)'].value_counts(dropna=False)
print("\nHepatitis B (HBV) Co-infection Status:")
print(hbv_coinfection_counts)
print(f"  - Positive: {hbv_coinfection_counts.get('Positive', 0)} ({hbv_coinfection_counts.get('Positive', 0)/total_hiv_patients:.1%})")
print(f"  - Negative: {hbv_coinfection_counts.get('Negative', 0)} ({hbv_coinfection_counts.get('Negative', 0)/total_hiv_patients:.1%})")
print(f"  - Missing/Not Tested: {hbv_coinfection_counts.get('nan', 0)} ({hbv_coinfection_counts.get('nan', 0)/total_hiv_patients:.1%})")


# --- Part B: Demographics Summary ---
print("\n--- Demographics Summary ---")

print("\nSex Distribution:")
print(df['Sex'].value_counts(dropna=False))
print(f"  - Unknown: {df['Sex'].value_counts().get('Unknown', 0)} ({df['Sex'].value_counts().get('Unknown', 0)/total_hiv_patients:.1%})")
# (Note: All will be 'Unknown' for this sample based on previous output)

print("\nAge Distribution:")
print(df['Age'].describe()) # Basic statistics for age


# --- Part C: Treatment Status Summary ---
print("\n--- Treatment Status Summary ---")

print("\nART Started Status:")
art_started_counts = df['ART Started'].value_counts(dropna=False)
print(art_started_counts)
print(f"  - Started ART: {art_started_counts.get(True, 0)} ({art_started_counts.get(True, 0)/total_hiv_patients:.1%})")
print(f"  - Not Started ART: {art_started_counts.get(False, 0)} ({art_started_counts.get(False, 0)/total_hiv_patients:.1%})")
# (Note: All will be True for this sample based on previous output)


# --- Part D: Key Risk Factors Summary ---
print("\n--- Key Risk Factors Summary (Last 6 Months) ---")

print("\nInjecting Drug Use:")
print(df['D-examination:Injecting drug use in the last 6 months'].value_counts(dropna=False))

print("\nSexual Contacts:")
print(df['D-examination:Sexual contacts in the last 6 months'].value_counts(dropna=False))

print("\nCommercial Sex Services:")
print(df['D-examination:Provision of commercial sex services in the last 6 months'].value_counts(dropna=False))


--- HIV and Co-infection Status Summary ---
Total HIV Positive Patients in Sample: 49

Hepatitis C (HCV) Co-infection Status:
Last result of HCS (AntiHCV)
Negative    42
Positive     4
NaN          3
Name: count, dtype: int64
  - Positive: 4 (8.2%)
  - Negative: 42 (85.7%)
  - Missing/Not Tested: 0 (0.0%)

Hepatitis B (HBV) Co-infection Status:
Last result of HBV test (HBsAg)
Negative    44
NaN          3
Positive     2
Name: count, dtype: int64
  - Positive: 2 (4.1%)
  - Negative: 44 (89.8%)
  - Missing/Not Tested: 0 (0.0%)

--- Demographics Summary ---

Sex Distribution:
Sex
Unknown    49
Name: count, dtype: int64
  - Unknown: 49 (100.0%)

Age Distribution:
count    49.000000
mean     34.938776
std      13.955895
min       1.000000
25%      26.000000
50%      34.000000
75%      44.000000
max      65.000000
Name: Age, dtype: float64

--- Treatment Status Summary ---

ART Started Status:
ART Started
True    49
Name: count, dtype: int64
  - Started ART: 49 (100.0%)
  - Not Started ART:

In [13]:
# Define the path and filename for your cleaned data.
# It's good practice to add "_cleaned" to the filename.
output_file_path = 'Trial_Data_1-23_Cleaned.csv'

# Export the DataFrame to a new CSV file.
# index=False prevents pandas from writing the DataFrame index as a column in the CSV.
# encoding='utf-8' is a good standard for outputting clean text.
df.to_csv(output_file_path, index=False, encoding='utf-8')

print(f"\nCleaned data successfully exported to: {output_file_path}")
print("You can now open this CSV file and import it into Power BI.")


Cleaned data successfully exported to: Trial_Data_1-23_Cleaned.csv
You can now open this CSV file and import it into Power BI.


In [14]:
# --- Part A: Calculate Age at Initial Data Collection / Diagnosis ---
# We already have 'Age' as a numerical column, which is good.
# If we wanted to calculate age at a specific event date (e.g., date of first test), we could do:
# df['Age_at_Initial_Test'] = (df['Date of IB/PCR/ELISA'] - df['Date of birth']).dt.days / 365.25
# For simplicity with this sample, we'll primarily use the 'Age' column provided, assuming it's age at current data point.
# If 'Age' is age at diagnosis, that's even better. We'll proceed with 'Age' as is.

print("\n--- Feature Engineering: New Analytical Columns ---")

# --- Part B: Calculate Duration on ART (in Days) ---
# We'll use 'Date of ART initiation' and 'Date of the last VL result' as the end point
# (as it's often the latest recorded activity date).
df['ART_Duration_Days'] = (df['Date of the last VL result'] - df['Date of ART initiation']).dt.days
print(f"Calculated 'ART_Duration_Days'. First 5 values:\n{df['ART_Duration_Days'].head()}")


# --- Part C: Calculate Viral Load Change (Initial to Latest) ---
# First, ensure VL columns are numeric, coercing errors to NaN.
# 'First VL result (copies/ml)' is the initial VL, and 'Penultimate Viral Load (copies/ml)' or 'Latest Viral Load (copies/ml)'
# would be the latest. Let's use 'First VL result (copies/ml)' as initial and 'Penultimate Viral Load (copies/ml)' as latest for demonstration.
# Note: 'Initial Viral Load (copies/ml)' was the renamed Kyrgyz column, which is likely the *first* VL.
# So, let's use 'Initial Viral Load (copies/ml)' as initial and 'First VL result (copies/ml)' as latest if they represent different points.
# Given the names, 'Initial Viral Load (copies/ml)' and 'First VL result (copies/ml)' might be the same.
# Let's assume 'First VL result (copies/ml)' is indeed the first, and 'Date of the last VL result' implies a corresponding 'Last VL result (copies/ml)' column.
# From your full column list, you have 'First VL result (copies/ml)' and '????????????? ????????? ?? (?????/??)' (now 'Penultimate Viral Load (copies/ml)').
# Let's use 'First VL result (copies/ml)' as the initial, and 'Penultimate Viral Load (copies/ml)' as the latest available for change.

# Ensure columns are numeric
df['First VL result (copies/ml)'] = pd.to_numeric(df['First VL result (copies/ml)'], errors='coerce')
df['Penultimate Viral Load (copies/ml)'] = pd.to_numeric(df['Penultimate Viral Load (copies/ml)'], errors='coerce')

# Calculate VL change. Handle cases where values might be 0 or very small to avoid division by zero or large ratios.
# We'll calculate a simple difference for now.
df['VL_Change_Absolute'] = df['Penultimate Viral Load (copies/ml)'] - df['First VL result (copies/ml)']
# For percentage change, handle division by zero:
df['VL_Change_Percentage'] = (df['VL_Change_Absolute'] / df['First VL result (copies/ml)']).fillna(0) * 100 # Fill NaN with 0 for no change/missing

print(f"Calculated 'VL_Change_Absolute'. First 5 values:\n{df['VL_Change_Absolute'].head()}")
print(f"Calculated 'VL_Change_Percentage'. First 5 values:\n{df['VL_Change_Percentage'].head()}")


# --- Part D: Calculate CD4 Change (Initial to Latest) ---
# Ensure CD4 columns are numeric.
df['Initial CD4 Result (cells/ml)'] = pd.to_numeric(df['Initial CD4 Result (cells/ml)'], errors='coerce')
df['Latest CD4 Result (cells/ml)'] = pd.to_numeric(df['Latest CD4 Result (cells/ml)'], errors='coerce')

df['CD4_Change_Absolute'] = df['Latest CD4 Result (cells/ml)'] - df['Initial CD4 Result (cells/ml)']
df['CD4_Change_Percentage'] = (df['CD4_Change_Absolute'] / df['Initial CD4 Result (cells/ml)']).fillna(0) * 100 # Fill NaN with 0

print(f"Calculated 'CD4_Change_Absolute'. First 5 values:\n{df['CD4_Change_Absolute'].head()}")
print(f"Calculated 'CD4_Change_Percentage'. First 5 values:\n{df['CD4_Change_Percentage'].head()}")


# --- Part E: Create Combined Co-infection Status ---
# This will categorize patients based on their HIV status AND HCV/HBV co-infection.
# Remember, all patients are HIV positive.

def get_coinfection_status(row):
    hcv_positive = row['Last result of HCS (AntiHCV)'] == 'Positive'
    hbv_positive = row['Last result of HBV test (HBsAg)'] == 'Positive'

    if hcv_positive and hbv_positive:
        return 'HIV+HCV+HBV'
    elif hcv_positive:
        return 'HIV+HCV'
    elif hbv_positive:
        return 'HIV+HBV'
    else:
        return 'HIV Only' # Assuming negative for both HCV and HBV, or missing results mean not co-infected for this purpose

df['Co_infection_Status_Combined'] = df.apply(get_coinfection_status, axis=1)
print(f"Created 'Co_infection_Status_Combined'. Unique values:\n{df['Co_infection_Status_Combined'].unique()}")


# --- Part F: Display Descriptive Statistics for New Numerical Features ---
print("\n--- Descriptive Statistics for New Numerical Features ---")
print(df[['ART_Duration_Days', 'VL_Change_Absolute', 'VL_Change_Percentage',
          'CD4_Change_Absolute', 'CD4_Change_Percentage']].describe())

# --- Part G: Display Value Counts for New Categorical Features ---
print("\n--- Value Counts for New Categorical Features ---")
print(df['Co_infection_Status_Combined'].value_counts())

# --- Part H: Display First 5 Rows with New Features ---
print("\n--- First 5 Rows with Newly Engineered Features ---")
display_cols_new_features = [
    'Sex', 'Age', 'ART_Duration_Days', 'VL_Change_Absolute', 'VL_Change_Percentage',
    'CD4_Change_Absolute', 'CD4_Change_Percentage', 'Co_infection_Status_Combined'
]
existing_display_cols_new_features = [col for col in display_cols_new_features if col in df.columns]
print(df[existing_display_cols_new_features].head())


--- Feature Engineering: New Analytical Columns ---
Calculated 'ART_Duration_Days'. First 5 values:
0    0
1    0
2    0
3   -3
4    0
Name: ART_Duration_Days, dtype: int64
Calculated 'VL_Change_Absolute'. First 5 values:
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: VL_Change_Absolute, dtype: float64
Calculated 'VL_Change_Percentage'. First 5 values:
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: VL_Change_Percentage, dtype: float64
Calculated 'CD4_Change_Absolute'. First 5 values:
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: CD4_Change_Absolute, dtype: float64
Calculated 'CD4_Change_Percentage'. First 5 values:
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: CD4_Change_Percentage, dtype: float64
Created 'Co_infection_Status_Combined'. Unique values:
['HIV Only' 'HIV+HCV' 'HIV+HBV']

--- Descriptive Statistics for New Numerical Features ---
       ART_Duration_Days  VL_Change_Absolute  VL_Change_Percentage  \
count          49.000000        4.100000e+01             49.000000 

  sqr = _ensure_numeric((avg - values) ** 2)


In [15]:
import numpy as np # Import numpy for handling NaN and inf values

print("\n--- Refining Numerical Features ---")

# --- Part A: Handle Negative ART_Duration_Days ---
# Convert negative durations to 0 (assuming a quick follow-up or data entry error where VL date < ART initiation date)
df['ART_Duration_Days'] = df['ART_Duration_Days'].apply(lambda x: max(0, x) if pd.notna(x) else x)
print(f"Refined 'ART_Duration_Days'. First 5 values:\n{df['ART_Duration_Days'].head()}")
print(f"Min 'ART_Duration_Days' after refinement: {df['ART_Duration_Days'].min()}")


# --- Part B: Handle NaN and Inf in VL/CD4 Change Columns ---
# Replace infinite values with NaN, then fill all NaNs with 0 (assuming no change if data is missing)
# This is a simplification for a small sample; for full data, a more nuanced approach might be needed.

# VL Change
df['VL_Change_Absolute'] = df['VL_Change_Absolute'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['VL_Change_Percentage'] = df['VL_Change_Percentage'].replace([np.inf, -np.inf], np.nan).fillna(0)

# CD4 Change
df['CD4_Change_Absolute'] = df['CD4_Change_Absolute'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['CD4_Change_Percentage'] = df['CD4_Change_Percentage'].replace([np.inf, -np.inf], np.nan).fillna(0)

print("\n--- VL/CD4 Change Columns Refined (NaN/Inf handled) ---")
print(f"Refined 'VL_Change_Absolute'. First 5 values:\n{df['VL_Change_Absolute'].head()}")
print(f"Refined 'CD4_Change_Absolute'. First 5 values:\n{df['CD4_Change_Absolute'].head()}")


# --- Part C: Re-Export Cleaned Data with New Features ---
# Define the path and filename for your updated cleaned data.
# We'll overwrite the previous cleaned file with the new features.
output_file_path = 'Trial_Data_1-23_Cleaned.csv'

# Export the DataFrame to a new CSV file.
# index=False prevents pandas from writing the DataFrame index as a column in the CSV.
# encoding='utf-8' is a good standard for outputting clean text.
df.to_csv(output_file_path, index=False, encoding='utf-8')

print(f"\nCleaned data with new features successfully exported to: {output_file_path}")
print("You can now re-load this CSV file into Power BI to use the new features.")

# --- Part D: Display Final Data Info and Head ---
print("\n--- Final Data Info After All Feature Engineering and Refinements ---")
df.info()

print("\n--- First 5 Rows with All Engineered Features (Final Review) ---")
display_cols_final_features = [
    'Sex', 'Age', 'ART_Duration_Days', 'VL_Change_Absolute', 'VL_Change_Percentage',
    'CD4_Change_Absolute', 'CD4_Change_Percentage', 'Co_infection_Status_Combined'
]
existing_display_cols_final_features = [col for col in display_cols_final_features if col in df.columns]
print(df[existing_display_cols_final_features].head())


--- Refining Numerical Features ---
Refined 'ART_Duration_Days'. First 5 values:
0    0
1    0
2    0
3    0
4    0
Name: ART_Duration_Days, dtype: int64
Min 'ART_Duration_Days' after refinement: 0

--- VL/CD4 Change Columns Refined (NaN/Inf handled) ---
Refined 'VL_Change_Absolute'. First 5 values:
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: VL_Change_Absolute, dtype: float64
Refined 'CD4_Change_Absolute'. First 5 values:
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: CD4_Change_Absolute, dtype: float64

Cleaned data with new features successfully exported to: Trial_Data_1-23_Cleaned.csv
You can now re-load this CSV file into Power BI to use the new features.

--- Final Data Info After All Feature Engineering and Refinements ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Columns: 178 entries, Country of arrival (for foreigners) to Co_infection_Status_Combined
dtypes: bool(1), datetime64[ns](9), float64(35), int64(2), object(131)
memory usage: 67.

In [17]:
import pandas as pd
import os

# Define the file path for the uploaded file on your local machine
# IMPORTANT: Make sure this path is correct for where you saved the file.
file_path = 'Translated_Journal.xlsx - Sheet1.csv'

try:
    print("Attempting to load the new, full dataset from your local file path...")
    
    # Read the content directly from the file path
    # We'll use latin1 encoding again to handle any special characters
    df = pd.read_csv(file_path, encoding='latin1')

    print("Success! The full dataset has been loaded.")

except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    print("Please check that the file is in the same folder as this notebook or that the file path is correct.")
    # Stop execution if there is an error
    df = None

if df is not None:
    # --- Initial Inspection of the Full Dataset ---
    print("\n--- Full Dataset Info ---")
    df.info()

    print("\n--- First 5 Rows of the Full Dataset ---")
    print(df.head())

    print("\n--- All Column Names in the Full Dataset ---")
    for i, col in enumerate(df.columns):
        print(f"[{i+1}] {col}")

    # Check for unique values in some key columns to understand the new data
    print("\n--- Unique Values for Key Columns in Full Dataset ---")
    if 'Sex' in df.columns:
        print(f"\nUnique values in 'Sex': {df['Sex'].unique()}")
    if 'Diagnosis' in df.columns:
        print(f"\nUnique values in 'Diagnosis': {df['Diagnosis'].unique()}")
    if 'Route of transmission' in df.columns:
        print(f"\nUnique values in 'Route of transmission': {df['Route of transmission'].unique()}")
    if 'Last result of HCS (AntiHCV)' in df.columns:
        print(f"\nUnique values in 'Last result of HCS (AntiHCV)': {df['Last result of HCS (AntiHCV)'].unique()}")
    if 'Last result of HBV test (HBsAg)' in df.columns:
        print(f"\nUnique values in 'Last result of HBV test (HBsAg)': {df['Last result of HBV test (HBsAg)'].unique()}")
    if 'IB conclusion' in df.columns:
        print(f"\nUnique values in 'IB conclusion': {df['IB conclusion'].unique()}")



Attempting to load the new, full dataset from your local file path...
Error: File not found at Translated_Journal.xlsx - Sheet1.csv
Please check that the file is in the same folder as this notebook or that the file path is correct.
