In [None]:
import pandas as pd
import zipfile
import io
import os
from google.colab import files

# Path of the zip archive that holds the test CSV files.
# NOTE(review): `uploaded_file_name` is not defined in this cell; it must already
# exist in the kernel (presumably set by an earlier upload cell) — confirm.
zip_path_test = uploaded_file_name

# Archive members to load. Each name must match an archive entry exactly
# (bare file name, no folder prefix).
csv_filenames_test = [
    'df_volume_test1.csv',
    'df_generics_test1.csv',
    'df_medicine_info_test1.csv'
]

# Maps each file name with '.csv' removed to the DataFrame read from it.
dataframes_test = {}
try:
    with zipfile.ZipFile(zip_path_test, 'r') as z:
        archive_members = z.namelist()
        print(f"\nFiles found inside {zip_path_test}: {archive_members}")

        # Check every requested name up front so that ALL missing files are
        # reported at once, by name. Relying on z.open() would stop at the first
        # missing member and raise a KeyError whose argument is already a full
        # sentence, which garbles the message printed in the handler below.
        missing_files = [name for name in csv_filenames_test if name not in archive_members]
        if missing_files:
            raise KeyError(', '.join(missing_files))

        for short_name in csv_filenames_test:
            with z.open(short_name) as f:
                # read_csv accepts the open binary member directly; there is no
                # need to copy the whole file into an intermediate BytesIO.
                dataframes_test[short_name.replace('.csv', '')] = pd.read_csv(f)

    df_volume_test = dataframes_test['df_volume_test1']
    df_generics_test = dataframes_test['df_generics_test1']
    df_medicine_test = dataframes_test['df_medicine_info_test1']

    print("\nTest data files loaded successfully.")

except KeyError as e:
    # e.args[0] is the plain text; formatting `e` itself would print its repr
    # (with surrounding quotes).
    print(f"\n KEY ERROR: Not found inside the zip: {e.args[0]}")
    print("Please check the names printed above. If they are different, update 'csv_filenames_test'.")
    raise
except Exception as e:
    # Any other failure (e.g. unreadable archive, CSV parse error) is reported
    # and re-raised so the cell still fails visibly.
    print(f"\n General Error loading test data: {e}.")
    raise

# Join keys: the time-series tables are keyed per month, the static table per
# country/brand pair.
KEYS_TIME_SERIES = ['country', 'brand_name', 'months_postgx']
KEYS_STATIC = ['country', 'brand_name']

# Step A: attach the generics columns to every volume row. A left join keeps
# each volume row even when no generics row shares its keys.
df_merged_ts_test = df_volume_test.merge(
    df_generics_test,
    how='left',
    on=KEYS_TIME_SERIES,
)

# Step B: keep only the first medicine-info row per country/brand pair so the
# next join cannot multiply rows through duplicated static keys.
df_medicine_clean_test = df_medicine_test.drop_duplicates(subset=KEYS_STATIC).copy()

# Step C: attach the static medicine-info columns, again as a left join.
df_test_full = df_merged_ts_test.merge(
    df_medicine_clean_test,
    how='left',
    on=KEYS_STATIC,
)

# Report the size after each merge stage and preview the result.
ts_row_count = df_merged_ts_test.shape[0]
print(f"Rows after Time-Series Merge (Volume + Generics): {ts_row_count}")
print(f"Final Merged Test Data Shape: {df_test_full.shape}")
print("Final Merged Test Data Sample:")
print(df_test_full.head())

In [None]:
# Name of the CSV file that receives the merged test data.
submission_feature_filename = 'novartis_test_data_merged_full.csv'

# Write the merged frame to disk; index=False keeps the row index out of the file.
df_test_full.to_csv(path_or_buf=submission_feature_filename, index=False)

# Pass the written file to google.colab's `files.download`.
files.download(submission_feature_filename)

print(
    f"\n File saved successfully and download initiated: {submission_feature_filename}"
)