In [1]:
import pandas as pd
import numpy as np
import os
from google.colab import drive
import matplotlib.pyplot as plt

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Load the PKL file from Drive

# Define the path to your saved PKL file
pkl_path = '/content/drive/MyDrive/Dataset/final_dataset/final_merged_all_7quarters.pkl'

In [4]:
# Check if file exists
if os.path.exists(pkl_path):
    print(f"Found PKL file at: {pkl_path}")
    file_size = os.path.getsize(pkl_path) / (1024**2)
    print(f"   File size: {file_size:.2f} MB")
else:
    print(f"File not found at: {pkl_path}")
    print("Please check the path!")

Found PKL file at: /content/drive/MyDrive/Dataset/final_dataset/final_merged_all_7quarters.pkl
   File size: 719.92 MB


In [5]:
# Load the dataset
print("\nLoading dataset...")
df = pd.read_pickle(pkl_path)

print(f"\nDataset loaded successfully!")
print(f"   Shape: {df.shape}")
print(f"   Cases: {df.shape[0]:,}")
print(f"   Features: {df.shape[1]}")
print(f"   Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


Loading dataset...

Dataset loaded successfully!
   Shape: (2848079, 21)
   Cases: 2,848,079
   Features: 21
   Memory usage: 2083.81 MB


In [6]:
df.head(3)

Unnamed: 0,primaryid,age_years,age_group,sex_clean,occr_country,is_elderly,is_pediatric,suspect_drugs,num_suspect_drugs,polypharmacy_category,...,num_concom_drugs,all_reactions,num_reactions,has_serious_reaction,reaction_severity,outcome_codes,is_serious_outcome,outcome_descriptions,indications,num_indications
0,1001678125,56.0,middle_age,Female,CA,0,0,"[SANDOSTATIN LAR DEPOT, SANDOSTATIN]",2.0,dual_therapy,...,11.0,"[BLOOD CREATINE INCREASED, FALL, SINUSITIS, SK...",76.0,True,extreme,[OT],False,[Other Serious (Important Medical Event)],[NEUROENDOCRINE TUMOUR],1.0
1,1002872124,57.0,middle_age,Female,CA,0,0,"[SANDOSTATIN LAR DEPOT, AFINITOR, SANDOSTATIN]",3.0,moderate_poly,...,4.0,"[PNEUMONITIS, ABDOMINAL PAIN UPPER, CARCINOID ...",24.0,True,extreme,"[OT, HO]",True,"[Other Serious (Important Medical Event), Hosp...","[PANCREATIC NEUROENDOCRINE TUMOUR, CARCINOID T...",2.0
2,100293663,32.0,adult,Male,AU,0,0,"[CYCLOSPORINE, BASILIXIMAB, MYCOPHENOLATE MOFE...",4.0,moderate_poly,...,,"[STAPHYLOCOCCAL INFECTION, MYCOBACTERIUM HAEMO...",4.0,False,moderate,[OT],False,[Other Serious (Important Medical Event)],"[RENAL TRANSPLANT, IMMUNOSUPPRESSANT DRUG THER...",2.0


In [7]:
# Calculate missing values
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isnull().sum(),
    'Missing_Percentage': (df.isnull().sum() / len(df)) * 100,
    'Data_Type': df.dtypes
})

In [8]:
# Sort by missing percentage
missing_data = missing_data.sort_values('Missing_Percentage', ascending=False)

# Display all columns with their missing values
for idx, row in missing_data.iterrows():
    if row['Missing_Count'] > 0:
        print(f"{row['Column']:30s}: {row['Missing_Count']:10,} ({row['Missing_Percentage']:6.2f}%)")
    else:
        print(f"{row['Column']:30s}: No missing values âœ“")

num_concom_drugs              :  2,074,686 ( 72.85%)
concomitant_drugs             :  2,074,686 ( 72.85%)
is_serious_outcome            :  1,259,063 ( 44.21%)
outcome_codes                 :  1,259,063 ( 44.21%)
outcome_descriptions          :  1,259,063 ( 44.21%)
age_group                     :  1,138,325 ( 39.97%)
age_years                     :  1,138,287 ( 39.97%)
num_indications               :    753,306 ( 26.45%)
indications                   :    753,306 ( 26.45%)
occr_country                  :    171,768 (  6.03%)
reaction_severity             :        137 (  0.00%)
polypharmacy_category         :         37 (  0.00%)
primaryid                     : No missing values âœ“
sex_clean                     : No missing values âœ“
all_reactions                 : No missing values âœ“
suspect_drugs                 : No missing values âœ“
num_suspect_drugs             : No missing values âœ“
is_elderly                    : No missing values âœ“
is_pediatric                  : No missi

In [9]:
# 2. Summary Statistics
print("\n2. SUMMARY STATISTICS:")
print("-"*50)
total_cells = len(df) * len(df.columns)
total_missing = df.isnull().sum().sum()

print(f"Total cells in dataset: {total_cells:,}")
print(f"Total missing values: {total_missing:,}")
print(f"Overall missing percentage: {(total_missing/total_cells)*100:.2f}%")


2. SUMMARY STATISTICS:
--------------------------------------------------
Total cells in dataset: 59,809,659
Total missing values: 11,881,727
Overall missing percentage: 19.87%


###Preprocess the dataset

In [10]:
# Check data types

print("\nColumn Data Types:")
print("-"*50)
for col in df.columns:
    dtype = df[col].dtype
    if dtype == 'object':
        # Check if it's a list column
        sample_val = df[col].dropna().iloc[0] if df[col].notna().any() else None
        if isinstance(sample_val, list):
            print(f"{col:30s}: List")
        else:
            print(f"{col:30s}: {dtype}")
    else:
        print(f"{col:30s}: {dtype}")


Column Data Types:
--------------------------------------------------
primaryid                     : int64
age_years                     : float64
age_group                     : category
sex_clean                     : object
occr_country                  : object
is_elderly                    : int64
is_pediatric                  : int64
suspect_drugs                 : List
num_suspect_drugs             : float64
polypharmacy_category         : category
concomitant_drugs             : List
num_concom_drugs              : float64
all_reactions                 : List
num_reactions                 : float64
has_serious_reaction          : object
reaction_severity             : category
outcome_codes                 : List
is_serious_outcome            : object
outcome_descriptions          : List
indications                   : List
num_indications               : float64


In [11]:
# Convert to proper binary
df['has_serious_reaction'] = df['has_serious_reaction'].astype(int) #It should not be object
df['is_serious_outcome'] = df['is_serious_outcome'].fillna(0).astype(int) #same here

In [12]:
# Check data types
print("\nColumn Data Types:")
print("-"*50)
for col in df.columns:
    dtype = df[col].dtype
    if dtype == 'object':
        # Check if it's a list column
        sample_val = df[col].dropna().iloc[0] if df[col].notna().any() else None
        if isinstance(sample_val, list):
            print(f"{col:30s}: List")
        else:
            print(f"{col:30s}: {dtype}")
    else:
        print(f"{col:30s}: {dtype}")


Column Data Types:
--------------------------------------------------
primaryid                     : int64
age_years                     : float64
age_group                     : category
sex_clean                     : object
occr_country                  : object
is_elderly                    : int64
is_pediatric                  : int64
suspect_drugs                 : List
num_suspect_drugs             : float64
polypharmacy_category         : category
concomitant_drugs             : List
num_concom_drugs              : float64
all_reactions                 : List
num_reactions                 : float64
has_serious_reaction          : int64
reaction_severity             : category
outcome_codes                 : List
is_serious_outcome            : int64
outcome_descriptions          : List
indications                   : List
num_indications               : float64


In [13]:
# Check for duplicate primary IDs


duplicates = df['primaryid'].duplicated().sum()
print(f"Duplicate primaryids: {duplicates:,}")

if duplicates > 0:
    print(f"Found {duplicates} duplicate records!")
    # Show some duplicate examples
    dup_ids = df[df['primaryid'].duplicated()]['primaryid'].head()
    print(f"Example duplicate IDs: {dup_ids.tolist()}")
else:
    print("No duplicate primaryids found!")

# Check for complete row duplicates
complete_duplicates = df.duplicated().sum()
print(f"Complete row duplicates: {complete_duplicates:,}")

Duplicate primaryids: 0
No duplicate primaryids found!


TypeError: unhashable type: 'list'

In [14]:
# Note: Cannot check for complete row duplicates using df.duplicated()
# because the dataset contains list columns (suspect_drugs, all_reactions, etc.)
# which are unhashable and cannot be compared directly.
# Since primaryid is unique, complete row duplicates are impossible anyway.

In [15]:
##OUTLIER DETECTION

# Age outliers
if df['age_years'].notna().any():
    age_stats = df['age_years'].describe()
    print("\nAge Distribution:")
    print(f"  Min: {age_stats['min']:.1f} years")
    print(f"  Max: {age_stats['max']:.1f} years")
    print(f"  Mean: {age_stats['mean']:.1f} years")
    print(f"  Median: {age_stats['50%']:.1f} years")

    # Check for impossible ages
    if age_stats['min'] < 0:
        print(f" Negative ages found: {(df['age_years'] < 0).sum()}")
    if age_stats['max'] > 120:
        print(f" Ages >120 found: {(df['age_years'] > 120).sum()}")

# Drug count outliers
print(f"\nDrug Count Distribution:")
print(f"  Min drugs: {df['num_suspect_drugs'].min()}")
print(f"  Max drugs: {df['num_suspect_drugs'].max()}")
print(f"  Mean: {df['num_suspect_drugs'].mean():.2f}")
extreme_poly = (df['num_suspect_drugs'] > 20).sum()
if extreme_poly > 0:
    print(f" Cases with >20 drugs: {extreme_poly:,}")

# Reaction count outliers
print(f"\nReaction Count Distribution:")
print(f"  Min reactions: {df['num_reactions'].min()}")
print(f"  Max reactions: {df['num_reactions'].max()}")
print(f"  Mean: {df['num_reactions'].mean():.2f}")
extreme_reactions = (df['num_reactions'] > 50).sum()
if extreme_reactions > 0:
    print(f"  Cases with >50 reactions: {extreme_reactions:,}")


Age Distribution:
  Min: 0.0 years
  Max: 8000.0 years
  Mean: 54.7 years
  Median: 59.0 years
 Ages >120 found: 46

Drug Count Distribution:
  Min drugs: 1.0
  Max drugs: 210.0
  Mean: 1.69
 Cases with >20 drugs: 13,909

Reaction Count Distribution:
  Min reactions: 1.0
  Max reactions: 252.0
  Mean: 3.48
  Cases with >50 reactions: 6,633


In [16]:
# Drop rows with age > 120
print(f"Before: {len(df):,} total cases")
print(f"Dropping: {(df['age_years'] > 120).sum()} cases with age > 120")

df = df[(df['age_years'] <= 120) | (df['age_years'].isna())]

print(f"After: {len(df):,} total cases")

Before: 2,848,079 total cases
Dropping: 46 cases with age > 120
After: 2,848,033 total cases


In [17]:
# For num_suspect_drugs column let Just KEEP these cases as-is
#Let association mining handle them naturally
#High drug counts will create their own patterns if significant

In [18]:
#For num_reactions column -> these are likely real cascading effects - keeping them.

In [19]:
##Handling missing values

#concomitant_drugs: Non-suspect drugs (not causing adverse events)
#num_concom_drugs: Count of these drugs
#73% missing means 73% of patients had NO concomitant drugs recorded

# remove concomitant drug columns as using any other approach like putting 0 for them can give false signal during analysis.
df_clean = df.drop(['concomitant_drugs', 'num_concom_drugs'], axis=1)

In [20]:
df_clean.shape

(2848033, 19)

In [21]:
#Handling "polypharmacy_category", "reaction_severity". columns

# Check how many rows will be affected
print("Before removing rows:")
print(f"Total cases: {len(df_clean):,}")
print(f"Missing polypharmacy_category: {df_clean['polypharmacy_category'].isna().sum()}")
print(f"Missing reaction_severity: {df_clean['reaction_severity'].isna().sum()}")

# Remove rows where either column is null
df_clean = df_clean.dropna(subset=['polypharmacy_category', 'reaction_severity'])

# Show results
print(f"\nAfter removing rows:")
print(df_clean.shape)

# Verify no missing values in these columns
print(f"\nVerification:")
print(f"Missing polypharmacy_category: {df_clean['polypharmacy_category'].isna().sum()}")
print(f"Missing reaction_severity: {df_clean['reaction_severity'].isna().sum()}")

Before removing rows:
Total cases: 2,848,033
Missing polypharmacy_category: 37
Missing reaction_severity: 137

After removing rows:
(2847862, 19)

Verification:
Missing polypharmacy_category: 0
Missing reaction_severity: 0


In [22]:
# Drop num_indications column

df_clean = df_clean.drop(['num_indications'], axis=1)
print(f"After dropping: {df_clean.shape}")

After dropping: (2847862, 18)


In [23]:
# Convert NaN / wrong types to empty list [] for indications  column
df_clean["indications"] = df_clean["indications"].apply(
    lambda x: x if isinstance(x, list) else []
)

df_clean.shape

(2847862, 18)

In [24]:
# HANDLE MISSING VALUES IN OUTCOME COLUMNS

# Replace missing lists with empty lists
list_cols = ["outcome_codes", "outcome_descriptions"]

for col in list_cols:
    df_clean[col] = df_clean[col].apply(lambda x: x if isinstance(x, list) else [])

# Replace missing is_serious_outcome with False
df_clean["is_serious_outcome"] = df_clean["is_serious_outcome"].fillna(False)

print(df_clean.shape)

(2847862, 18)


In [25]:
#Handling age_years column

# Replace missing values with -1
df_clean['age_years'] = df_clean['age_years'].fillna(-1)

# Create missing indicator
df_clean['age_years_missing'] = df_clean['age_years'].isna().astype(int)

In [26]:
#This approach keeps all rows (important because age_years has ~40% missing) and preserves the information that the value was originally missing. Replacing missing values
#with -1 gives algorithms (clustering, PCA, association rules) a valid numeric placeholder so they can run without errors. The missing-indicator column
#(age_years_missing) acts as an extra feature that tells the model whether the value was missing, preventing distortion and allowing the model to learn patterns related
#to missingness. This combination is simple, avoids bias from mean/median imputation, and works well for mixed tasks like association mining, clustering, and
#dimensionality reduction.

In [27]:
# Handing age_group column

# Ensure it is categorical
df_clean['age_group'] = df_clean['age_group'].astype('category')

# Add the new category "unknown" safely
df_clean['age_group'] = df_clean['age_group'].cat.add_categories(['unknown'])

# Now fill missing values
df_clean['age_group'] = df_clean['age_group'].fillna('unknown')

In [28]:
# Ensure occr_country is string (not categorical)
df_clean["occr_country"] = df_clean["occr_country"].astype("string")

# Fill missing values
df_clean["occr_country"] = df_clean["occr_country"].fillna("UNKNOWN")

In [29]:
# Check data types
print("\nColumn Data Types:")
print("-"*50)
for col in df.columns:
    dtype = df[col].dtype
    if dtype == 'object':
        # Check if it's a list column
        sample_val = df[col].dropna().iloc[0] if df[col].notna().any() else None
        if isinstance(sample_val, list):
            print(f"{col:30s}: List")
        else:
            print(f"{col:30s}: {dtype}")
    else:
        print(f"{col:30s}: {dtype}")


Column Data Types:
--------------------------------------------------
primaryid                     : int64
age_years                     : float64
age_group                     : category
sex_clean                     : object
occr_country                  : object
is_elderly                    : int64
is_pediatric                  : int64
suspect_drugs                 : List
num_suspect_drugs             : float64
polypharmacy_category         : category
concomitant_drugs             : List
num_concom_drugs              : float64
all_reactions                 : List
num_reactions                 : float64
has_serious_reaction          : int64
reaction_severity             : category
outcome_codes                 : List
is_serious_outcome            : int64
outcome_descriptions          : List
indications                   : List
num_indications               : float64


In [30]:
df_clean.shape

(2847862, 19)

In [31]:
df_clean.head()

Unnamed: 0,primaryid,age_years,age_group,sex_clean,occr_country,is_elderly,is_pediatric,suspect_drugs,num_suspect_drugs,polypharmacy_category,all_reactions,num_reactions,has_serious_reaction,reaction_severity,outcome_codes,is_serious_outcome,outcome_descriptions,indications,age_years_missing
0,1001678125,56.0,middle_age,Female,CA,0,0,"[SANDOSTATIN LAR DEPOT, SANDOSTATIN]",2.0,dual_therapy,"[BLOOD CREATINE INCREASED, FALL, SINUSITIS, SK...",76.0,1,extreme,[OT],0,[Other Serious (Important Medical Event)],[NEUROENDOCRINE TUMOUR],0
1,1002872124,57.0,middle_age,Female,CA,0,0,"[SANDOSTATIN LAR DEPOT, AFINITOR, SANDOSTATIN]",3.0,moderate_poly,"[PNEUMONITIS, ABDOMINAL PAIN UPPER, CARCINOID ...",24.0,1,extreme,"[OT, HO]",1,"[Other Serious (Important Medical Event), Hosp...","[PANCREATIC NEUROENDOCRINE TUMOUR, CARCINOID T...",0
2,100293663,32.0,adult,Male,AU,0,0,"[CYCLOSPORINE, BASILIXIMAB, MYCOPHENOLATE MOFE...",4.0,moderate_poly,"[STAPHYLOCOCCAL INFECTION, MYCOBACTERIUM HAEMO...",4.0,0,moderate,[OT],0,[Other Serious (Important Medical Event)],"[RENAL TRANSPLANT, IMMUNOSUPPRESSANT DRUG THER...",0
3,1005450710,68.0,elderly,Female,US,1,0,"[ENBREL, METHOTREXATE SODIUM]",2.0,dual_therapy,"[DRUG HYPERSENSITIVITY, DRUG ERUPTION]",2.0,0,few,[],0,[],[],0
4,1005762118,57.0,middle_age,Male,CA,0,0,"[EXELON, XOLAIR]",2.0,dual_therapy,"[PHOTOSENSITIVITY REACTION, PARKINSON'S DISEAS...",32.0,0,extreme,[HO],1,[Hospitalization - Initial or Prolonged],"[ANTIBIOTIC PROPHYLAXIS, ASTHMA]",0


#Save the final dataset

In [32]:
# Define save path
save_path = '/content/drive/MyDrive/Dataset/final_dataset/'
os.makedirs(save_path, exist_ok=True)

# Save the cleaned dataset with new name
final_file_path = f'{save_path}Final Dataset.pkl'

print("="*60)
print("SAVING PREPROCESSED DATASET")
print("="*60)

# Save to pickle
df_clean.to_pickle(final_file_path)

# Verify save
file_size = os.path.getsize(final_file_path) / (1024**2)
print(f"âœ… Dataset saved successfully!")
print(f"\nðŸ“Š Final Dataset Info:")
print(f"   Path: {final_file_path}")
print(f"   Size: {file_size:.2f} MB")
print(f"   Shape: {df_clean.shape}")
print(f"   Cases: {df_clean.shape[0]:,}")
print(f"   Features: {df_clean.shape[1]}")

SAVING PREPROCESSED DATASET
âœ… Dataset saved successfully!

ðŸ“Š Final Dataset Info:
   Path: /content/drive/MyDrive/Dataset/final_dataset/Final Dataset.pkl
   Size: 618.87 MB
   Shape: (2847862, 19)
   Cases: 2,847,862
   Features: 19
