Dataset Pre-processing

In [1]:
import pandas as pd
import numpy as np
import random
import os

In [2]:
# File paths

# Target and Raw Folder path
raw_file_dir = os.path.join('..', 'Raw_Datasets', 'RA_data')
target_dir = os.path.join('..', 'Final Datasets', 'RA_final')

# Working File path
snp_file = os.path.join(raw_file_dir, 'RA_top10000_cleaned.csv')
sample_id = os.path.join(raw_file_dir, 'sample_ids.txt')
pheno_siml = os.path.join(raw_file_dir, 'phenotype_RA_simulated_50_50.tsv')
geno_raw = os.path.join(raw_file_dir, 'RA_chr1_6_genotypes_raw.txt')
geno_header = os.path.join(raw_file_dir, 'RA_chr1_6_genotypes_with_header.tsv')
dosage = os.path.join(raw_file_dir, 'RA_chr1_6_dosage_matrix.csv')
pheno_align = os.path.join(raw_file_dir, 'phenotype_RA_aligned.csv')

# Final File path 
pheno_final_extended = os.path.join(target_dir, 'RA_aligned_phenotypes_final_extended.csv')
pheno_final_path = os.path.join(target_dir, 'RA_aligned_phenotypes_final.csv') 
geno_final_path = os.path.join(target_dir, 'RA_aligned_genotypes_final.csv')

In [3]:
samples = pd.read_csv(sample_id, header=None, names=["sample_id"])

# Create 50% labels
n = len(samples)
labels = np.array([1]*int(n/2) + [0]*int(n/2))
np.random.shuffle(labels)

samples["label"] = labels

samples.to_csv(pheno_siml, sep="\t", index=False)

In [4]:
# Creating the Genotypes file with header
with open(sample_id) as f:
    sample_ids = [line.strip() for line in f]

df = pd.read_csv(geno_raw, sep="\t", header=None)

column_names = ["CHROM", "POS"] + sample_ids
df.columns = column_names

df.to_csv(geno_header, sep="\t", index=False)

In [5]:
# Creating the Dosage Matrix
raw = pd.read_csv(geno_header, sep='\t')

with open(sample_id) as f:
    sample_ids = [line.strip() for line in f]

# Extract SNP info and genotypes
snp_ids = raw.iloc[:, 0].astype(str) + "_" + raw.iloc[:, 1].astype(str)
genotypes = raw.iloc[:, 2:]

def gt_to_dosage(gt):
    if gt == "0|0":
        return 0
    elif gt in ["0|1", "1|0"]:
        return 1
    elif gt == "1|1":
        return 2
    else:
        return np.nan 

# Apply conversion to each cell
dosage_matrix = genotypes.applymap(gt_to_dosage)

# Transpose: rows = samples
X = dosage_matrix.T
X.columns = snp_ids
X.index = sample_ids
X_filled = X.fillna(X.mean())  # Impute missing values with column (SNP) mean

X.to_csv(dosage)

  dosage_matrix = genotypes.applymap(gt_to_dosage)


In [6]:
# Creating phenotype file from dosage matrix
X = pd.read_csv(dosage, index_col=0)
y = pd.read_csv(pheno_siml, sep="\t")

y_matched = y[y["sample_id"].isin(X.index)].copy()
X_aligned = X.loc[y_matched["sample_id"]]

assert all(X_aligned.index == y_matched["sample_id"]), "Sample alignment mismatch!"

# Preview
print("Matrix shape:", X_aligned.shape)
print("Phenotype breakdown:\n", y_matched["label"].value_counts())
y_matched.to_csv(pheno_align, index=False)

Matrix shape: (2504, 1612)
Phenotype breakdown:
 label
0    1252
1    1252
Name: count, dtype: int64


In [7]:
# Creating Final files for genotype and phenotype
pheno = pd.read_csv(pheno_align, sep=",") 
dosage_matrix = pd.read_csv(dosage, index_col=0)

# Convert labels to integers (0=control, 1=case)
pheno["label"] = pheno["label"].astype(int)

# Aligning samples between phenotype and genotype data and getting the intersection of Sample IDs
common_samples = list(set(pheno["sample_id"]).intersection(dosage_matrix.index))

# Filter both datasets
pheno_aligned = pheno[pheno["sample_id"].isin(common_samples)].set_index("sample_id")
dosage_aligned = dosage_matrix.loc[common_samples]

# Check alignment
print(f"Original samples: Pheno={len(pheno)}, Geno={len(dosage_matrix)}")
print(f"Aligned samples: {len(common_samples)}")
print("\nPhenotype distribution:")
print(pheno_aligned["label"].value_counts())

# 4. Prepare X and y for model training
X = dosage_aligned.astype(float)  
y = pheno_aligned["label"].values  

# Optional: Save aligned datasets
pheno_aligned.to_csv(pheno_final_path)
X.to_csv(geno_final_path)

Original samples: Pheno=2504, Geno=2504
Aligned samples: 2504

Phenotype distribution:
label
0    1252
1    1252
Name: count, dtype: int64


In [8]:
# Removing unnamed column in genotype file
df = pd.read_csv(geno_final_path)

# Fix the index column
if 'Unnamed: 0' in df.columns:
    df = df.set_index('Unnamed: 0')
    df.index.name = None  

# Verify
print("Fixed DataFrame:")
print(df.head(1))

# Saving corrected file (overwrite original)
df.to_csv(geno_final_path, index=True)  

print("\nFile saved without duplicate index column!")

Fixed DataFrame:
         1_113869057  1_113910209  1_113914390  1_113919261  1_114119620  \
HG03237          0.0          0.0          0.0          0.0          0.0   

         1_114173410  1_114303808  1_114335788  1_114365745  1_114377568  ...  \
HG03237          2.0          0.0          0.0          0.0          2.0  ...   

         6_33349885  6_33389381  6_33403422  6_33408542  6_137897086  \
HG03237         1.0         1.0         1.0         1.0          0.0   

         6_137959235  6_137973068  6_138002175  6_138005515  6_138006504  
HG03237          0.0          0.0          0.0          0.0          0.0  

[1 rows x 1612 columns]

File saved without duplicate index column!


In [9]:
# Removing unnamed column in phenotype file
df = pd.read_csv(pheno_final_path)

# 2. Fix the index column
if 'Unnamed: 0' in df.columns:
    df = df.set_index('Unnamed: 0')
    df.index.name = None  

# Verify
print("Fixed DataFrame:")
print(df.head())

# Saving corrected file (overwrite original)
df.to_csv(pheno_final_path, index=False)  

print("\nFile saved without duplicate index column!")

Fixed DataFrame:
  sample_id  label
0   HG00096      0
1   HG00097      0
2   HG00099      0
3   HG00100      1
4   HG00101      0

File saved without duplicate index column!


In [10]:
# Creating the phenotype file with Age and Sex

phenos = pd.read_csv(pheno_final_path)
phenos["age"] = [random.randint(40,70) for _ in range(len(phenos))]
phenos["sex"] = [random.choice(["M","F"]) for _ in range(len(phenos))]
phenos.to_csv(pheno_final_extended, index=False)
print("Extended phenotype file saved")

Extended phenotype file saved


In [11]:
# Final Verification of all files

genotypes = pd.read_csv(geno_final_path, index_col=0)
phenotypes = pd.read_csv(pheno_final_path)

# Basic phenotype checks
print("=== Phenotype Data Validation ===")
print(f"Total samples: {len(phenotypes)}")
print("\nFirst 5 rows:")
print(phenotypes.head())

print("\nLabel distribution:")
print(phenotypes['label'].value_counts())

# Checking for common issues
print("\n=== Quality Checks ===")
print(f"Missing labels: {phenotypes['label'].isna().sum()}")
print(f"Duplicate sample IDs: {phenotypes['sample_id'].duplicated().sum()}")
print(f"Non-binary labels: {set(phenotypes['label']) - {0, 1}}") 

# Verify alignment with genotypes
common_samples = set(phenotypes['sample_id']).intersection(genotypes.index)
print(f"\nSamples in both files: {len(common_samples)}/{len(phenotypes)}")

mismatched = set(phenotypes['sample_id']).symmetric_difference(genotypes.index)
if mismatched:
    print("\nWarning: Mismatched sample IDs (first 5):")
    print(list(mismatched)[:5])
else:
    print("\nAll sample IDs match perfectly!")

if len(mismatched) > 0:
    print("\nCleaning sample IDs...")
    phenotypes['sample_id'] = phenotypes['sample_id'].str.strip().str.upper()
    genotypes.index = genotypes.index.str.strip().str.upper()
    
    common_samples = set(phenotypes['sample_id']).intersection(genotypes.index)
    phenotypes = phenotypes[phenotypes['sample_id'].isin(common_samples)]
    genotypes = genotypes.loc[common_samples]
    
    print(f"Final aligned samples: {len(common_samples)}")
    
    phenotypes.to_csv(pheno_final_path, index=False)
    genotypes.to_csv(geno_final_path, index=True)
    print("Saved cleaned files!")

=== Phenotype Data Validation ===
Total samples: 2504

First 5 rows:
  sample_id  label
0   HG00096      0
1   HG00097      0
2   HG00099      0
3   HG00100      1
4   HG00101      0

Label distribution:
label
0    1252
1    1252
Name: count, dtype: int64

=== Quality Checks ===
Missing labels: 0
Duplicate sample IDs: 0
Non-binary labels: set()

Samples in both files: 2504/2504

All sample IDs match perfectly!
