In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
import os

# Annotation and Scaling

Step 1: Parsing the Phenotype File

In [None]:
def parse_phenotype_file(filepath):
    """Read the tab-separated phenotype file into a per-patient lookup.

    The file is read without a header row using the windows-1252 encoding and
    its columns are named Patient_ID, Disease and Position.

    Args:
        filepath: Path to the phenotype file.

    Returns:
        dict mapping each Patient_ID to a ``(Disease, Position)`` tuple. When a
        Patient_ID occurs on several rows, the last row wins.

    NOTE(review): pandas' default NA parsing turns cells such as "NA" or an
    empty field into NaN, so tuple members may be NaN rather than the string
    'NA' -- confirm that downstream comparisons expect this.
    """
    table = pd.read_csv(filepath, delimiter='\t', encoding='windows-1252', header=None)
    table.columns = ['Patient_ID', 'Disease', 'Position']

    # Keep both Disease and Position for every patient.
    phenotype_dict = {}
    for _, record in table.iterrows():
        phenotype_dict[record['Patient_ID']] = (record['Disease'], record['Position'])
    return phenotype_dict

# Load the patient -> (disease, position) lookup used to annotate each VCF.
phenotype_file_path = '/mnt/sdb/phenotype-pseudon.txt'
phenotype_data = parse_phenotype_file(phenotype_file_path)
# Show only a size summary: printing the whole mapping floods the cell output
# and stores every patient's phenotype in the saved notebook.
print(f"Loaded phenotype data for {len(phenotype_data)} patients")


We load every VCF file, regardless of whether it is listed in the globals (phenotype) file, and annotate each one with the phenotype data found in the globals file.

In [None]:
def export_to_pandas_and_scale(mt, destination, vcf_filename, phenotype_info):
    """Export the row annotations of one MatrixTable to an encoded, scaled CSV.

    The CSV is written to ``<destination>/<group>-csv/<vcf stem>.csv`` where
    ``group`` comes from ``classify_disease``.

    Args:
        mt: Hail MatrixTable; expected to carry a 'vep' row field plus the row
            fields IMPACT, SYMBOL, HGNC_ID and MAX_AF used below.
        destination: Directory of the patient's group; the CSV folder is
            created inside it.
        vcf_filename: Base name of the source VCF ('.vcf' or '.vcf.gz').
        phenotype_info: ``(disease, position)`` tuple for the patient.

    Returns:
        Path of the CSV file that was written.

    Note:
        The ordinal encoder and the min-max scaler are fitted on this one
        file only, so SYMBOL/HGNC_ID codes and MAX_AF scaling are specific to
        each exported CSV.
    """
    disease, position = phenotype_info
    group = classify_disease(disease, position)

    csv_destination = os.path.join(destination, group + '-csv')  # Append '-csv' to the group folder
    os.makedirs(csv_destination, exist_ok=True)

    # Strip the VCF extension explicitly. A plain replace('.vcf', '.csv') would
    # turn 'x.vcf.gz' into 'x.csv.gz', which pandas writes gzip-compressed and
    # which a '*.csv' glob does not match.
    if vcf_filename.endswith('.vcf.gz'):
        stem = vcf_filename[:-len('.vcf.gz')]
    elif vcf_filename.endswith('.vcf'):
        stem = vcf_filename[:-len('.vcf')]
    else:
        stem = vcf_filename
    save_path = os.path.join(csv_destination, stem + '.csv')

    mt = mt.drop('vep')  # Drop the 'vep' struct to simplify the DataFrame export
    df = mt.rows().flatten().to_pandas()  # Convert to pandas DataFrame

    # The phenotype position is constant for every row of this patient's file.
    df['phenotype_position'] = position

    # Map the impact category to an integer; values outside the mapping become NaN.
    impact_mapping = {'HIGH': 0, 'MODERATE': 1, 'LOW': 2, 'MODIFIER': 3}
    df['IMPACT'] = df['IMPACT'].map(impact_mapping)

    # Ordinal encoding for gene symbols and gene IDs
    encoder = OrdinalEncoder()
    df[['SYMBOL', 'HGNC_ID']] = encoder.fit_transform(df[['SYMBOL', 'HGNC_ID']])

    # Scale the allele frequency into [0, 1] relative to this file's min/max
    scaler = MinMaxScaler()
    df['MAX_AF'] = scaler.fit_transform(df[['MAX_AF']])

    # Select and order the columns that make up the exported feature table
    final_columns = ['IMPACT', 'SYMBOL', 'HGNC_ID', 'MAX_AF', 'phenotype_position']
    df_final = df[final_columns]

    df_final.to_csv(save_path, index=False)  # Save the DataFrame to a CSV file
    print(f"Exported {save_path}")
    return save_path  # Callers record this path in their log


In [None]:
def classify_disease(disease, position):
    """Assign a patient to one of the three study groups.

    Args:
        disease: Disease code of the patient ('TERVE', 'RV', or anything else).
        position: Phenotype position; the string 'NA', ``None`` or a float NaN
            all count as "no position".

    Returns:
        'positive-group-terve' for disease 'TERVE', 'positive-group-RV' for
        disease 'RV' with a known position, otherwise 'negative-group'.
    """
    if disease == 'TERVE':
        return 'positive-group-terve'

    # pandas parses "NA" cells as float NaN by default, so checking only for the
    # string 'NA' would let RV patients without a position into the RV group.
    position_missing = (
        position is None
        or (isinstance(position, float) and position != position)  # NaN != NaN
        or position == 'NA'
    )
    if disease == 'RV' and not position_missing:
        return 'positive-group-RV'
    return 'negative-group'

In [None]:
import hail as hl

def _collect_vcf_files(source):
    """Return the VCF paths named by ``source`` (a directory or one VCF file)."""
    vcf_suffixes = ('.vcf', '.vcf.gz')
    if os.path.isdir(source):
        return [os.path.join(source, f) for f in os.listdir(source) if f.endswith(vcf_suffixes)]
    if os.path.isfile(source) and source.endswith(vcf_suffixes):
        return [source]
    raise ValueError("Invalid path or file type. Must be a directory or a VCF file.")


def vcfs_to_matrixtable(source, phenotype_data, vep_config_path, base_destination, write=True, export_to_pandas=True, log_file='processed_vcfs_log.txt'):
    """Import VCFs into Hail, annotate them with VEP and phenotype data, and export them.

    For every VCF the patient code is taken from the file name (text before
    the first '_'), the patient is classified with ``classify_disease`` and the
    outputs are placed under ``<base_destination>/<group>/``. A VCF whose
    MatrixTable path already exists is skipped.

    Args:
        source: Directory containing VCFs, or a single '.vcf' / '.vcf.gz' file.
        phenotype_data: dict mapping patient code to ``(disease, position)``;
            unknown patients default to ``("NA", "NA")``.
        vep_config_path: Path to the VEP configuration passed to ``hl.vep``.
        base_destination: Root output directory.
        write: When True, the annotated MatrixTable is written to disk.
        export_to_pandas: When True, the rows are exported to a scaled CSV via
            ``export_to_pandas_and_scale``.
        log_file: File that receives one line per processed VCF. It is
            overwritten on every call, including calls that fail part-way.

    Returns:
        ``(processed_vcfs, log_entries)``: a list of ``{'vcf', 'mt_path'}``
        dicts and the list of log lines, one per VCF processed in this call.

    Raises:
        ValueError: ``source`` is neither a directory nor a VCF file.
        AssertionError: no VCF files were found.
    """
    files = _collect_vcf_files(source)
    assert files, "No VCF files found at the specified location."

    hl.init(default_reference='GRCh37')  # Initialize Hail

    # Contig recoding for import_vcf: 'chr1' -> '1', ..., 'chrX' -> 'X', 'chrY' -> 'Y'
    contig_recoding = {f"chr{i}": str(i) for i in range(1, 23)}
    contig_recoding.update({"chrX": "X", "chrY": "Y"})
    log_entries = []
    processed_vcfs = []

    # Import and annotate files in a loop
    try:
        for vcf in files:
            vcf_name = os.path.basename(vcf)
            patient_code = vcf_name.split('_')[0]
            phenotype_info = phenotype_data.get(patient_code, ("NA", "NA"))

            disease, position = phenotype_info
            group = classify_disease(disease, position)

            destination = os.path.join(base_destination, group)
            destination_path = os.path.join(destination, vcf_name.replace('.vcf', '.mt'))

            if os.path.exists(destination_path):
                print(f"Skipping {vcf}, as the output file already exists in {destination_path}.")
                continue

            os.makedirs(destination, exist_ok=True)

            mt = hl.import_vcf(vcf, force_bgz=True, reference_genome='GRCh37', contig_recoding=contig_recoding, skip_invalid_loci=True)
            mt = hl.vep(mt, config=vep_config_path)

            print("VEP output structure:")
            mt.vep.describe()

            mt = mt.annotate_globals(phenotype_disease=disease, phenotype_position=position)
            mt = mt.filter_rows(mt.alleles[1] != "*")  # filter star alleles
            mt = mt.annotate_rows(
                IMPACT=mt.vep.IMPACT,  # HIGH/MODERATE
                SYMBOL=mt.vep.SYMBOL,  # Categorical
                HGNC_ID=mt.vep.HGNC_ID,  # Categorical
                MAX_AF=mt.vep.MAX_AF,  # Numeric field, allele frequency
                MAX_AF_POPS=mt.vep.MAX_AF_POPS  # Populations with max allele frequency
            )

            if write:
                # checkpoint = write + read back. Hail is lazy, so exporting
                # from the un-checkpointed table would re-run the whole
                # pipeline (including VEP) a second time.
                mt = mt.checkpoint(destination_path)

            # Export exactly once per VCF; the resulting path goes into the log.
            if export_to_pandas:
                csv_path = export_to_pandas_and_scale(mt, destination, vcf_name, phenotype_info)
            else:
                csv_path = 'Not Exported'

            log_entries.append(f"{vcf}, {disease}, {position}, {destination_path}, {csv_path}")
            # Record every processed VCF, whether or not a CSV was exported.
            processed_vcfs.append({'vcf': vcf, 'mt_path': destination_path})

    finally:
        hl.stop()  # Stop Hail context when done
        with open(log_file, 'w') as file:
            file.write('\n'.join(log_entries))

    return processed_vcfs, log_entries

# Example usage:
VEP_CONFIG_PATH = '/home/markus/gen-toolbox/src/config/vep_settings.json'
# Despite its name, SOURCE_DIR points at a single VCF here; a directory of VCFs is accepted too.
SOURCE_DIR = '/mnt/sdb/TSHC_data_VCF/E01381784_S4.annotated2.vcf'
base_destination_directory = '/home/markus/gen-toolbox/output'
phenotype_data = parse_phenotype_file('/mnt/sdb/phenotype-pseudon.txt') 
# vcfs_to_matrixtable returns (processed_vcfs, log_entries); unpack both so
# that log_entries is the list of log lines rather than the whole tuple.
processed_vcfs, log_entries = vcfs_to_matrixtable(SOURCE_DIR, phenotype_data, VEP_CONFIG_PATH, base_destination_directory, write=True, export_to_pandas=True)

print(log_entries)

------------------------------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------

## Prepare Datasets for Machine Learning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from glob import glob
import os

def load_data(base_dir, group_name):
    """Load and stack every exported CSV of one group.

    CSVs are read from ``<base_dir>/<group_name>/<group_name>-csv/*.csv``.

    Args:
        base_dir: Root output directory that contains the group folders.
        group_name: Name of the group folder.

    Returns:
        DataFrame with the rows of all CSVs, a fresh 0..n-1 index and a
        'group' column set to ``group_name``; an empty DataFrame (without a
        'group' column) when no CSV is found.
    """
    csv_dir = os.path.join(base_dir, group_name, group_name + '-csv')
    # glob returns files in arbitrary order; sort so the row order (and any
    # seeded sampling done on the result) is reproducible.
    files = sorted(glob(os.path.join(csv_dir, "*.csv")))
    if not files:
        return pd.DataFrame()

    # ignore_index=True: each CSV starts at index 0, and duplicate labels would
    # make a later label-based drop() remove rows from every file at once.
    data = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)
    data['group'] = group_name  # Add a column indicating the group
    return data

def load_all_groups(base_directory):
    """Load the CSV data of the three study groups and label each frame.

    Args:
        base_directory: Root output directory that contains the group folders.

    Returns:
        dict mapping each group name to its DataFrame; every frame carries a
        'label' column holding the group name.
    """
    data_frames = {}
    for group_name in ("positive-group-terve", "positive-group-RV", "negative-group"):
        frame = load_data(base_directory, group_name)  # rows of every CSV in this group
        frame['label'] = group_name  # the group name doubles as the class label
        data_frames[group_name] = frame
    print(data_frames, "Data frames loaded successfully.")
    return data_frames

def prepare_datasets(data_frames):
    """Split the grouped data into training, validation and testing sets.

    66% of the 'positive-group-terve' rows and 33% of the 'positive-group-RV'
    rows are sampled into the validation set. The remaining positive rows plus
    all 'negative-group' rows are split 80/20 into training and testing sets.
    All sampling uses random_state=42.

    Args:
        data_frames: dict with the keys 'positive-group-terve',
            'positive-group-RV' and 'negative-group' mapping to DataFrames.

    Returns:
        ``(training_data, validation_data, testing_data)`` DataFrames.

    NOTE(review): the validation set is built from the two positive groups
    only and therefore contains no 'negative-group' rows -- confirm this is
    intended.
    """
    print("Starting to prepare datasets...")

    # Give each frame a unique 0..n-1 index first. The held-out rows are removed
    # by index label below; with repeated labels (e.g. frames concatenated from
    # several CSVs) drop() would delete far more rows than were sampled.
    data_terve = data_frames["positive-group-terve"].reset_index(drop=True)
    data_rv = data_frames["positive-group-RV"].reset_index(drop=True)
    data_negative = data_frames["negative-group"]

    # Sampling for validation set
    data_terve_val = data_terve.sample(frac=0.66, random_state=42)
    data_rv_val = data_rv.sample(frac=0.33, random_state=42)
    validation_data = pd.concat([data_terve_val, data_rv_val], ignore_index=True)

    # Remaining data
    data_terve_remain = data_terve.drop(data_terve_val.index)
    data_rv_remain = data_rv.drop(data_rv_val.index)
    remaining_data = pd.concat([data_terve_remain, data_rv_remain, data_negative], ignore_index=True)

    # Split remaining data into training and testing
    training_data, testing_data = train_test_split(remaining_data, test_size=0.2, random_state=42)

    return training_data, validation_data, testing_data

# Build the three splits from the per-group CSV exports.
base_directory = '/home/markus/gen-toolbox/output'
data_frames = load_all_groups(base_directory)
training_data, validation_data, testing_data = prepare_datasets(data_frames)

# Optionally, save the datasets to files (one CSV per split, without the index)
for split_filename, split_frame in (
    ("training_set.csv", training_data),
    ("validation_set.csv", validation_data),
    ("testing_set.csv", testing_data),
):
    split_frame.to_csv(f"{base_directory}/{split_filename}", index=False)

# TODO: Load the datasets according to the previous dir paths - csv files are placed under /home/markus/gen-toolbox/output/positive-group-RV/positive-group-RV-csv


## Machine Learning Models
### XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Binary target: both positive groups map to 1, the negative group to 0.
label_to_target = {'positive-group-terve': 1, 'positive-group-RV': 1, 'negative-group': 0}

params = dict(
    max_depth=3,                   # Maximum depth of a tree
    objective='binary:logistic',   # Binary classification; predictions are probabilities
    eta=0.3,                       # Learning rate
    eval_metric='logloss',
    random_state=42,
)
num_rounds = 100


def _as_dmatrix(frame):
    """Build an XGBoost DMatrix from a split: features without 'label'/'group', binary target."""
    features = frame.drop(['label', 'group'], axis=1)
    target = frame['label'].map(label_to_target)
    return xgb.DMatrix(features, label=target)


# Creating DMatrix for XGBoost
dtrain = _as_dmatrix(training_data)
dval = _as_dmatrix(validation_data)
dtest = _as_dmatrix(testing_data)

# Training the model, reporting logloss on the training and validation sets each round
watchlist = [(dtrain, 'train'), (dval, 'validation')]
model = xgb.train(params, dtrain, num_rounds, evals=watchlist)

# Predictions
predicted_probabilities = model.predict(dtest)
# Threshold of 0.5: a predicted probability >= 0.5 is classified as 1 (positive); otherwise 0 (negative).
predictions = [1 if p >= 0.5 else 0 for p in predicted_probabilities]

# Evaluation
expected_targets = testing_data['label'].map(label_to_target)
accuracy = accuracy_score(expected_targets, predictions)
print("Test Accuracy:", accuracy)


### TabPFN

In [None]:
from tabpfn import TabPFNClassifier
import time 

start = time.time()
# Initialize the TabPFN classifier
# NOTE(review): N_ensemble_configurations presumably sets how many ensemble
# configurations are combined at prediction time -- confirm against the TabPFN docs.
classifier = TabPFNClassifier(device='cpu', N_ensemble_configurations=32)

# The target column is 'label' (there is no 'labels' column). 'group' holds the
# same group name as a string, so it must be excluded from the features as well.
non_feature_columns = ['label', 'group']

# Fit the model
classifier.fit(training_data.drop(non_feature_columns, axis=1).values, training_data['label'].values)

# Predict on the test set
predictions = classifier.predict(testing_data.drop(non_feature_columns, axis=1).values)

# Evaluate the model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(testing_data['label'].values, predictions)
print("Test Accuracy:", accuracy)
print(f"TabPFN fit + predict took {time.time() - start:.1f} s")

# TODO: Calculate power
# TODO: Calculate precision
# TODO: calculate sensitivity
# TODO: calculate model accuracy
# TODO: calculate performance


# TODO: end use case: load a single VCF and predict its phenotype with the trained model
