In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from glob import glob
import os

## Prepare Datasets for Machine Learning

In [None]:
# Load data
def load_data(file_path):
    """Read a tab-separated file into a DataFrame.

    The strings 'NA', 'null' and the empty string are parsed as missing
    values (in addition to the pandas defaults).

    NOTE: a later cell in this notebook defines another ``load_data`` with a
    different signature; this one-argument version is the one in effect only
    until that cell has been executed.
    """
    missing_markers = ['NA', 'null', '']
    return pd.read_csv(file_path, sep='\t', na_values=missing_markers)

# Preprocessing function
def preprocess_data(df):
    """Select the model features and encode the non-numeric ones as integers.

    Steps, in order:
      1. keep only the feature columns listed in ``relevant_columns``;
      2. map ``IMPACT`` to an ordinal code (HIGH=0, MODERATE=1, LOW=2,
         MODIFIER=3); values outside the mapping become NaN;
      3. label-encode the nominal columns after casting their values to str;
      4. convert ``BLOCKAVG_min30p3a`` to int, using -1 where it is missing.

    Any other missing values are left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw variant table; must contain every column in ``relevant_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the selected columns. The input frame is
        not modified.

    Raises
    ------
    KeyError
        If a required column is absent from ``df``.
    """
    relevant_columns = ['IMPACT', 'alleles', 'CHROM', 'REF', 'ALT', 'QUAL', 'AC', 'AF', 'AN', 'DB', 'DP', 'ExcessHet', 'FS', 'MLEAC', 'MLEAF', 'MQ', 'MQRankSum', 'QD', 'ReadPosRankSum', 'SOR', 'MAX_AF', 'BLOCKAVG_min30p3a', 'SNVSB', 'SNVHPOL']

    # Copy *after* selecting: the selection is what we go on to modify, so it
    # is the object that must be independent of the caller's frame.
    features = df[relevant_columns].copy()

    # Plain column assignment replaces the column together with its dtype.
    # `features.loc[:, col] = ...` would instead write into the existing
    # (object) column in pandas >= 2 and leave the integer codes stored with
    # object dtype.
    impact_mapping = {'HIGH': 0, 'MODERATE': 1, 'LOW': 2, 'MODIFIER': 3}
    features['IMPACT'] = features['IMPACT'].map(impact_mapping)

    # Nominal (unordered) columns: one independent encoder per column.
    # The fitted encoders are not kept, so the integer codes are only
    # meaningful within the frame passed to this call.
    categorical_cols = ['alleles', 'CHROM', 'REF', 'ALT', 'DB']
    for col in categorical_cols:
        features[col] = LabelEncoder().fit_transform(features[col].astype(str))

    # Missing -> -1 first, so the True/False -> 1/0 integer cast cannot fail on NaN.
    features['BLOCKAVG_min30p3a'] = features['BLOCKAVG_min30p3a'].fillna(-1).astype(int)

    return features

# Main function to load, preprocess, and save data
def process_and_save_file(input_file_path, output_file_path):
    """Load one TSV, preprocess it, and write the result as a new TSV.

    Parameters
    ----------
    input_file_path : str
        Path handed to ``load_data``.
    output_file_path : str
        Destination; written tab-separated and without the index column.

    Prints a confirmation line once the file has been written.
    """
    prepared = preprocess_data(load_data(input_file_path))
    prepared.to_csv(output_file_path, sep='\t', index=False)
    print(f"Data processed and saved to {output_file_path}")

# Input and output TSVs for the negative group; both live in the same directory.
ml_data_dir = '/mnt/sdb/markus-bsc-thesis-data/machine-learning'
input_file_path = f'{ml_data_dir}/negative_group_aggregated.tsv'
output_file_path = f'{ml_data_dir}/ML_prepped_negative_group_aggregated.tsv'

# Run the load -> preprocess -> save pipeline on that file.
process_and_save_file(input_file_path=input_file_path, output_file_path=output_file_path)

## Divide Data into Training, Testing, and Validation Sets

In [None]:
def load_data(base_dir, group_name):
    """Load and stack every TSV found in ``<base_dir>/<group_name>/scaled``.

    NOTE: this rebinds the name ``load_data`` that the preprocessing cell
    defined with a single ``file_path`` argument. Once this cell has run,
    re-executing that earlier cell's ``process_and_save_file`` will call this
    two-argument version and fail.

    Parameters
    ----------
    base_dir : str
        Directory containing one sub-directory per group.
    group_name : str
        Group sub-directory name; also stored in the ``group`` column.

    Returns
    -------
    pandas.DataFrame
        All rows from the group's TSV files with a fresh 0..n-1 index and a
        ``group`` column, or an empty DataFrame (no columns) when no ``*.tsv``
        file is found.
    """
    tsv_dir = os.path.join(base_dir, group_name, 'scaled')

    # glob() yields files in filesystem order; sort so the row order (and with
    # it every seeded split/sample taken downstream) is reproducible.
    files = sorted(glob(os.path.join(tsv_dir, "*.tsv")))
    if not files:
        return pd.DataFrame()

    # ignore_index avoids repeating each file's own 0..k index labels.
    data = pd.concat((pd.read_csv(file, sep='\t') for file in files), ignore_index=True)
    data['group'] = group_name
    return data

def load_all_groups(base_directory):
    """Load the positive and negative groups and tag each row with its group.

    Returns a dict keyed by group name ("positive-group", "negative-group");
    each value is the frame returned by ``load_data`` with an added ``label``
    column holding that group name.
    """
    frames_by_group = {}
    for group_name in ("positive-group", "negative-group"):
        group_frame = load_data(base_directory, group_name)
        group_frame['label'] = group_name
        frames_by_group[group_name] = group_frame
    return frames_by_group

def prepare_datasets(data_frames):
    """Build balanced training, validation and testing sets.

    The positive rows are split 80/20 into train+validation / test, and the
    train+validation part is split again 67/33 into train / validation. Each
    positive subset is then paired with the same number of negative rows.

    The negative rows are drawn once and cut into three non-overlapping
    slices. Drawing three separate samples with the same ``random_state``
    would return overlapping rows, putting the same negative examples in
    training, validation and testing.

    Parameters
    ----------
    data_frames : dict
        Must contain DataFrames under "positive-group" and "negative-group".

    Returns
    -------
    tuple of pandas.DataFrame
        ``(training_data, validation_data, testing_data)``.

    Raises
    ------
    ValueError
        If there are fewer negative rows than positive rows, so disjoint
        balanced negative subsets cannot be formed.
    """
    data_positive = data_frames["positive-group"]
    data_negative = data_frames["negative-group"]

    # Splitting positive data into training, validation, and testing sets
    data_pos_train, data_pos_test = train_test_split(data_positive, test_size=0.2, random_state=42)
    data_pos_train, data_pos_val = train_test_split(data_pos_train, test_size=0.33, random_state=42)

    n_train, n_val, n_test = len(data_pos_train), len(data_pos_val), len(data_pos_test)
    n_needed = n_train + n_val + n_test
    if len(data_negative) < n_needed:
        raise ValueError(
            f"Need {n_needed} negative rows to build disjoint balanced sets, "
            f"but only {len(data_negative)} are available."
        )

    # One shuffled draw, sliced into disjoint train / validation / test parts.
    negative_pool = data_negative.sample(n=n_needed, random_state=42)
    data_neg_train = negative_pool.iloc[:n_train]
    data_neg_val = negative_pool.iloc[n_train:n_train + n_val]
    data_neg_test = negative_pool.iloc[n_train + n_val:]

    # Concatenating with negative data
    training_data = pd.concat([data_pos_train, data_neg_train])
    validation_data = pd.concat([data_pos_val, data_neg_val])
    testing_data = pd.concat([data_pos_test, data_neg_test])

    print("Finished dataset preparation...")
    return training_data, validation_data, testing_data

base_directory = '/mnt/sdb/markus-bsc-thesis-data/machine-learning'

# Load both groups, then build the train / validation / test splits.
data_frames = load_all_groups(base_directory)
training_data, validation_data, testing_data = prepare_datasets(data_frames)

# Persist each split as <base_directory>/<name>_set.csv (comma-separated, no index column).
for split_name, split_frame in (
    ("training", training_data),
    ("validation", validation_data),
    ("testing", testing_data),
):
    split_frame.to_csv(f"{base_directory}/{split_name}_set.csv", index=False)

## Train model using XGBoost

In [None]:
# Reload the three splits from the CSV files written by the previous cell.
training_data = pd.read_csv(f"{base_directory}/training_set.csv")
validation_data = pd.read_csv(f"{base_directory}/validation_set.csv")
testing_data = pd.read_csv(f"{base_directory}/testing_set.csv")

# Standardise numeric feature columns only. The saved splits also carry the
# string 'group' column, which StandardScaler cannot convert to float.
numeric_feature_cols = (
    training_data.drop(['label'], axis=1).select_dtypes(include='number').columns
)

# Fit on the training split only and reuse the same scaler for validation and
# testing, so all three share one scale and no held-out statistics are used.
scaler = StandardScaler()
scaler.fit(training_data[numeric_feature_cols])


def _scale_split(split):
    """Return the scaled numeric features of ``split`` plus a 0/1 ``label`` column."""
    scaled = pd.DataFrame(
        scaler.transform(split[numeric_feature_cols]),
        columns=numeric_feature_cols,
        index=split.index,
    )
    scaled['label'] = split['label'].map({'positive-group': 1, 'negative-group': 0})
    return scaled


# NOTE: these scaled frames are not consumed by the code that follows in this
# cell; the model below is trained on the unscaled splits.
training_data_scaled = _scale_split(training_data)
validation_data_scaled = _scale_split(validation_data)
testing_data_scaled = _scale_split(testing_data)


# Define relevant columns and categorical columns, including label for preprocessing
relevant_columns = ['IMPACT', 'alleles', 'CHROM', 'REF', 'ALT', 'QUAL', 'AC', 'AF', 'AN', 'DB', 'DP', 'ExcessHet', 'FS', 'MLEAC', 'MLEAF', 'MQ', 'MQRankSum', 'QD', 'ReadPosRankSum', 'SOR', 'MAX_AF', 'BLOCKAVG_min30p3a', 'SNVSB', 'SNVHPOL', 'label']
categorical_cols = ['alleles', 'CHROM', 'REF', 'ALT', 'DB']


def prepare_for_training(data, relevant_columns, categorical_cols, encoders=None):
    """Select the model columns and integer-encode the categorical ones.

    Parameters
    ----------
    data : pandas.DataFrame
        One split; must contain every name in ``relevant_columns``.
    relevant_columns : list of str
        Columns to keep (including the label column).
    categorical_cols : list of str
        Columns to label-encode.
    encoders : dict, optional
        Shared ``{column: fitted LabelEncoder}`` store. A column missing from
        the dict is fitted on ``data`` and stored; a column already present
        is encoded with the stored encoder, and values that encoder has not
        seen are coded as -1. With ``None`` (the default) every call fits
        fresh encoders on ``data`` alone.

    Returns
    -------
    pandas.DataFrame
        A copy restricted to ``relevant_columns`` with encoded categoricals.
    """
    data = data[relevant_columns].copy()  # Select relevant columns, including label
    for col in categorical_cols:
        if encoders is not None and col in encoders:
            fitted = encoders[col]
            values = data[col]
            known = values.isin(fitted.classes_)
            codes = pd.Series(-1, index=values.index, dtype='int64')
            if known.any():
                codes.loc[known] = fitted.transform(values[known])
            data[col] = codes
        else:
            fitted = LabelEncoder()
            data[col] = fitted.fit_transform(data[col])
            if encoders is not None:
                encoders[col] = fitted
    return data


# Fit the category encoders on the training split and reuse them for the
# validation and testing splits. Fitting per split would let the same category
# receive different integer codes in different splits.
category_encoders = {}
training_data = prepare_for_training(training_data, relevant_columns, categorical_cols, category_encoders)
validation_data = prepare_for_training(validation_data, relevant_columns, categorical_cols, category_encoders)
testing_data = prepare_for_training(testing_data, relevant_columns, categorical_cols, category_encoders)

print(training_data.describe())
print(training_data.head())

# Convert to DMatrix, preserving label (the 'label' column itself stays as text)
label_codes = {'positive-group': 1, 'negative-group': 0}
dtrain = xgb.DMatrix(training_data.drop(['label'], axis=1), label=training_data['label'].map(label_codes))
dval = xgb.DMatrix(validation_data.drop(['label'], axis=1), label=validation_data['label'].map(label_codes))
dtest = xgb.DMatrix(testing_data.drop(['label'], axis=1), label=testing_data['label'].map(label_codes))

# Booster configuration: binary classifier evaluated with log-loss.
params = dict(
    max_depth=5,
    objective='binary:logistic',
    eta=0.1,
    eval_metric='logloss',
    random_state=42,
)
num_rounds = 100

# Train for a fixed number of rounds, reporting the metric on both the
# training and the validation matrices.
watchlist = [(dtrain, 'train'), (dval, 'validation')]
model = xgb.train(params, dtrain, num_rounds, evals=watchlist)

# Predicted probabilities on the test matrix, thresholded at 0.5 into 0/1 classes.
predictions_proba = model.predict(dtest)
predictions = (predictions_proba >= 0.5).astype(int).tolist()

# Accuracy against the 0/1-coded test labels. It is computed but not printed;
# uncomment the line below to report it.
true_labels = testing_data['label'].map({'positive-group': 1, 'negative-group': 0})
accuracy = accuracy_score(true_labels, predictions)
#print("Test Accuracy:", accuracy)

# Plotting feature importance
xgb.plot_importance(model)