# Preprocessing

## Imports

### (ATTENTION: To use standard pandas instead of the GPU version, uncomment the pandas import and comment out the cudf imports)

In [None]:
import cudf as pd
# import pandas as pd
from cudf import DataFrame
import os
import time
import numpy as np

## Global variables

In [None]:
# Directory that holds the raw CSV inputs and receives the merged output.
path_to_data = "data/"

# Wall-clock reference; the final cell subtracts it to report total runtime.
start_time = time.time()

## Preprocessing

### Preprocessing healthy: dataset | Drop operation

In [None]:
# Load the healthy table; the first CSV column becomes the row index.
raw_healthy_csv_path = os.path.join(path_to_data, "raw_healthy_data.csv")
raw_healthy_df = pd.read_csv(raw_healthy_csv_path, sep=',', header=0, index_col=0)

# Remove the 'Description' column before the dtype check below.
raw_healthy_df = raw_healthy_df.drop(columns=['Description'])

# The remaining columns must all share one dtype; abort otherwise.
distinct_dtype_count = raw_healthy_df.dtypes.nunique()
if distinct_dtype_count > 1:
    raise ValueError("DataFrame contains multiple data types, which is not supported.")

# Swap axes so the former index entries become the columns.
raw_healthy_df = raw_healthy_df.transpose()

# Last expression of the cell: displays a preview in the notebook.
raw_healthy_df.head()

### Preprocessing unhealthy: dataset | Drop operation

In [None]:
# Load the unhealthy table; the first CSV column becomes the row index.
raw_unhealthy_csv_path = os.path.join(path_to_data, "raw_unhealthy_data.csv")
raw_unhealthy_df = pd.read_csv(raw_unhealthy_csv_path, sep=',', header=0, index_col=0)

# Drop the two rows labelled 'gene_name' and 'gene_type'
# (presumably annotation rows rather than samples; verify against the CSV).
raw_unhealthy_df.drop(['gene_name', 'gene_type'], axis=0, inplace=True)

### Preprocessing healthy: gencode processor

In [None]:
# Keep only the part of each column name before the first '.'
# (presumably stripping a version suffix from the gene ID; TODO confirm).
temp_column_names = raw_healthy_df.columns.str.split('.').str[0]

# True for the first occurrence of each shortened name, False for repeats.
columns_to_keep_mask = ~temp_column_names.duplicated(keep='first')

print("Total columns before removal:", raw_healthy_df.shape[1])

# Keep the first column per shortened name, then apply the shortened names.
raw_healthy_df = raw_healthy_df.loc[:, columns_to_keep_mask]
raw_healthy_df.columns = temp_column_names[columns_to_keep_mask]

print("Total columns after removal:", raw_healthy_df.shape[1])

### Preprocessing unhealthy: gencode processor

In [None]:
# Keep only the part of each column name before the first '.'
# (presumably stripping a version suffix from the gene ID; TODO confirm).
temp_column_names = raw_unhealthy_df.columns.str.split('.').str[0]

# True for the first occurrence of each shortened name, False for repeats.
columns_to_keep_mask = ~temp_column_names.duplicated(keep='first')

print("Total columns before removal:", raw_unhealthy_df.shape[1])

# Keep the first column per shortened name, then apply the shortened names.
raw_unhealthy_df = raw_unhealthy_df.loc[:, columns_to_keep_mask]
raw_unhealthy_df.columns = temp_column_names[columns_to_keep_mask]

print("Total columns after removal:", raw_unhealthy_df.shape[1])

### Preprocessing healthy: Convert dtypes to int32 and drop NaN

In [None]:
# Drop every column that contains a missing value *before* the integer cast.
# With standard pandas, casting NaN to int32 raises, so a dropna placed after
# the cast can never remove anything; dropping first keeps the cell working
# with both the pandas and the cudf import.
raw_healthy_df = raw_healthy_df.dropna(axis=1)
raw_healthy_df = raw_healthy_df.astype(np.int32)

### Preprocessing unhealthy: Convert dtypes to int32 and drop NaN

In [None]:
# Drop every column that contains a missing value *before* the integer cast.
# With standard pandas, casting NaN to int32 raises, so a dropna placed after
# the cast can never remove anything; dropping first keeps the cell working
# with both the pandas and the cudf import.
raw_unhealthy_df = raw_unhealthy_df.dropna(axis=1)
raw_unhealthy_df = raw_unhealthy_df.astype(np.int32)

### Preprocessing: selecting only common genes

In [None]:
# Genes (columns) present in both tables; both frames are indexed with this
# same object below, so they end up with identical column order.
matching_genes = raw_healthy_df.columns.intersection(raw_unhealthy_df.columns)

unhealthy_df_rows_length = len(raw_unhealthy_df)

raw_unhealthy_df_filtered = raw_unhealthy_df[matching_genes]

# Keep only the first `unhealthy_df_rows_length` healthy rows so both classes
# have the same number of samples. Positional slicing replaces the previous
# "drop a negative tail slice of the index" approach: when both tables had the
# same number of rows that slice was `index[-0:]`, i.e. the whole index, so
# every healthy row was dropped; dropping by label also removes extra rows
# when index labels repeat.
raw_healthy_df_filtered = raw_healthy_df[matching_genes].iloc[:unhealthy_df_rows_length]

# Still raised when the healthy table has fewer rows than the unhealthy one.
if len(raw_healthy_df_filtered) != unhealthy_df_rows_length:
    raise ValueError("The number of rows in the healthy DataFrame does not match the unhealthy DataFrame after slicing.")

print(f"Healthy DataFrame rows after slicing: {len(raw_healthy_df_filtered)}")
print(f"Unhealthy DataFrame rows: {unhealthy_df_rows_length}")

print("Healthy DataFrame columns:", len(raw_healthy_df_filtered.columns))
print("Unhealthy DataFrame columns:", len(raw_unhealthy_df_filtered.columns))

# Rebind the working names to the filtered frames and remove the temporaries.
raw_healthy_df = raw_healthy_df_filtered
raw_unhealthy_df = raw_unhealthy_df_filtered

del raw_healthy_df_filtered
del raw_unhealthy_df_filtered

### Preprocessing: add condition column

In [None]:
# Class label column: 1 marks rows of the healthy table ...
raw_healthy_df["condition"] = 1
# ... and 0 marks rows of the unhealthy table.
raw_unhealthy_df["condition"] = 0

### Preprocessing: change index name

In [None]:
# raw_unhealthy_df.index.name = "sample_id"
# raw_healthy_df.index.name = "sample_id"

## Debug checks (can be commented out)

In [None]:
# raw_healthy_df.info()
# raw_healthy_df.head()

In [None]:
# raw_unhealthy_df.info()
# raw_unhealthy_df.head()

## Final checks before merge

In [None]:
# Both tables must expose exactly the same column names (order is ignored).
are_column_names_same_regardless_order = set(raw_healthy_df.columns) == set(raw_unhealthy_df.columns)
if not are_column_names_same_regardless_order:
    raise ValueError("Column names in healthy and unhealthy DataFrames do not match.")

# No table may contain a fully duplicated row (healthy is checked first).
for _frame, _message in (
    (raw_healthy_df, "Healthy DataFrame contains duplicate rows."),
    (raw_unhealthy_df, "Unhealthy DataFrame contains duplicate rows."),
):
    if _frame.duplicated().any():
        raise ValueError(_message)

# No table may contain a repeated column name (healthy is checked first).
for _frame, _message in (
    (raw_healthy_df, "Healthy DataFrame contains duplicate columns."),
    (raw_unhealthy_df, "Unhealthy DataFrame contains duplicate columns."),
):
    if _frame.columns.duplicated().any():
        raise ValueError(_message)

## Do merge

In [None]:
# Stack the healthy rows on top of the unhealthy rows.
merged_df: DataFrame = pd.concat([raw_healthy_df, raw_unhealthy_df], axis=0)

# Build the output path with os.path.join, like the input paths, so it does
# not depend on `path_to_data` ending with a path separator.
merged_df.to_parquet(os.path.join(path_to_data, "merged_data.pq"))

In [None]:
# Report the wall-clock time elapsed since `start_time` was recorded.
end_time = time.time()
elapsed_seconds = end_time - start_time

print(f"Data processing completed in {elapsed_seconds:.2f} seconds.")

# Statistics and Plots

## Imports

In [None]:
import cudf as pd
import cuml as sklearn
import matplotlib.pyplot as plt
import seaborn as sns

## Global Variables

In [None]:
# Directory the merged Parquet file is read from (prefix used as-is, so it
# must end with a path separator).
path_to_data = "data/"

## Plots

### PCA Scatterplot

In [None]:
df = pd.read_parquet(f"{path_to_data}merged_data.pq")

# Separate the label column from the expression columns.
y = df["condition"]
x = df.drop(columns=["condition"])
gene_columns = x.columns

# Per-gene absolute difference between the mean of the rows labelled 1
# (healthy) and the mean of the rows labelled 0 (unhealthy).
healthy_rows = y == 1
unhealthy_rows = y == 0
mean_healthy = x[healthy_rows][gene_columns].mean()
mean_unhealthy = x[unhealthy_rows][gene_columns].mean()
mean_diff = (mean_healthy - mean_unhealthy).abs()

# NOTE(review): head() prints the first five genes in column order, not the
# five largest differences, despite what the printed label says.
print("\nHead of Mean Differences (for top 5 genes):")
print(mean_diff.head())

# Keep the k genes with the largest absolute mean difference.
k_genes = 10000
top_k_genes = mean_diff.nlargest(k_genes).index.to_pandas()
x_selected = x[top_k_genes]

print(f"\nOriginal number of genes: {len(x.columns)}")
print(f"Number of genes after aggressive selection (top {k_genes} by mean difference): {len(x_selected.columns)}")

# Standardize the selected genes, then project onto two principal components.
scaler = sklearn.preprocessing.StandardScaler()
x_scaled = scaler.fit_transform(x_selected)
print("Shape of x_scaled:", x_scaled.shape)

pca = sklearn.decomposition.PCA(n_components=2)
pca_df = pca.fit_transform(x_scaled)
print("Shape of principal components:", pca_df.shape)

# Name the components and restore the sample index *before* attaching the
# labels, then move the result to a host-side pandas frame for plotting.
pca_df.columns = ["PC1", "PC2"]
pca_df.index = x_selected.index
pca_df["condition"] = y
pca_df = pca_df.to_pandas()
print(f"Shape of PCA DataFrame: {pca_df.shape}")

# Share of the total variance captured by each of the two components.
pc1_ratio = pca.explained_variance_ratio_[0]
pc2_ratio = pca.explained_variance_ratio_[1]
total_ratio = pca.explained_variance_ratio_.sum()

print("\nExplained Variance Ratio:")
print(f"PC1: {pc1_ratio:.4f}")
print(f"PC2: {pc2_ratio:.4f}")
print(f"Total Explained Variance (PC1 + PC2): {total_ratio:.4f}")

print("Generating PCA plot...")

# One 10x8 inch figure; points are coloured by the "condition" label.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=pca_df,
    x="PC1",
    y="PC2",
    hue="condition",
    palette='viridis',
    alpha=0.7,
    s=50,
    ax=ax,
)

# Axis labels carry the percentage of variance explained by each component.
ax.set_title('PCA of Gene Expression Data (Healthy vs. Unhealthy Samples)')
ax.set_xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.2f}% Variance Explained)')
ax.set_ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.2f}% Variance Explained)')
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend(title='Condition')
plt.show()

## Investigation plot

# Model training

## Imports

In [None]:
import cudf as pd
import cuml as sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt
import seaborn as sns

## Global Variables

In [None]:
# Directory the merged Parquet file is read from (prefix used as-is, so it
# must end with a path separator).
path_to_data = "data/"

## KNN

### Preparation

In [None]:
print("Loading merged data from Parquet file...")

df = pd.read_parquet(f"{path_to_data}merged_data.pq")

# Labels (the "condition" column) and features (every other column).
y = df["condition"]
x = df.drop(columns=["condition"])

# Column labels of the feature matrix, reused by the filtering cells.
gene_columns = x.columns

print("Data loaded successfully.")

### Variance thresholding

In [None]:
print("Applying variance thresholding...")
print(f"Original number of genes: {len(gene_columns)}")

# The selector is fitted on a NumPy copy of the feature matrix.
x_np = x.to_numpy()

# Fit the selector and read back its boolean keep-mask (threshold=0.1).
selector = VarianceThreshold(threshold=0.1).fit(x_np)
selector_gene_mask = selector.get_support()

# Apply the mask to the column labels, then select those columns from `x`.
gene_columns_temp = gene_columns[selector_gene_mask]
x_filtered_variance = x[gene_columns_temp]

print(f"Number of genes after variance thresholding: {len(gene_columns_temp)}")

### Expression filtering

In [None]:
print("Low expression filtering...")
print(f"Number of genes before low expression filtering: {len(x_filtered_variance.columns)}")

# A gene is kept only if its mean over all rows is strictly above this value.
low_expression_threshold = 0.1

gene_means = x_filtered_variance.mean()
is_expressed = gene_means > low_expression_threshold

# `gene_columns` is rebound to the surviving gene labels (host-side index).
gene_columns = gene_means[is_expressed].index.to_pandas()
x_filtered_low_expression = x_filtered_variance[gene_columns]

print(f"Number of genes after low expression filtering: {len(gene_columns)}")

### Train test split

In [None]:
print("Splitting data into training and testing sets...")

# 80/20 split with a fixed seed, stratified on the label column.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x_filtered_low_expression,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the shape of each of the four parts.
for _label, _part in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(f"{_label} shape:", _part.shape)

### Aggressive feature selection

In [None]:
print("Aggressive gene selection based on mean differences...")
print("Number of genes before selection:", x_train.shape[1])

# Class-wise gene means, computed from the training split only.
train_is_healthy = y_train == 1
train_is_unhealthy = y_train == 0
mean_healthy = x_train[train_is_healthy][gene_columns].mean()
mean_unhealthy = x_train[train_is_unhealthy][gene_columns].mean()

# Absolute gap between the two class means, per gene.
mean_diff = (mean_healthy - mean_unhealthy).abs()

k_genes = 2000
print(f"Selecting top {k_genes} genes based on mean differences...")

# The same gene subset is applied to both the training and the test split.
top_k_genes = mean_diff.nlargest(k_genes).index.to_pandas()
x_train_selected = x_train[top_k_genes]
x_test_selected = x_test[top_k_genes]

print(f"Number of genes after aggressive selection: {x_train_selected.shape[1]}")

### Scaling gene expression data

In [None]:
print("Scaling selected gene expression data...")

# Fit the scaler on the training split only, then apply the learned scaling
# to both splits.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(x_train_selected)
x_train_scaled = scaler.transform(x_train_selected)
x_test_scaled = scaler.transform(x_test_selected)

print("Shape of x_train_scaled:", x_train_scaled.shape)
print("Shape of x_test_scaled:", x_test_scaled.shape)

## Logistic regression

### Logistic regression (to prevent perfect separators)

In [None]:
print("Training logistical regression model...")

# Hyper-parameters: L2 penalty with a very small C, 'qn' solver,
# and at most 1000 iterations.
logreg_params = {
    "penalty": 'l2',
    "C": 0.0001,
    "solver": 'qn',
    "max_iter": 1000,
}

logreg_model = sklearn.linear_model.LogisticRegression(**logreg_params)
logreg_model.fit(x_train_scaled, y_train)

print("Model training completed.")

### Logistic regression prediction and evaluation

In [None]:
# Predict on the held-out split and convert labels and predictions to NumPy
# arrays for the metric functions.
y_pred_logreg = logreg_model.predict(x_test_scaled)
y_test_np = y_test.to_numpy()
y_pred_logreg_np = y_pred_logreg.to_numpy()

# Binary-averaged metrics computed from the same (truth, prediction) pair.
accuracy = accuracy_score(y_test_np, y_pred_logreg_np)
precision = precision_score(y_test_np, y_pred_logreg_np, average='binary')
recall = recall_score(y_test_np, y_pred_logreg_np, average='binary')
f1 = f1_score(y_test_np, y_pred_logreg_np, average='binary')

print("Logistic Regression Model Performance:")
for _metric_name, _metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{_metric_name}: {_metric_value:.4f}")