In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gzip
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve

# path to the series matrix file
path = "/home/rhea/PD_dataset_training_self_project/GSE6613_series_matrix.txt.gz"

# loading the data
df = pd.read_csv(path, sep="\t", comment="!", index_col=0, compression = "gzip")

df.head()

Unnamed: 0_level_0,GSM153404,GSM153405,GSM153406,GSM153407,GSM153408,GSM153409,GSM153410,GSM153411,GSM153412,GSM153413,...,GSM153499,GSM153500,GSM153501,GSM153502,GSM153503,GSM153504,GSM153505,GSM153506,GSM153507,GSM153508
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1007_s_at,114.5,105.1,145.7,168.7,94.3,96.2,155.8,99.8,100.9,106.4,...,153.8,103.7,132.8,195.5,82.6,28.2,163.9,29.0,104.8,32.8
1053_at,64.4,58.4,52.5,45.4,51.3,50.7,42.2,6.7,44.0,55.5,...,44.0,70.5,61.9,52.2,61.2,67.6,101.9,46.8,74.9,78.1
117_at,206.3,179.8,192.0,263.6,211.9,149.5,157.3,216.6,230.5,224.1,...,268.9,48.6,193.6,132.5,165.5,146.0,291.4,246.2,251.2,210.0
121_at,507.0,497.8,346.3,430.7,485.5,437.6,424.1,678.7,434.1,592.8,...,403.6,559.1,569.7,532.8,525.7,603.9,479.7,271.9,506.1,270.4
1255_g_at,34.5,18.0,40.1,40.5,22.8,6.4,27.4,5.2,34.7,18.6,...,44.8,39.5,30.5,65.8,54.3,19.3,61.0,40.8,29.5,82.2


In [36]:
def get_characteristics(matrix_file, prefix):
    """
    Reads the matrix file and returns the line containing the sample conditions.

    Args:
        matrix_file (str): The path to the matrix file.
        prefix (str): The phrase to search for the sample condition metadata.

    Returns:
        str or None: The sample conditions, or None if not found.
    """

    opener = gzip.open if matrix_file.endswith(".gz") else open
    
    sample_conditions = []
    with opener(matrix_file, 'rt', encoding = 'utf-8', errors = 'replace') as file:
        for line in file:
            line = line.rstrip("\n")
            if line.startswith(prefix):
                rest = line[len(prefix):].strip()
                rest = rest.lstrip(":= \t")        # removing any leading punctuation (colon or equals) if exists
                if rest:
                    labels = rest.split('\t')
                    for i in labels:
                        sample_conditions.append(i.strip('"'))
    return sample_conditions 

prefix = '!Sample_characteristics_ch1'
labels_raw = get_characteristics(path, prefix)
print(labels_raw)

["Parkinson's disease", 'healthy control', 'healthy control', "Parkinson's disease", "Parkinson's disease", 'neurological disease control', "Parkinson's disease", "Parkinson's disease", "Parkinson's disease", "Parkinson's disease", "Parkinson's disease", "Parkinson's disease", "Parkinson's disease", "Parkinson's disease", "Parkinson's disease", 'healthy control', "Parkinson's disease", "Parkinson's disease", "Parkinson's disease", 'healthy control', "Parkinson's disease", 'healthy control', 'neurological disease control', 'healthy control', 'healthy control', 'healthy control', "Parkinson's disease", 'neurological disease control', "Parkinson's disease", 'healthy control', "Parkinson's disease", "Parkinson's disease", "Parkinson's disease", "Parkinson's disease", 'neurological disease control', "Parkinson's disease", 'neurological disease control', "Parkinson's disease", "Parkinson's disease", "Parkinson's disease", 'healthy control', "Parkinson's disease", "Parkinson's disease", 'neur

In [40]:
# Encoding the PD condition as 1 and controls as 0

encoded_condition = []
for x in list(labels_raw):
    s = str(x).lower()
    if "parkinson" in s:
        encoded_condition.append(1)
    else:
        # both "neurological disease control" and "healthy control" fall here
        encoded_condition.append(0)

encoded_condition = np.array(encoded_condition)
np.unique(y, return_counts=True)

(array([0, 1]), array([55, 50]))

In [44]:
samples = df.columns[1:]     # parsing smaple IDs
sample_ids = samples.tolist()

# Obtaining the expression values for each sample, mainiting the order of sample IDs

expr = df.copy()
expr = expr.T                # transposing so rows are samples, cols are probes/genes
expr = expr.loc[sample_ids]  # match same sample order

X = expr.values

[[105.1  58.4 179.8 ...   7.   14.4   5.2]
 [145.7  52.5 192.  ...   1.8  28.8   4.4]
 [168.7  45.4 263.6 ...   1.5   9.5   3.6]
 ...
 [ 29.   46.8 246.2 ...   8.6   6.    3.1]
 [104.8  74.9 251.2 ...  35.2   4.1  21.5]
 [ 32.8  78.1 210.  ...  11.5  22.    5.9]]


In [46]:
# Coercing expression values to numeric values only
expr_numeric = expr.apply(pd.to_numeric, errors='coerce')

# Checking to see if the error after coercion is too large due to data drop

# before = expr.shape[1]
# after  = expr_numeric.dropna(axis=1, how='any').shape[1]
# print(before, after, 1 - after/before)

# Imputing the value using median, if data loss very huge
# expr_numeric = expr_numeric.fillna(expr_numeric.median())

# Dropping all NA and invalid columns, since data loss is not large (<1% probes)
expr_numeric = expr_numeric.dropna(axis=1, how='any')


22283 22283 0.0
