# PREPROCESSING

In [28]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

## Proteins.csv

Contains protein expression data across various samples. Each column represents a sample, with identifiers indicating the condition, sex, age, and disease stage. This data is important for studying the patterns of protein expression in relation to Alzheimer's disease.​

In [131]:
proteins = pd.read_csv('csv_files/Proteins.csv')

In [132]:
proteins.head(10)

Unnamed: 0,TUBA1B,0.025812249,0.114868547,0.039467029,0.164589862,-0.096525527,0.019700681,-0.002591546,0.099907368,0.042163492,...,0.024021325,-0.023119985,0.261665059,0.179903164,-0.067031579,0.250224495,-0.124336088,0.08165204,0.00944257,-0.06840385
0,ACTG1,0.000416,-0.05416,0.100341,0.023625,-0.06712,0.048944,0.076832,0.09189,0.024801,...,-0.162245,0.014562,0.118578,-0.028608,-0.208031,-0.037726,-0.101082,0.031508,0.0859,-0.225454
1,TUBB4B,-0.088755,-0.067824,0.084958,0.072881,-0.044365,0.065666,0.017401,0.045955,0.045929,...,-0.103476,-0.118308,0.125296,-0.100258,-0.095145,0.15044,-0.143795,0.10073,-0.005605,-0.147657
2,HBB,0.157828,-0.827798,-0.050724,-0.675763,-1.104865,-0.443659,0.476736,-0.545614,-0.15328,...,0.024241,0.097263,0.415171,0.296723,0.30729,-0.09984,-0.031416,-0.273378,0.206035,0.581175
3,GAPDH,0.072899,-0.007606,-0.02354,0.136452,0.012224,0.051385,0.06725,0.028715,0.057526,...,-0.160431,0.010148,0.231075,0.234622,-0.078019,0.144545,-0.03171,0.163577,0.085359,0.019764
4,HBA2,0.154047,-0.956292,-0.050428,-0.563935,-1.276744,-0.498043,0.382287,-0.676703,-0.030467,...,-0.011693,0.118344,0.456283,0.305501,0.306064,-0.131145,0.003923,-0.238403,0.374431,0.461211
5,SPTAN1,-0.045855,-0.03556,0.022052,-0.072983,0.090556,0.019037,0.071292,-0.025271,-0.042318,...,-0.077548,0.022194,-0.030857,-0.080586,-0.002678,0.028117,-0.066086,0.011087,0.028705,-0.076349
6,DPYSL2,0.06595,0.001286,0.014256,0.18976,0.018954,0.030938,-0.091357,0.032064,0.104706,...,-0.096669,0.024178,0.37223,0.138455,0.008448,0.204549,-0.0003,0.285529,-0.055993,-0.022201
7,SPTBN1,-0.015957,0.005202,-0.000139,-0.020462,0.044539,-0.052377,0.044861,-0.041211,-0.012246,...,-0.036425,-0.017915,-0.051819,-0.051165,0.055617,0.003923,-0.010211,0.033795,-0.005548,0.042559
8,CKB,-0.039686,-0.121929,0.012872,0.114658,-0.107376,-0.019236,0.091971,-0.051485,0.110854,...,0.019868,-0.024531,0.099417,0.135772,-0.123357,0.033652,0.070175,0.12706,0.047799,0.138158
9,PKM,0.081944,-0.071066,0.162457,0.092396,0.079172,0.149398,0.013927,-0.00441,-0.039993,...,-0.058542,-0.055328,0.09514,-0.130894,-0.142615,0.163489,-0.099725,-0.085582,-0.030203,-0.033967


In [133]:
# Transpose the DataFrame
proteins_transpose = proteins.transpose()
proteins_transpose.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3321,3322,3323,3324,3325,3326,3327,3328,3329,3330
TUBA1B,ACTG1,TUBB4B,HBB,GAPDH,HBA2,SPTAN1,DPYSL2,SPTBN1,CKB,PKM,...,AGFG2,PTRH2,TMED5,PODXL2,CUL4A,REEP6,NPC1,DSCR3,HAX1,VKORC1L1
0.025812249,0.000416,-0.088755,0.157828,0.072899,0.154047,-0.045855,0.06595,-0.015957,-0.039686,0.081944,...,-0.086443,,-0.633261,0.159727,,0.429527,-0.001607,0.590137,-0.116056,
0.114868547,-0.05416,-0.067824,-0.827798,-0.007606,-0.956292,-0.03556,0.001286,0.005202,-0.121929,-0.071066,...,0.03406,,-0.087358,-0.002128,,-0.011036,,,-0.171433,
0.039467029,0.100341,0.084958,-0.050724,-0.02354,-0.050428,0.022052,0.014256,-0.000139,0.012872,0.162457,...,,-0.410899,-0.177223,,-0.179217,-0.527957,,-0.790274,,
0.164589862,0.023625,0.072881,-0.675763,0.136452,-0.563935,-0.072983,0.18976,-0.020462,0.114658,0.092396,...,-0.33279,,,,,0.213148,,-0.128936,,
-0.096525527,-0.06712,-0.044365,-1.104865,0.012224,-1.276744,0.090556,0.018954,0.044539,-0.107376,0.079172,...,0.591717,1.484447,,0.261398,,0.23453,,0.623527,0.02367,
0.019700681,0.048944,0.065666,-0.443659,0.051385,-0.498043,0.019037,0.030938,-0.052377,-0.019236,0.149398,...,,-0.236246,0.551475,1.272439,,,,0.452177,,
-0.002591546,0.076832,0.017401,0.476736,0.06725,0.382287,0.071292,-0.091357,0.044861,0.091971,0.013927,...,,0.017237,0.137227,-0.819154,0.244657,-2.164349,-0.235973,-0.569174,0.188316,-0.407177
0.099907368,0.09189,0.045955,-0.545614,0.028715,-0.676703,-0.025271,0.032064,-0.041211,-0.051485,-0.00441,...,0.060379,-0.056896,,-0.28865,,-0.98305,,-0.312086,0.697252,
0.042163492,0.024801,0.045929,-0.15328,0.057526,-0.030467,-0.042318,0.104706,-0.012246,0.110854,-0.039993,...,-0.198928,,,0.137645,0.574422,0.237895,,,,


In [134]:
proteins_transpose.columns = proteins_transpose.iloc[0]
proteins_transpose = proteins_transpose.drop(proteins_transpose.index[0])

In [135]:
proteins_transpose.head(10)

TUBA1B,ACTG1,TUBB4B,HBB,GAPDH,HBA2,SPTAN1,DPYSL2,SPTBN1,CKB,PKM,...,AGFG2,PTRH2,TMED5,PODXL2,CUL4A,REEP6,NPC1,DSCR3,HAX1,VKORC1L1
0.025812249,0.000416,-0.088755,0.157828,0.072899,0.154047,-0.045855,0.06595,-0.015957,-0.039686,0.081944,...,-0.086443,,-0.633261,0.159727,,0.429527,-0.001607,0.590137,-0.116056,
0.114868547,-0.05416,-0.067824,-0.827798,-0.007606,-0.956292,-0.03556,0.001286,0.005202,-0.121929,-0.071066,...,0.03406,,-0.087358,-0.002128,,-0.011036,,,-0.171433,
0.039467029,0.100341,0.084958,-0.050724,-0.02354,-0.050428,0.022052,0.014256,-0.000139,0.012872,0.162457,...,,-0.410899,-0.177223,,-0.179217,-0.527957,,-0.790274,,
0.164589862,0.023625,0.072881,-0.675763,0.136452,-0.563935,-0.072983,0.18976,-0.020462,0.114658,0.092396,...,-0.33279,,,,,0.213148,,-0.128936,,
-0.096525527,-0.06712,-0.044365,-1.104865,0.012224,-1.276744,0.090556,0.018954,0.044539,-0.107376,0.079172,...,0.591717,1.484447,,0.261398,,0.23453,,0.623527,0.02367,
0.019700681,0.048944,0.065666,-0.443659,0.051385,-0.498043,0.019037,0.030938,-0.052377,-0.019236,0.149398,...,,-0.236246,0.551475,1.272439,,,,0.452177,,
-0.002591546,0.076832,0.017401,0.476736,0.06725,0.382287,0.071292,-0.091357,0.044861,0.091971,0.013927,...,,0.017237,0.137227,-0.819154,0.244657,-2.164349,-0.235973,-0.569174,0.188316,-0.407177
0.099907368,0.09189,0.045955,-0.545614,0.028715,-0.676703,-0.025271,0.032064,-0.041211,-0.051485,-0.00441,...,0.060379,-0.056896,,-0.28865,,-0.98305,,-0.312086,0.697252,
0.042163492,0.024801,0.045929,-0.15328,0.057526,-0.030467,-0.042318,0.104706,-0.012246,0.110854,-0.039993,...,-0.198928,,,0.137645,0.574422,0.237895,,,,
0.140697359,0.08091,0.097695,-0.226023,-0.106895,-0.316395,-0.073514,0.016917,-0.091123,-0.000417,0.067669,...,-0.127688,-0.173111,0.234793,-0.018004,-0.339277,-0.96954,,-0.080315,0.193889,


In [136]:
# checking for missing values

In [137]:
proteins_missing_val = proteins_transpose.isnull().any()
proteins_missing_val

TUBA1B
ACTG1       False
TUBB4B      False
HBB         False
GAPDH       False
HBA2        False
            ...  
REEP6        True
NPC1         True
DSCR3        True
HAX1         True
VKORC1L1     True
Length: 3331, dtype: bool

In [146]:
# Reset index for proper handling
proteins_transpose.reset_index(drop=True, inplace=True)

In [152]:
proteins_transpose.head(10)

TUBA1B,ACTG1,TUBB4B,HBB,GAPDH,HBA2,SPTAN1,DPYSL2,SPTBN1,CKB,PKM,...,AGFG2,PTRH2,TMED5,PODXL2,CUL4A,REEP6,NPC1,DSCR3,HAX1,VKORC1L1
0,0.000416,-0.088755,0.157828,0.072899,0.154047,-0.045855,0.06595,-0.015957,-0.039686,0.081944,...,-0.086443,,-0.633261,0.159727,,0.429527,-0.001607,0.590137,-0.116056,
1,-0.05416,-0.067824,-0.827798,-0.007606,-0.956292,-0.03556,0.001286,0.005202,-0.121929,-0.071066,...,0.03406,,-0.087358,-0.002128,,-0.011036,,,-0.171433,
2,0.100341,0.084958,-0.050724,-0.02354,-0.050428,0.022052,0.014256,-0.000139,0.012872,0.162457,...,,-0.410899,-0.177223,,-0.179217,-0.527957,,-0.790274,,
3,0.023625,0.072881,-0.675763,0.136452,-0.563935,-0.072983,0.18976,-0.020462,0.114658,0.092396,...,-0.33279,,,,,0.213148,,-0.128936,,
4,-0.06712,-0.044365,-1.104865,0.012224,-1.276744,0.090556,0.018954,0.044539,-0.107376,0.079172,...,0.591717,1.484447,,0.261398,,0.23453,,0.623527,0.02367,
5,0.048944,0.065666,-0.443659,0.051385,-0.498043,0.019037,0.030938,-0.052377,-0.019236,0.149398,...,,-0.236246,0.551475,1.272439,,,,0.452177,,
6,0.076832,0.017401,0.476736,0.06725,0.382287,0.071292,-0.091357,0.044861,0.091971,0.013927,...,,0.017237,0.137227,-0.819154,0.244657,-2.164349,-0.235973,-0.569174,0.188316,-0.407177
7,0.09189,0.045955,-0.545614,0.028715,-0.676703,-0.025271,0.032064,-0.041211,-0.051485,-0.00441,...,0.060379,-0.056896,,-0.28865,,-0.98305,,-0.312086,0.697252,
8,0.024801,0.045929,-0.15328,0.057526,-0.030467,-0.042318,0.104706,-0.012246,0.110854,-0.039993,...,-0.198928,,,0.137645,0.574422,0.237895,,,,
9,0.08091,0.097695,-0.226023,-0.106895,-0.316395,-0.073514,0.016917,-0.091123,-0.000417,0.067669,...,-0.127688,-0.173111,0.234793,-0.018004,-0.339277,-0.96954,,-0.080315,0.193889,


In [158]:
# replace missing values with mean
#proteins_mean = proteins_transpose[['NPC1']].mean()
#proteins_mean

TUBA1B
NPC1   -0.041679
dtype: object

In [159]:
#proteins_transpose['NPC1'].fillna(proteins_mean, inplace=True)

In [161]:
proteins_transpose.fillna(0, inplace=True)

In [162]:
proteins_missing_val_1 = proteins_transpose.isnull().any()
proteins_missing_val_1

TUBA1B
ACTG1       False
TUBB4B      False
HBB         False
GAPDH       False
HBA2        False
            ...  
REEP6       False
NPC1        False
DSCR3       False
HAX1        False
VKORC1L1    False
Length: 3331, dtype: bool

In [163]:
proteins_transpose.to_csv("csv_files/proteins_new.csv", index=False)

## meta.csv

Provides experimental conditions, sample IDs, age, sex, diagnosis, tissue type, and batch information for each sample.

In [78]:
meta = pd.read_csv('csv_files/meta.csv')

In [79]:
meta.head(10)

Unnamed: 0,GSM,Age,Sex,Diagnosis,Tissue,Batch
0,GSM3577568,93,Female,Alzheimer's disease,Brain; fusiform gyrus,1
1,GSM3577569,77,Male,Alzheimer's disease,Brain; fusiform gyrus,1
2,GSM3577570,92,Male,Alzheimer's disease,Brain; fusiform gyrus,1
3,GSM3577571,85,Male,Alzheimer's disease,Brain; fusiform gyrus,1
4,GSM3577572,83,Male,Alzheimer's disease,Brain; fusiform gyrus,1
5,GSM3577573,85,Male,Alzheimer's disease,Brain; fusiform gyrus,1
6,GSM3577574,93,Female,Alzheimer's disease,Brain; fusiform gyrus,1
7,GSM3577575,96,Male,Alzheimer's disease,Brain; fusiform gyrus,1
8,GSM3577576,96,Female,Alzheimer's disease,Brain; fusiform gyrus,1
9,GSM3577577,88,Female,Alzheimer's disease,Brain; fusiform gyrus,1


In [80]:
# checking for missing values

In [81]:
meta_missing_val = meta.isnull().any()
meta_missing_val

GSM          False
Age           True
Sex           True
Diagnosis    False
Tissue       False
Batch        False
dtype: bool

In [82]:
meta_missing_val = meta.isnull().sum()
meta_missing_val

GSM          0
Age          1
Sex          1
Diagnosis    0
Tissue       0
Batch        0
dtype: int64

In [83]:
# encode the sex column
le_sex = LabelEncoder()
meta['Sex_n'] = le_sex.fit_transform(meta['Sex'])

In [84]:
# replace the missing values with mean()
Sex_mean = meta['Sex_n'].mean()
Sex_mean

0.9571428571428572

In [85]:
# since the mean is 0.9 and sex can't be in decimals so rounding of the value
round(Sex_mean)

1

In [86]:
# replace the NA valuesfor male=1, female=0
meta['Sex_n'].fillna(round(Sex_mean), inplace=True)

In [87]:
# replace the NA valuesfor male=1, female=0 for the string column
meta['Sex'].fillna('Male', inplace=True)

In [88]:
meta.head(10)

Unnamed: 0,GSM,Age,Sex,Diagnosis,Tissue,Batch,Sex_n
0,GSM3577568,93,Female,Alzheimer's disease,Brain; fusiform gyrus,1,0
1,GSM3577569,77,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
2,GSM3577570,92,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
3,GSM3577571,85,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
4,GSM3577572,83,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
5,GSM3577573,85,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
6,GSM3577574,93,Female,Alzheimer's disease,Brain; fusiform gyrus,1,0
7,GSM3577575,96,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
8,GSM3577576,96,Female,Alzheimer's disease,Brain; fusiform gyrus,1,0
9,GSM3577577,88,Female,Alzheimer's disease,Brain; fusiform gyrus,1,0


In [89]:
# Convert 'Age' column to numeric, coercing errors to NaN
meta['Age'] = pd.to_numeric(meta['Age'], errors='coerce')

# Replace NaN values with 0 in 'Age' column
meta['Age'].fillna(0, inplace=True)

# Convert 'Age' column to integer
meta['Age'] = meta['Age'].astype(int)

In [90]:
# replace the missing values with mean()
Age_mean = meta['Age'].mean()
Age_mean

64.85714285714286

In [91]:
# replace the NA values with mean
meta['Age'].fillna(meta['Age'].mean(), inplace=True)

In [92]:
meta.head(10)

Unnamed: 0,GSM,Age,Sex,Diagnosis,Tissue,Batch,Sex_n
0,GSM3577568,93,Female,Alzheimer's disease,Brain; fusiform gyrus,1,0
1,GSM3577569,77,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
2,GSM3577570,92,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
3,GSM3577571,85,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
4,GSM3577572,83,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
5,GSM3577573,85,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
6,GSM3577574,93,Female,Alzheimer's disease,Brain; fusiform gyrus,1,0
7,GSM3577575,96,Male,Alzheimer's disease,Brain; fusiform gyrus,1,1
8,GSM3577576,96,Female,Alzheimer's disease,Brain; fusiform gyrus,1,0
9,GSM3577577,88,Female,Alzheimer's disease,Brain; fusiform gyrus,1,0


In [93]:
meta_missing_val = meta.isnull().any()
meta_missing_val

GSM          False
Age          False
Sex          False
Diagnosis    False
Tissue       False
Batch        False
Sex_n        False
dtype: bool

In [97]:
meta.to_csv("csv_files/meta_new.csv", index=False)

# combinedSTARcounts.csv

Contains STAR count data from RNA seq experiments. It lists the number of unmapped reads across different samples, with each column representing a specific sample. This raw count data will require further normalization and differential expression analysis.​

In [69]:
combinedSTAR = pd.read_csv('csv_files/combinedSTARcounts.csv')

In [70]:
combinedSTAR.head(10)

Unnamed: 0,N_unmapped,./GSM1376252,./GSM1376253,./GSM1376254,./GSM1376255,./GSM1376256,./GSM1376257,./GSM1376258,./GSM1376259,./GSM1376260,...,./GSM4403304,./GSM4403307,./GSM4403308,./GSM4403309,./GSM4403310,./GSM4403311,./GSM4636679,./GSM4636680,./GSM4636681,./GSM4636682
0,N_multimapping,0,0,0,0,0,0,0,0,0,...,4947058,6332941,4128773,3722245,2963347,3127395,2856189,4964848,854938,9773683
1,N_noFeature,0,0,0,0,0,0,0,0,0,...,6677514,5517055,5791810,3760436,5354065,4291684,5515906,3673654,853466,3756792
2,N_ambiguous,0,0,0,0,0,0,0,0,0,...,125207,118670,141932,161794,162896,179798,118270,56726,16497,103998
3,DDX11L1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,WASH7P,0,0,0,0,0,0,0,0,0,...,0,62,0,23,0,0,0,0,0,0
5,MIR6859-3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,MIR6859-2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,MIR6859-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,MIR6859-4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,MIR1302-2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
# checking for missing values

In [72]:
combinedSTAR_missing_val = combinedSTAR.isnull().any()
combinedSTAR_missing_val

N_unmapped      False
./GSM1376252    False
./GSM1376253    False
./GSM1376254    False
./GSM1376255    False
                ...  
./GSM4403311    False
./GSM4636679    False
./GSM4636680    False
./GSM4636681    False
./GSM4636682    False
Length: 768, dtype: bool

In [73]:
combinedSTAR_missing_val = combinedSTAR.isnull().sum()
combinedSTAR_missing_val

N_unmapped      0
./GSM1376252    0
./GSM1376253    0
./GSM1376254    0
./GSM1376255    0
               ..
./GSM4403311    0
./GSM4636679    0
./GSM4636680    0
./GSM4636681    0
./GSM4636682    0
Length: 768, dtype: int64

In [98]:
combinedSTAR.to_csv("csv_files/combinedSTAR_new.csv", index=False)

# Microarray.csv

Contains Microarray expression data for cross-validation and comparative analysis with RNA-seq data.​

In [100]:
microarray = pd.read_csv('csv_files/Microarray.csv')

In [101]:
microarray.head(10) 

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V2037,V2038,V2039,V2040,V2041,V2042,V2043,V2044,V2045,V2046
0,A1BG,5.582546,5.316083,5.639426,5.531861,5.492184,5.619112,5.424081,5.370087,5.597164,...,5.503977,5.435669,5.291581,5.293762,5.545232,5.413534,5.326086,5.315767,5.220495,5.657885
1,A4GALT,4.588665,4.567148,4.336331,4.464754,4.454753,4.382402,4.352616,4.478181,4.816772,...,4.559906,4.542629,4.410823,4.337349,4.479923,4.414422,4.521428,4.608279,4.48442,4.604231
2,A4GNT,4.729963,4.607885,4.419786,4.691284,4.967895,4.542279,4.597191,4.618374,5.458812,...,4.604645,4.593086,4.841435,4.994085,4.337664,4.805862,4.600341,4.278393,4.554369,4.418984
3,AAAS,5.201235,4.771399,5.024912,5.222125,5.172302,4.980001,5.13889,5.060142,5.04199,...,5.044219,4.994378,4.879115,5.0191,4.932031,4.966196,5.009232,4.92818,4.983588,5.001185
4,AACS,5.51677,5.090253,5.129032,5.502365,5.142271,5.62466,5.486132,5.519051,5.317086,...,5.12691,4.503967,5.678005,5.670121,5.413771,5.265686,5.389198,5.054624,4.97682,5.459347
5,AADAT,4.32105,4.223453,4.564527,4.36615,4.431567,4.151446,4.441678,4.442235,4.316715,...,4.308273,4.382805,4.222056,4.282696,4.307498,4.231064,4.393983,4.120549,4.18131,4.310877
6,AAK1,5.403211,5.35578,5.159841,5.443067,5.223076,5.635608,5.593506,5.427302,5.406536,...,5.555753,5.738224,5.531037,5.130048,5.610595,5.472773,5.670954,5.78174,5.749025,5.588248
7,AAMP,4.682736,4.615632,4.6212,4.537199,4.595105,4.687976,4.806862,4.681141,4.454123,...,4.776131,4.99035,4.738362,4.7443,4.69691,4.77763,4.629461,4.830475,4.762774,4.68509
8,AARS,5.779412,5.892408,5.458466,5.836839,5.531098,5.783979,5.766949,5.546404,5.625244,...,5.542206,5.485895,5.510172,5.619324,5.566223,5.592551,5.57267,5.709698,5.518494,5.450246
9,AASDH,4.476559,4.67746,4.310792,4.407091,4.347562,4.303371,4.44679,4.317594,4.316065,...,4.620338,4.314468,4.265021,4.371578,4.237609,4.511817,4.429191,4.552967,4.488417,4.312414


In [102]:
# checking for missing values

In [103]:
microarray_missing_val = microarray.isnull().any()
microarray_missing_val

V1       False
V2       False
V3       False
V4       False
V5       False
         ...  
V2042    False
V2043    False
V2044    False
V2045    False
V2046    False
Length: 2046, dtype: bool

In [104]:
microarray_missing_val = microarray.isnull().sum()
microarray_missing_val

V1       0
V2       0
V3       0
V4       0
V5       0
        ..
V2042    0
V2043    0
V2044    0
V2045    0
V2046    0
Length: 2046, dtype: int64

In [105]:
microarray.to_csv("csv_files/microarray_new.csv", index=False)

# miRNA

File includes microRNA expression profiles across different samples. Each column represents a specific sample, and this data will aid in identifying post-transcriptional regulatory mechanisms in Alzheimer's disease.​

In [106]:
miRNA = pd.read_csv('csv_files/miRNA.csv')
miRNA.head(10)

Unnamed: 0,miRNA_IDS,GSM3403761,GSM3403762,GSM3403763,GSM3403764,GSM3403765,GSM3403766,GSM3403767,GSM3403768,GSM3403769,...,GSM3405261,GSM3405262,GSM3405263,GSM3405264,GSM3405265,GSM3405266,GSM3405267,GSM3405268,GSM3405269,GSM3405270
0,MIMAT0000062,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,4.151464,1.547006,2.31427,1.183973,2.441098,3.093713,2.854996,3.629315,2.138062,1.896355
1,MIMAT0000063,2.307579,2.50538,1.983125,1.560269,3.302472,4.319297,1.964171,0.697365,1.491916,...,4.151464,1.547006,2.31427,1.183973,1.57132,3.517899,2.854996,3.013971,2.138062,1.664736
2,MIMAT0000064,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,4.151464,1.547006,2.31427,2.286674,1.57132,3.093713,2.854996,3.013971,2.138062,3.253439
3,MIMAT0000065,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,4.151464,1.547006,2.31427,1.183973,1.57132,3.093713,2.854996,3.013971,2.138062,4.077636
4,MIMAT0000066,2.307579,1.503044,1.549877,2.232974,4.79347,4.319297,0.575922,-0.180237,1.491916,...,4.151464,1.547006,2.31427,2.777865,1.57132,3.093713,2.854996,3.013971,2.138062,1.664736
5,MIMAT0000067,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,4.151464,1.547006,2.31427,1.183973,1.57132,3.093713,2.854996,3.013971,2.138062,1.664736
6,MIMAT0000068,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,4.971346,1.547006,2.31427,2.593765,1.57132,3.093713,2.854996,3.013971,2.138062,1.664736
7,MIMAT0000069,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,4.151464,1.547006,2.31427,1.183973,1.57132,3.093713,2.854996,3.013971,2.138062,1.664736
8,MIMAT0000070,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,4.151464,3.397922,2.31427,1.183973,1.821418,3.093713,2.854996,3.013971,2.138062,1.664736
9,MIMAT0000071,2.843409,3.349936,3.081569,1.890881,3.650599,4.319297,0.575922,0.556748,3.346453,...,4.151464,3.541742,2.31427,1.183973,1.57132,3.093713,2.854996,3.013971,2.138062,1.664736


In [107]:
miRNA_missing_val = miRNA.isnull().any()
miRNA_missing_val

miRNA_IDS     False
GSM3403761    False
GSM3403762    False
GSM3403763    False
GSM3403764    False
              ...  
GSM3405266    False
GSM3405267    False
GSM3405268    False
GSM3405269    False
GSM3405270    False
Length: 1310, dtype: bool

In [108]:
miRNA.to_csv("csv_files/miRNA_new.csv", index=False)

# AlzheimerTermsNonRedundant.txt  

File is a curated list of pathways associated with Alzheimer's disease, including relevant genes and URLs for further reference. This data is essential for pathway enrichment analysis and understanding the specific biological processes involved in Alzheimer's pathology.​

In [None]:
# no prepsrocessing needs to be done on this file