# Ανάλυση Γονιδιακής Έκφρασης από GEO (Καρκίνος Πνεύμονα)

Ανάλυση δεδομένων από GEO datasets (GSE43458, GSE31210, GSE19804) με στόχο την ταυτοποίηση σημαντικών γονιδίων στον καρκίνο του πνεύμονα.


# Φόρτωση GEO Datasets

In [30]:
import pandas as pd
import numpy as np

In [31]:
# Load the two GEO series matrices: GSE31210 into df1, GSE19804 into df2.
# Both are read with the same options: gzip-compressed, tab-separated, and
# with comment='!' so pandas ignores the '!'-prefixed lines of the file.
_series_matrix_opts = dict(compression='gzip', sep='\t', comment='!', low_memory=False)

df1 = pd.read_csv(
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE31nnn/GSE31210/matrix/GSE31210_series_matrix.txt.gz",
    **_series_matrix_opts,
)

df2 = pd.read_csv(
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE19nnn/GSE19804/matrix/GSE19804_series_matrix.txt.gz",
    **_series_matrix_opts,
)

# Εύρεση Κοινών Γονιδίων και Προεπεξεργασία

In [32]:
# Collect the identifiers found in the first column (ID_REF) of each of the
# two loaded datasets, as sets, so they can be compared.
genes_1, genes_2 = (set(frame['ID_REF']) for frame in (df1, df2))

In [33]:
# Identifiers that occur in both datasets.
common_genes = genes_1 & genes_2

In [34]:
# Restrict each dataset to the shared identifiers and use ID_REF as the index.
df1_filtered = df1.set_index('ID_REF').loc[lambda frame: frame.index.isin(common_genes)]
df2_filtered = df2.set_index('ID_REF').loc[lambda frame: frame.index.isin(common_genes)]

In [35]:
# Sort both filtered frames by their ID_REF index so the rows appear in
# exactly the same order. The already-filtered df1_filtered is sorted
# directly rather than being rebuilt from df1 a second time.
df1_filtered = df1_filtered.sort_index()
df2_filtered = df2_filtered.sort_index()

In [36]:
# Final check that both frames carry the same index in the same order.
# Index.equals is used instead of an element-wise `==`: when the two indexes
# have different lengths the element-wise comparison raises instead of
# reporting a mismatch.
assert df1_filtered.index.equals(df2_filtered.index), (
    "df1_filtered and df2_filtered do not share the same ID_REF index"
)

In [37]:
# Join the two filtered datasets side by side along the column axis.
df_combined = pd.concat([df1_filtered, df2_filtered], axis='columns')

# Έλεγχος Τύπου Αναγνωριστικών (ID_REF)

In [38]:
# Show the first ID_REF values of each dataset to inspect the identifier type.
# df1 is read from the GSE31210 series matrix and df2 from the GSE19804 one,
# so the labels name those series.
print("GSE31210 IDs:", df1['ID_REF'].head())
print("GSE19804 IDs:", df2['ID_REF'].head())



GSE43458 IDs: 0    1007_s_at
1      1053_at
2       117_at
3       121_at
4    1255_g_at
Name: ID_REF, dtype: object
GSE31210 IDs: 0    1007_s_at
1      1053_at
2       117_at
3       121_at
4    1255_g_at
Name: ID_REF, dtype: object


# Μετατροπή Αναγνωριστικών σε Gene Symbols

In [39]:
# Load the previously downloaded annotation file (tab-separated; text after
# '#' is treated as a comment by pandas).
# The path uses a forward slash: the backslash form "datasets\G..." relied on
# "\G" being an unrecognised escape sequence inside a normal string literal,
# and a forward slash is also accepted as a separator on Windows.
annotation = pd.read_csv("datasets/GPL570-55999.txt", sep="\t", comment='#', low_memory=False)

# Show which columns the annotation table provides.
print(annotation.columns)


Index(['ID', 'GB_ACC', 'SPOT_ID', 'Species Scientific Name', 'Annotation Date',
       'Sequence Type', 'Sequence Source', 'Target Description',
       'Representative Public ID', 'Gene Title', 'Gene Symbol',
       'ENTREZ_GENE_ID', 'RefSeq Transcript ID',
       'Gene Ontology Biological Process', 'Gene Ontology Cellular Component',
       'Gene Ontology Molecular Function'],
      dtype='object')


In [40]:
# Build a lookup table from annotation 'ID' to annotation 'Gene Symbol'.
id_to_gene = {
    probe_id: symbol
    for probe_id, symbol in zip(annotation['ID'], annotation['Gene Symbol'])
}

# Add a Gene_Symbol column to both expression tables by translating ID_REF
# through the lookup table.
df1['Gene_Symbol'] = df1['ID_REF'].map(id_to_gene)
df2['Gene_Symbol'] = df2['ID_REF'].map(id_to_gene)


# Ομαδοποίηση Ανά Γονίδιο και Υπολογισμός Έκφρασης


In [41]:
# Steps 1 and 2: keep only the numeric columns (the original author treats
# these as the sample columns) and re-attach the gene symbols as a column.
df2_numeric = df2.select_dtypes(include='number').assign(Gene_Symbol=df2['Gene_Symbol'])
df1_numeric = df1.select_dtypes(include='number').assign(Gene_Symbol=df1['Gene_Symbol'])

# Step 3: group the rows by gene symbol and take the mean of each group,
# giving one row per gene symbol.
df2_grouped = df2_numeric.groupby('Gene_Symbol').mean()
df1_grouped = df1_numeric.groupby('Gene_Symbol').mean()


# Τελική Συνένωση Όλων των Δεδομένων


In [42]:
# Join the per-gene tables side by side: rows are aligned on the gene symbol
# index, and the sample columns of both datasets are placed next to each other.
# NOTE(review): the recorded preview of this table suggests the two series are
# on different value scales (one looks log-transformed, the other does not) --
# confirm, and harmonise before any cross-dataset comparison.
combined_df = pd.concat([df2_grouped, df1_grouped], axis='columns')

# Inspect the result.
print(combined_df.shape)
combined_df.head()


(23520, 366)


Unnamed: 0_level_0,GSM494556,GSM494557,GSM494558,GSM494559,GSM494560,GSM494561,GSM494562,GSM494563,GSM494564,GSM494565,...,GSM773776,GSM773777,GSM773778,GSM773779,GSM773780,GSM773781,GSM773782,GSM773783,GSM773784,GSM773785
Gene_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,6.376,6.51445,6.9353,7.98724,7.93168,7.53104,6.9225,7.62828,7.2402,7.26707,...,50.813641,51.98287,46.984982,24.260811,50.404438,52.734867,52.912245,43.067164,40.169184,7.322948
A1BG-AS1,5.31502,5.03275,5.46136,5.81214,5.95315,5.80037,5.30221,5.58133,5.49403,5.34265,...,61.151523,76.560795,91.945839,68.984255,7.022719,51.320599,14.745603,64.812755,119.829274,8.107551
A1CF,7.397725,5.081705,4.75918,4.592725,4.65241,4.582735,4.88637,4.365795,4.51583,4.965115,...,25.84943,33.761611,47.827019,42.357287,26.44256,46.527127,33.422373,51.864429,31.129011,41.863779
A2M,9.5485,9.202655,9.744765,9.553915,9.420875,9.600145,8.897035,8.92862,9.50293,9.243995,...,8782.817786,9298.270346,9957.928594,9745.025715,9916.936047,9662.780993,10082.322964,13647.352951,7967.622052,10045.380457
A2M-AS1,6.20626,6.28462,8.05477,5.69879,6.21434,6.52642,4.72729,6.21728,7.03301,5.97737,...,295.4885,392.5211,373.3923,363.7442,279.1856,327.0634,473.1294,420.2055,284.4724,460.2588


# Φόρτωση Δεδομένων και Οργάνωση για Ανάλυση

In [None]:
# Reads the sample IDs
def load_clean_ids(path):
    """Return the cleaned, non-empty sample IDs listed one per line in *path*.

    Every line has its double quotes removed and its surrounding whitespace
    stripped. Lines that are empty after this cleaning (blank lines, or lines
    holding only quotes) are skipped.

    Args:
        path: Location of the text file to read.

    Returns:
        A list of ID strings in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "utf-8-sig" decodes plain UTF-8/ASCII and also drops a leading byte-order
    # mark, which would otherwise stick to the first ID and make it unmatchable.
    with open(path, 'r', encoding='utf-8-sig') as f:
        # Remove quotes before stripping so whitespace inside the quotes goes too.
        cleaned = (line.replace('"', '').strip() for line in f)
        return [sample_id for sample_id in cleaned if sample_id]

# Read the two sample-ID lists.
cancer_samples = load_clean_ids(r"cancer-normal/cancer.txt")
normal_samples = load_clean_ids(r"cancer-normal/normal.txt")

# Keep only the IDs that are actually columns of combined_df.
cancer_samples = [sample_id for sample_id in cancer_samples if sample_id in combined_df.columns]
normal_samples = [sample_id for sample_id in normal_samples if sample_id in combined_df.columns]

print(f"Καρκινικά δείγματα: {len(cancer_samples)}")
print(f"Φυσιολογικά δείγματα: {len(normal_samples)}")


# One label per retained sample, cancer samples first, indexed by sample ID.
labels = pd.Series(
    data=['cancer' for _ in cancer_samples] + ['normal' for _ in normal_samples],
    index=[*cancer_samples, *normal_samples],
    name='label',
)

# Final dataset: select the retained sample columns, transpose so each sample
# becomes a row, then add the label column (aligned on the sample-ID index).
expression_data = combined_df.loc[:, [*cancer_samples, *normal_samples]].transpose()
expression_data['label'] = labels

# Confirmation
print("Τελικό σχήμα:", expression_data.shape)
expression_data.head()


Καρκινικά δείγματα: 286
Φυσιολογικά δείγματα: 80
📐 Τελικό σχήμα: (366, 23521)


Gene_Symbol,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A4GALT,A4GNT,AA06,...,ZYG11B,ZYX,ZZEF1,ZZZ3,abParts /// IGKC /// IGKV4-1 /// IGKV4-1,av27s1 /// TRAV39 /// TRAV39,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10,mir-223,label
GSM773540,47.90095,46.825111,72.714534,2228.91157,175.34674,22.831965,15.927107,12.925406,73.018596,101.41695,...,1030.160047,167.8929,345.420567,778.35785,6.886894,22.436699,13.24217,111.81361,13.521504,cancer
GSM773541,51.64328,8.059568,23.870435,4198.206655,142.8026,23.447401,31.932787,12.202017,46.112976,68.965219,...,2070.380533,487.5092,137.000604,1601.05855,528.10193,4.38036,4.805737,46.9255,105.879995,cancer
GSM773542,40.98813,10.234848,13.798074,4695.59322,142.81548,11.082931,31.783636,40.485352,51.89819,50.38601,...,1690.349143,384.4377,227.002517,1343.98745,210.150816,6.839572,3.302328,62.00354,205.846188,cancer
GSM773543,28.38625,8.138013,40.220118,6329.20517,106.7527,16.412923,39.129192,32.542745,19.044909,31.693239,...,2315.6907,330.22475,200.465752,1483.14315,262.97957,27.579258,9.878414,83.4741,21.877117,cancer
GSM773544,42.09003,10.482582,26.090287,3881.23013,240.99803,17.330212,23.747617,16.26797,32.913082,51.03189,...,1500.67139,274.2612,296.169413,1095.8436,609.263698,10.045916,2.294671,75.77886,17.893558,cancer
