In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import GEOparse

# Plot appearance for this notebook: seaborn's 'whitegrid' style and a
# 12x6 default figure size for every matplotlib figure created afterwards.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirm the imports worked and record the library versions this run used,
# so the saved output documents the environment.
print(
    "Libraries imported successfully!",
    f"NumPy version: {np.__version__}",
    f"Pandas version: {pd.__version__}",
    sep="\n",
)

Libraries imported successfully!
NumPy version: 2.4.1
Pandas version: 2.3.3


In [2]:
# Choose your dataset (GEO series accession). Candidates:
#   GSE68849  - Lung cancer vs. normal tissue
#   GSE50760  - Colorectal cancer progression
#   GSE114762 - Drug response
# NOTE(review): these one-line descriptions are not verified anywhere in this
# notebook -- confirm each against its GEO series record before relying on it.

geo_accession = 'GSE68849'

print(f"Downloading {geo_accession} from GEO database...")
print("This may take 2-5 minutes depending on dataset size...")

# Fetch the series' family SOFT archive into destdir and parse it into a GSE
# object. destdir is a relative path, so it resolves against the kernel's
# current working directory.
gse = GEOparse.get_GEO(geo=geo_accession, destdir='../data/raw/')

# Fail fast with a clear message: the cells below pivot per-sample tables and
# would otherwise break later, and less obviously, on a series with no samples.
assert gse.gsms, f"{geo_accession} was parsed but contains no samples"

print("✓ Download complete!")
print(f"Dataset: {gse.name}")
print(f"Number of samples: {len(gse.gsms)}")

18-Jan-2026 13:13:19 DEBUG utils - Directory ../data/raw/ already exists. Skipping.
18-Jan-2026 13:13:19 INFO GEOparse - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE68nnn/GSE68849/soft/GSE68849_family.soft.gz to ../data/raw/GSE68849_family.soft.gz


Downloading GSE68849 from GEO database...
This may take 2-5 minutes depending on dataset size...


100%|██████████| 21.7M/21.7M [00:00<00:00, 29.5MB/s]
18-Jan-2026 13:13:20 DEBUG downloader - Size validation passed
18-Jan-2026 13:13:20 DEBUG downloader - Moving /var/folders/lp/7xzmk01d31zd2kb15mbmvbnm0000gn/T/tmp8b61u795 to /Users/tasdid/Documents/Projects/gene-expression-analysis/data/raw/GSE68849_family.soft.gz
18-Jan-2026 13:13:20 DEBUG downloader - Successfully downloaded ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE68nnn/GSE68849/soft/GSE68849_family.soft.gz
18-Jan-2026 13:13:20 INFO GEOparse - Parsing ../data/raw/GSE68849_family.soft.gz: 
18-Jan-2026 13:13:20 DEBUG GEOparse - DATABASE: GeoMiame
18-Jan-2026 13:13:20 DEBUG GEOparse - SERIES: GSE68849
18-Jan-2026 13:13:20 DEBUG GEOparse - PLATFORM: GPL10558
18-Jan-2026 13:13:21 DEBUG GEOparse - SAMPLE: GSM1684095
18-Jan-2026 13:13:21 DEBUG GEOparse - SAMPLE: GSM1684096
18-Jan-2026 13:13:21 DEBUG GEOparse - SAMPLE: GSM1684097
18-Jan-2026 13:13:21 DEBUG GEOparse - SAMPLE: GSM1684098
18-Jan-2026 13:13:21 DEBUG GEOparse - SAMPLE: GSM1684

✓ Download complete!
Dataset: GSE68849
Number of samples: 10


In [3]:
# Pivot the per-sample tables of the series into a single pandas DataFrame
# built from each sample's 'VALUE' column: one row per ID_REF, one column
# per sample.
# NOTE(review): the displayed row index is ID_REF with ILMN_* identifiers,
# which look like platform probe IDs rather than gene symbols -- confirm
# before reporting the row count as a number of genes.
expression_data = gse.pivot_samples('VALUE')

# Summarise the matrix dimensions (rows first, then sample columns).
print(
    f"Expression matrix shape: {expression_data.shape}",
    f"Genes: {len(expression_data.index):,}",
    f"Samples: {len(expression_data.columns):,}",
    "\nFirst few rows and columns:",
    sep="\n",
)

# Last expression of the cell: rich display of the first rows.
expression_data.head()

Expression matrix shape: (47321, 10)
Genes: 47,321
Samples: 10

First few rows and columns:


name,GSM1684095,GSM1684096,GSM1684097,GSM1684098,GSM1684099,GSM1684100,GSM1684101,GSM1684102,GSM1684103,GSM1684104
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ILMN_1343291,23599.32,22303.32,25980.17,24776.12,24776.12,26775.34,25980.17,25980.17,23599.32,26775.34
ILMN_1343295,6405.335,7732.652,6553.479,5296.746,7699.219,6430.156,6648.162,4801.034,8646.443,5654.935
ILMN_1651199,95.19509,106.2397,97.54575,94.69421,106.2895,108.2523,95.39828,102.891,83.56386,88.30685
ILMN_1651209,95.65199,97.22574,105.7544,100.5472,119.326,109.0526,111.9581,94.74513,115.3237,107.9074
ILMN_1651210,101.8298,97.09778,125.6475,102.2923,99.58566,119.7902,103.8358,105.9598,102.4458,94.93552
