In [15]:
# ! pip install pandas numpy scipy matplotlib seaborn scikit-learn mygene

In [16]:
import pandas as pd
from scipy.stats import ttest_ind

# Load the data
df = pd.read_csv('GSE19804_series_matrix.txt', sep="\t", comment='!', index_col=0)

df = df.transpose()

# OPTIONAL: drop any columns not gene probes
# For example, keep only float columns (gene expression)
df = df.select_dtypes(include='float64')

# Step 1: Assign sample labels (first 60 = tumor, next 60 = normal)
labels = ['Tumor'] * 60 + ['Normal'] * 60
df['label'] = labels

# Step 2: Split data
tumor = df[df['label'] == 'Tumor'].drop(columns=['label'])
normal = df[df['label'] == 'Normal'].drop(columns=['label'])

# Step 3: Perform t-test for each gene
results = []
for gene in tumor.columns:
    stat, p = ttest_ind(tumor[gene], normal[gene])
    results.append((gene, stat, p))

# Step 4: Convert to DataFrame and sort by p-value
result_df = pd.DataFrame(results, columns=['Gene', 'T-statistic', 'P-value'])
result_df = result_df.sort_values('P-value')

# Show top 10 differentially expressed genes
print(result_df.head(10))


              Gene  T-statistic       P-value
26336  217046_s_at   -20.023918  9.296289e-40
25211  215918_s_at   -20.015457  9.662377e-40
11972  202524_s_at   -18.973455  1.204164e-37
37794    228540_at   -18.342796  2.387803e-36
39723    230469_at   -18.326222  2.584557e-36
48899    239650_at   -18.020563  1.120214e-35
3845    1557729_at   -17.851949  2.528399e-35
15654    206208_at   -17.714688  4.918132e-35
15655  206209_s_at   -17.377253  2.549939e-34
27056    217771_at    17.266428  4.391610e-34


In [17]:
import pandas as pd
import mygene

mg = mygene.MyGeneInfo()

# Convert probe list
probes = result_df['Gene'].tolist()
annotations = mg.querymany(probes, scopes='reporter', fields='symbol', species='human')

# Turn into DataFrame
annot_df = pd.DataFrame(annotations)

# Remove rows with no symbol
annot_df = annot_df[annot_df['symbol'].notna()]

# Map gene symbols back to your result_df
probe_to_symbol = dict(zip(annot_df['query'], annot_df['symbol']))
result_df['GeneSymbol'] = result_df['Gene'].map(probe_to_symbol)

# Drop rows with no gene symbol
result_df = result_df.dropna(subset=['GeneSymbol'])

print(result_df[['Gene', 'GeneSymbol', 'P-value']].head())


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
1861 input query terms found dup hits:	[('205200_at', 2), ('222738_at', 2), ('227452_at', 2), ('223531_x_at', 2), ('224448_s_at', 2), ('225
11130 input query terms found no hit:	['237390_at', '236383_at', '235642_at', '1557094_at', '243172_at', '242340_at', '238151_at', '238891


              Gene GeneSymbol       P-value
26336  217046_s_at       AGER  9.296289e-40
25211  215918_s_at     SPTBN1  9.662377e-40
11972  202524_s_at     SPOCK2  1.204164e-37
37794    228540_at        QKI  2.387803e-36
39723    230469_at      RTKN2  2.584557e-36


In [None]:

import matplotlib.pyplot as plt
import numpy as np


plt.figure(figsize=(10, 6))
plt.scatter(result_df['T-statistic'], result_df['P-value'],
            c=(result_df['P-value'] < 0.05), cmap='coolwarm', alpha=0.7)
plt.xlabel('T-statistic')
plt.ylabel('-log10(p-value)')
plt.title('Volcano Plot of Differential Gene Expression')
plt.axhline(-np.log10(0.05), color='gray', linestyle='--')
plt.show()

ValueError: Key backend: 'module://matplotlib_inline.backend_inline' is not a valid value for backend; supported values are ['gtk3agg', 'gtk3cairo', 'gtk4agg', 'gtk4cairo', 'macosx', 'nbagg', 'notebook', 'qtagg', 'qtcairo', 'qt5agg', 'qt5cairo', 'tkagg', 'tkcairo', 'webagg', 'wx', 'wxagg', 'wxcairo', 'agg', 'cairo', 'pdf', 'pgf', 'ps', 'svg', 'template']

In [None]:
# Only keep genes with p-value < 0.05
significant_genes = result_df[result_df['P-value'] < 0.05]

df2 = df[significant_genes["Gene"].values]
df2.shape

(120, 25709)

In [None]:
df2.head()

ID_REF,217046_s_at,215918_s_at,202524_s_at,228540_at,230469_at,239650_at,1557729_at,206208_at,206209_s_at,217771_at,...,208327_at,230724_s_at,227795_at,1557309_at,220258_s_at,1562409_s_at,228752_at,1563298_at,1569097_at,206917_at
GSM494556,7.68281,6.53858,8.60144,6.83675,4.19828,4.11814,6.27737,6.89603,6.21657,12.1128,...,6.50076,5.40516,6.68314,3.81622,7.10997,6.83689,8.16486,4.37974,4.37751,4.98437
GSM494557,7.3199,5.98864,8.44409,6.7838,3.9729,3.40786,5.38056,6.84327,6.95,8.75553,...,6.34675,5.76333,6.97842,4.05884,7.30112,7.4129,6.48879,4.18676,4.53679,4.60314
GSM494558,7.17118,6.16938,9.89768,7.10125,4.20079,4.56416,6.78433,8.58719,8.86094,9.15013,...,6.43341,5.41595,7.08009,3.98211,7.0638,7.23086,7.32753,4.45175,4.36626,5.16514
GSM494559,6.76456,5.97161,7.96763,6.7271,3.74731,3.96403,6.3798,6.91088,5.84398,9.42616,...,6.30243,6.91349,6.47317,3.83431,7.2166,6.98912,6.83328,3.84752,4.2335,4.50363
GSM494560,7.28154,5.51378,7.50781,7.11012,3.94579,4.82867,5.99469,6.45942,5.11954,10.937,...,5.69766,6.05344,6.38801,3.92579,7.25245,6.75474,6.61974,4.11094,4.44954,4.56833


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Use top 50 genes
top_genes = result_df['Gene'].head(50)
X = df[top_genes]
y = df['label']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))


Accuracy: 0.8333333333333334


In [19]:
pip install matplotlib-inline

Note: you may need to restart the kernel to use updated packages.
