<a href="https://colab.research.google.com/github/NikhilTailor9733/BreastCancerDetection/blob/main/BreastCancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Input: raw GEO series matrix. Output: the same table written as CSV.
matrix_path = "/content/breastcancer/GSE15852_series_matrix.txt"
csv_path = "/content/breastcancer/breastcancerdataset.csv"

# Read the tab-separated table ('!'-prefixed content is treated as comments,
# the first column becomes the index), then flip it so that samples are rows
# and genes are columns.
df = pd.read_csv(
    matrix_path,
    sep='\t',
    comment='!',
    quotechar='"',
    index_col=0,
).T

# Persist the transposed table, keeping the sample IDs as the first column.
df.to_csv(csv_path, index=True)

print("✅ File converted successfully!")
print("Saved as: breastcancerdataset.csv")
print("Shape:", df.shape)


✅ File converted successfully!
Saved as: breastcancerdataset.csv
Shape: (86, 2833)


In [None]:
# Preview the first three rows of df.
df.head(3)

ID_REF,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,203297_s_at,203298_s_at,203299_s_at,203300_x_at,203301_s_at,203302_at,203303_at,203304_at,203305_at,203306_s_at
GSM398074,1881.8,78.0658,1299.98,3086.72,353.89,248.635,1381.12,31.2698,128.276,724.081,...,970.532,235.725,1171.56,1656.81,648.54,205.046,1235.09,644.133,1054.01,723.556
GSM398075,2317.51,61.354,775.547,2335.15,303.653,356.766,745.809,59.8119,95.1752,101.793,...,1066.51,487.538,1139.91,564.659,1311.74,295.966,1516.42,735.727,606.149,906.968
GSM398076,1553.86,80.0525,1103.74,3139.65,523.873,297.729,983.263,130.43,92.6413,97.817,...,1877.95,237.877,1818.28,490.861,235.447,190.834,1551.83,1039.66,1795.6,926.587


In [None]:
# Report df's dimensions as (rows, columns).
df.shape

(86, 2833)

In [None]:
# List the column labels of df.
df.columns

Index(['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at',
       '1316_at', '1320_at', '1405_i_at', '1431_at',
       ...
       '203297_s_at', '203298_s_at', '203299_s_at', '203300_x_at',
       '203301_s_at', '203302_at', '203303_at', '203304_at', '203305_at',
       '203306_s_at'],
      dtype='object', name='ID_REF', length=2833)

In [None]:
import gzip, itertools
path = "/content/breastcancer/GSE15852_series_matrix.txt.gz"

# Print the first 40 lines of the compressed file, numbered from 1, with
# surrounding whitespace removed from each line.
with gzip.open(path, mode='rt', encoding='utf-8', errors='ignore') as handle:
    preview = itertools.islice(handle, 40)
    for line_no, raw_line in enumerate(preview, start=1):
        print(line_no, raw_line.strip())


1 !Series_title	"Expression data from human breast tumors and their paired normal tissues"
2 !Series_geo_accession	"GSE15852"
3 !Series_status	"Public on Apr 28 2009"
4 !Series_submission_date	"Apr 27 2009"
5 !Series_last_update_date	"Aug 10 2018"
6 !Series_pubmed_id	"20097481"
7 !Series_summary	"Microarray is widely used to monitor gene expression changes in breast cancer. The transcriptomic changes in breast cancer is commonly occured during the transition of normal cells to cancerous cells. This is the first study on gene expression profiling of multi ethnic of Malaysian breast cancer patients (Malays, Chinese and Indian). We aim to identify differentially expressed genes between tumors and normal tissues. We have identified a set of 33 significant differentially expressed genes in the tumor vs. normal group at p<0.001."
8 !Series_summary	"We study the gene expression patterns of 43 breast tumors and their paired normal control by using Affymetrix genechip U133A.  We have identified

In [None]:
import pandas as pd
import gzip
import re

meta_path = "/content/breastcancer/GSE15852_series_matrix.txt.gz"
TITLE_KEY = "!Sample_title"
ACCESSION_KEY = "!Sample_geo_accession"


def _read_sample_header(path, keys):
    """Collect the values of the requested '!Sample_*' header rows.

    The gzip file is streamed line by line and reading stops as soon as every
    requested key has been seen or the data table begins, so the expression
    matrix itself is never pulled into memory.

    Parameters:
        path: location of the gzip-compressed series matrix file.
        keys: header row names to look for (matched against the first
            tab-separated field of each line).

    Returns:
        dict mapping each key that was found to the list of its values, with
        double quotes and surrounding whitespace removed. Keys that never
        appear are simply absent from the dict.
    """
    found = {}
    with gzip.open(path, 'rt', encoding='utf-8', errors='ignore') as f:
        for line in f:
            if line.startswith("!series_matrix_table_begin"):
                break  # header section is over; only data rows follow
            fields = line.strip().split("\t")
            key = fields[0]
            if key in keys and key not in found:
                found[key] = [re.sub(r'["]', '', p).strip() for p in fields[1:]]
                if len(found) == len(keys):
                    break
    return found


# Step 1-2: Pull the sample titles (and, when present, the sample accessions)
# out of the metadata header.
header = _read_sample_header(meta_path, (TITLE_KEY, ACCESSION_KEY))

titles = header.get(TITLE_KEY)
if not titles:
    raise ValueError("No !Sample_title line found in metadata!")

# Step 3: Reduce each title to its first word, e.g. "Normal BC001" -> "Normal"
labels = []
for title in titles:
    words = title.split()
    if not words:
        raise ValueError("Empty sample title in !Sample_title line; cannot derive a label.")
    labels.append(words[0].capitalize())

print("Extracted labels count:", len(labels))
print("Unique labels found:", set(labels))

# Anything other than the two known classes would otherwise be silently
# encoded as 0 below, so stop here instead.
unexpected = set(labels) - {"Cancer", "Normal"}
if unexpected:
    raise ValueError(f"Unexpected sample labels: {sorted(unexpected)}")

# Step 4: Load numeric dataset
df = pd.read_csv("/content/breastcancer/breastcancerdataset.csv", index_col=0)
print("Matrix shape:", df.shape)

# Step 5: Line the labels up with the rows of df.
# Prefer matching on sample accession (the row index of df) so that a change
# in row order or a subset of samples cannot shift labels onto the wrong row;
# fall back to positional alignment only when no accession row is available.
accessions = header.get(ACCESSION_KEY)
if accessions and len(accessions) == len(labels):
    label_by_sample = dict(zip(accessions, labels))
    unlabeled = [sample for sample in df.index if sample not in label_by_sample]
    if unlabeled:
        raise ValueError(f"No metadata label for samples: {unlabeled[:5]}")
    sample_labels = [label_by_sample[sample] for sample in df.index]
elif len(labels) == len(df):
    sample_labels = labels
else:
    # Previously only a warning was printed, which let later cells run against
    # a frame with no 'Diagnosis' column (or a stale CSV from an earlier run).
    raise ValueError(
        f"Label count ({len(labels)}) does not match sample count ({len(df)})!"
    )

df["Diagnosis"] = [1 if label == "Cancer" else 0 for label in sample_labels]

# Save ready dataset
out = "/content/breastcancer/breastcancerdataset_ready.csv"
df.to_csv(out)
print("✅ Saved:", out)
print("Final shape:", df.shape)
print(df["Diagnosis"].value_counts())


Extracted labels count: 86
Unique labels found: {'Cancer', 'Normal'}
Matrix shape: (86, 2833)
✅ Saved: /content/breastcancer/breastcancerdataset_ready.csv
Final shape: (86, 2834)
Diagnosis
0    43
1    43
Name: count, dtype: int64


In [None]:
# List the column labels again; 'Diagnosis' was appended by the labelling cell.
df.columns

Index(['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at',
       '1316_at', '1320_at', '1405_i_at', '1431_at',
       ...
       '203298_s_at', '203299_s_at', '203300_x_at', '203301_s_at', '203302_at',
       '203303_at', '203304_at', '203305_at', '203306_s_at', 'Diagnosis'],
      dtype='object', length=2834)

In [None]:
# Reload the labelled dataset from disk, using the first CSV column as the index.
ready_csv = "/content/breastcancer/breastcancerdataset_ready.csv"
df = pd.read_csv(ready_csv, index_col=0)


In [None]:
# fill missing numeric values with column mean
# Replace every missing numeric entry with the mean of its own column.
# NOTE(review): the means are taken over all rows, and the train/test split
# happens in a later cell, so held-out samples contribute to the fill values.
# Consider imputing after the split using training-set statistics only.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)
print("✅ Missing values filled successfully.")

remaining_nan = df.isna().sum().sum()
print("Remaining NaN count:", remaining_nan)


✅ Missing values filled successfully.
Remaining NaN count: 0


In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'Diagnosis' column; every other column is a feature.
y = df["Diagnosis"]
X = df.drop(columns=["Diagnosis"])

# Hold out 20% of the samples, stratified on the target, with a fixed seed.
# NOTE(review): the series summary describes tumors with paired normal
# tissue; a per-sample split may put both tissues of one patient on opposite
# sides of the split. A group-aware split may be more appropriate — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both the training and the held-out rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the scaled training data (iteration cap 1000).
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# fit() returns the estimator itself, so this shows the same cell output.
lr

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predictions on the held-out rows; reused by the cells that follow.
y_pred_lr = lr.predict(X_test_scaled)

# Testing accuracy, as a percentage.
lr_test_accuracy = accuracy_score(y_test, y_pred_lr) * 100
lr_test_accuracy

94.44444444444444

In [None]:
# Training accuracy, as a percentage: score the model on the rows it was fitted on.
y_train_pred_lr = lr.predict(X_train_scaled)
accuracy_score(y_train, y_train_pred_lr) * 100

100.0

In [None]:
# Confusion matrix for the logistic-regression predictions on the held-out rows.
confusion_matrix(y_true=y_test, y_pred=y_pred_lr)

array([[9, 0],
       [1, 8]])

In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)

              precision    recall  f1-score   support

           0       0.90      1.00      0.95         9
           1       1.00      0.89      0.94         9

    accuracy                           0.94        18
   macro avg       0.95      0.94      0.94        18
weighted avg       0.95      0.94      0.94        18



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 200-tree random forest with a fixed seed on the scaled training data.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(
    X_train_scaled, y_train
)

# fit() returns the estimator itself, so this shows the same cell output.
rf


In [None]:
# Score the random forest on the held-out rows and on its own training rows,
# both expressed as percentages.
rf_test_accuracy = accuracy_score(y_test, rf.predict(X_test_scaled)) * 100
rf_train_accuracy = accuracy_score(y_train, rf.predict(X_train_scaled)) * 100

print("RF Test Accuracy:", rf_test_accuracy)
print("RF Train Accuracy:", rf_train_accuracy)

RF Test Accuracy: 100.0
RF Train Accuracy: 100.0
