In [1]:
import os

# Folders this project expects; the paths are relative, so they are created
# under whatever directory the kernel is currently running in.
folders = ["data/raw", "data/processed", "results/plots", "models"]

# exist_ok=True keeps the cell re-runnable when the folders are already there.
for rel_dir in folders:
    os.makedirs(rel_dir, exist_ok=True)


In [2]:
import os

import pandas as pd

# Path to the .txt/.cct file (tab-separated table; miRNA data per the file name)
file_path = r"C:\Users\sanja\cfDNA_LungCancer_ML\data\raw\Human__TCGA_LUAD__BDGSC__miRNASeq__HS_miR__01_28_2016__BI__Gene__Firehose_RPM_log2.cct[1].txt"

# Destination of the CSV copy. Bound before the try block so the name always
# exists, even when loading fails.
output_path = r"C:\Users\sanja\cfDNA_LungCancer_ML\data\processed\miRNA_TCGA_LUAD.csv"

try:
    # Load the file assuming it's tab-separated
    df = pd.read_csv(file_path, sep='\t', engine='python')

    # The first cell only creates folders relative to the working directory,
    # and the cell that creates this absolute folder runs later, so make sure
    # the destination exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save to CSV format
    df.to_csv(output_path, index=False)
except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError) as e:
    # Report only the anticipated failures (missing/unreadable input,
    # unwritable output, malformed or empty table). Anything else, such as a
    # programming error, propagates with its full traceback instead of being
    # mislabelled as a file problem.
    print(" Error while processing file:", e)
else:
    # Reporting and preview run only after a successful load and save, and sit
    # outside the try so their own failures are not swallowed.
    print(" File successfully converted to CSV.")
    print(" Saved at:", output_path)
    print(" Preview:")
    display(df.head())



 File successfully converted to CSV.
 Saved at: C:\Users\sanja\cfDNA_LungCancer_ML\data\processed\miRNA_TCGA_LUAD.csv
 Preview:


Unnamed: 0,attrib_name,TCGA.05.4384,TCGA.05.4390,TCGA.05.4396,TCGA.05.4405,TCGA.05.4410,TCGA.05.4415,TCGA.05.4417,TCGA.05.4424,TCGA.05.4425,...,TCGA.NJ.A4YG,TCGA.NJ.A4YI,TCGA.NJ.A4YP,TCGA.NJ.A4YQ,TCGA.NJ.A55A,TCGA.NJ.A55O,TCGA.NJ.A55R,TCGA.NJ.A7XG,TCGA.O1.A52J,TCGA.S2.AA1A
0,hsa-let-7a-1,13.8766,11.7425,14.0194,12.9428,12.715,13.0099,12.151,12.9538,13.7344,...,13.1164,13.0787,12.9371,11.5804,12.6737,13.726,12.3826,12.6324,11.9579,13.2691
1,hsa-let-7a-2,14.8745,12.7576,15.0255,13.9327,13.7157,14.0169,13.1524,13.9443,14.7439,...,14.1031,14.0725,13.9439,12.596,13.6404,14.7255,13.3917,13.6234,12.9604,14.2701
2,hsa-let-7a-3,13.8822,11.7578,14.0367,12.9499,12.7252,13.0417,12.1721,12.9644,13.7445,...,13.1158,13.0935,12.947,11.5914,12.6664,13.7416,12.3986,12.6361,11.9678,13.2772
3,hsa-let-7b,13.8259,13.0601,14.5902,14.217,13.7465,12.6094,13.1777,14.0479,14.5261,...,13.7042,13.5944,14.3297,12.3857,13.3484,13.8372,12.8563,13.4904,12.5495,14.037
4,hsa-let-7c,10.6177,7.608,11.1171,11.1093,10.3613,9.2237,9.483,10.6913,10.8448,...,9.9743,10.2977,9.5997,8.9539,11.6456,10.6305,9.2932,10.1604,10.7978,10.9685


In [3]:
import os

import pandas as pd

# Path to the methylation dataset (.gz file)
file_path = r"C:\Users\sanja\cfDNA_LungCancer_ML\data\raw\Human__TCGA_LUAD__JHU_USC__Methylation__Meth450__01_28_2016__BI__CpG__Firehose_Methylation_Prepocessor.cct.gz"

# Destination of the CSV copy. Bound before the try block so the name always
# exists, even when loading fails.
output_path = r"C:\Users\sanja\cfDNA_LungCancer_ML\data\processed\methylation_TCGA_LUAD.csv"

try:
    # Read compressed tab-separated file directly
    df = pd.read_csv(file_path, sep='\t', compression='gzip', engine='python')

    # Optional: Check structure (only the first 10 column names are listed)
    print("Dataset loaded successfully.")
    print("Shape:", df.shape)
    print("Columns:", df.columns[:10].tolist())

    # The first cell only creates folders relative to the working directory,
    # and the cell that creates this absolute folder runs later, so make sure
    # the destination exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save to CSV
    df.to_csv(output_path, index=False)
    print(f"Saved converted file to: {output_path}")
except (OSError, EOFError, pd.errors.ParserError, pd.errors.EmptyDataError) as e:
    # Report only the anticipated failures: missing/unreadable or invalid gzip
    # input (OSError; EOFError for a truncated archive), unwritable output, and
    # a malformed or empty table. Anything else, such as a programming error,
    # propagates with its full traceback instead of being reduced to one line.
    print("Error while converting file:", e)


Dataset loaded successfully.
Shape: (336284, 459)
Columns: ['attrib_name', 'TCGA.05.4384', 'TCGA.05.4390', 'TCGA.05.4396', 'TCGA.05.4405', 'TCGA.05.4410', 'TCGA.05.4415', 'TCGA.05.4417', 'TCGA.05.4424', 'TCGA.05.4425']
Saved converted file to: C:\Users\sanja\cfDNA_LungCancer_ML\data\processed\methylation_TCGA_LUAD.csv


In [4]:
import os
from nbformat import v4 as nbf

# Base project directory
base_dir = r"C:\Users\sanja\cfDNA_LungCancer_ML"

# Create main folders (exist_ok=True: already-present folders are left alone)
folders = [
    "data/raw",
    "data/processed",
    "notebooks",
    "results"
]

for folder in folders:
    path = os.path.join(base_dir, folder)
    os.makedirs(path, exist_ok=True)

# Notebook names and titles
notebooks = {
    "1_Data_Preprocessing.ipynb": "# Data Preprocessing\n\nThis notebook handles loading, cleaning, and preparing the cfDNA and miRNA data.",
    "2_Data_Integration_and_Labeling.ipynb": "#  Data Integration and Labeling\n\nThis notebook merges datasets and assigns class labels.",
    "3_Model_Training_and_Evaluation.ipynb": "# Model Training and Evaluation\n\nThis notebook builds ML models and evaluates performance.",
    "4_Feature_Analysis_and_Visualization.ipynb": "#  Feature Analysis & Visualization\n\nThis notebook shows feature importance, PCA, t-SNE, etc.",
    "5_Research_Report_and_Literature_Review.ipynb": "# Research Report & Literature Review\n\nThis notebook contains markdown write-up for final submission."
}

# Collect every (path, text) pair first; the files are written in one place below.
scaffold_files = []

# Each notebook starts with a single markdown title cell
for nb_name, md_text in notebooks.items():
    nb_path = os.path.join(base_dir, "notebooks", nb_name)
    nb = nbf.new_notebook()
    nb.cells.append(nbf.new_markdown_cell(md_text))
    scaffold_files.append((nb_path, nbf.writes(nb)))

# README.md
readme_path = os.path.join(base_dir, "README.md")
scaffold_files.append((
    readme_path,
    "# 🧬 cfDNA + miRNA Lung Cancer Detection Project\n\nThis project integrates cfDNA methylation and miRNA expression data to predict early-stage lung cancer using machine learning.",
))

# requirements.txt
req_path = os.path.join(base_dir, "requirements.txt")
scaffold_files.append((
    req_path,
    "pandas\nnumpy\nscikit-learn\nmatplotlib\nseaborn\nnotebook\n",
))

# Mode "x" creates a file only when it does not exist yet. Re-running this
# cell therefore never overwrites a notebook, README, or requirements file
# that has been edited since the scaffold was first generated.
kept_files = []
for target_path, text in scaffold_files:
    try:
        with open(target_path, "x", encoding="utf-8") as f:
            f.write(text)
    except FileExistsError:
        kept_files.append(target_path)

for target_path in kept_files:
    print("Already exists, left unchanged:", target_path)

print("Project structure and files created successfully.")


Project structure and files created successfully.


In [5]:
import os

# Define base path
base_path = r"C:\Users\sanja\cfDNA_LungCancer_ML"


def create_file_if_absent(path, text):
    """Create ``path`` containing ``text`` unless a file is already there.

    Uses exclusive-create mode so an existing file is never truncated.

    Parameters:
        path: Location of the file to create.
        text: Content written to a newly created file (encoded as UTF-8).

    Returns:
        True if the file was created, False if it already existed and was
        left untouched.
    """
    try:
        with open(path, "x", encoding="utf-8") as f:
            f.write(text)
    except FileExistsError:
        return False
    return True


# Create 'app' folder
app_folder = os.path.join(base_path, "app")
os.makedirs(app_folder, exist_ok=True)

# Create the placeholder Streamlit file; on a re-run an existing app file is
# kept as is instead of being replaced by the placeholder.
streamlit_file = os.path.join(app_folder, "streamlit_app.py")
streamlit_created = create_file_if_absent(streamlit_file, "# Streamlit app will go here\n")

# Optionally create README.md (likewise never overwritten)
readme_file = os.path.join(app_folder, "README.md")
create_file_if_absent(readme_file, "# Streamlit App for Lung Cancer Prediction\n")

print(f"'app/' folder created at: {app_folder}")
if streamlit_created:
    print(f"Streamlit app file created at: {streamlit_file}")
else:
    print(f"Streamlit app file already exists, left unchanged: {streamlit_file}")


'app/' folder created at: C:\Users\sanja\cfDNA_LungCancer_ML\app
Streamlit app file created at: C:\Users\sanja\cfDNA_LungCancer_ML\app\streamlit_app.py
