# Overall Project



In [None]:
# Standard Library

# Third Party Library
from Bio import Entrez
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from fastapi.testclient import TestClient

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Local Library


In [None]:
# Set up NCBI Entrez: Bio.Entrez reads this module-level attribute as the
# contact address for the E-utilities calls made by the fetch functions below.
# The value here is a placeholder and must be edited before running.
Entrez.email = "your_email@example.com"  # Replace with your email


In [None]:
# Acquire article metadata from NCBI (using E-utilities)
def fetch_ncbi_data(search_term, db="pubmed", retmax=3):
    """Search an NCBI database and return one flat dict per matching article.

    Args:
        search_term: Entrez query string passed to ``esearch``.
        db: Entrez database name. The parsing below expects the PubMed XML
            layout (``PubmedArticle`` -> ``MedlineCitation`` -> ``Article``).
        retmax: Maximum number of articles to retrieve.

    Returns:
        A list of dicts with the keys ``title``, ``abstract``, ``authors``,
        ``journal``, ``publication_date``, ``accessed``, ``version`` and
        ``url``. Optional fields that a record does not carry are ``None``
        (``abstract`` becomes an empty string, ``authors`` an empty list).
        An empty list is returned when the search matches nothing.

    Raises:
        KeyError: If the fetched record has no ``PubmedArticle`` section or an
            article lacks ``MedlineCitation`` / ``Article``.
    """
    import datetime

    # Limit the search itself so no more IDs are requested than will be fetched.
    handle = Entrez.esearch(db=db, term=search_term, retmax=retmax)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    ids = record["IdList"]
    if not ids:
        # efetch with an empty ID list is an error; there is nothing to fetch.
        return []

    summary_handle = Entrez.efetch(
        db=db,
        id=ids,
        retmode="xml",
        rettype="abstract",
    )
    try:
        summary_record = Entrez.read(summary_handle)
    finally:
        summary_handle.close()

    accessed = datetime.datetime.now()

    # Convert to a list of dictionaries for easier processing
    data = []
    for article in summary_record["PubmedArticle"]:
        citation = article["MedlineCitation"]
        details = citation["Article"]
        journal = details.get("Journal", {})
        # PubDate is nested under JournalIssue; older records may only carry
        # a free-text MedlineDate instead of a Year.
        pub_date = journal.get("JournalIssue", {}).get("PubDate", {})
        abstract_parts = details.get("Abstract", {}).get("AbstractText", [])
        # Group authors have a CollectiveName instead of a LastName.
        authors = [
            author.get("LastName") or author.get("CollectiveName")
            for author in details.get("AuthorList", [])
        ]
        # PubMed articles have no Version/URL elements; the record version is
        # an attribute of the PMID and the URL is derived from the PMID.
        pmid = citation.get("PMID")
        data.append({
            "title": details.get("ArticleTitle"),
            "abstract": " ".join(str(part) for part in abstract_parts),
            "authors": [name for name in authors if name],
            "journal": journal.get("Title"),
            "publication_date": pub_date.get("Year") or pub_date.get("MedlineDate"),
            "accessed": accessed,
            "version": getattr(pmid, "attributes", {}).get("Version"),
            "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else None,
            # Add other desired fields...
        })
    return data


In [None]:
# Fetch data from NCBI
# `search_term` is also reused by the GEO example call further down, and
# `data` is the frame the train/test split cell operates on.
search_term = "genomics AND cancer AND canine"
ncbi_data = fetch_ncbi_data(search_term)
# Wrap whatever fetch_ncbi_data returned in a DataFrame for the later cells.
data =pd.DataFrame(ncbi_data)


In [None]:
def fetch_ncbi_geo_data(search_term, platform=None, organism=None):
    """Search GEO DataSets (``db="gds"``) and collect metadata per hit.

    Args:
        search_term: Entrez query string.
        platform: Optional platform accession (e.g. ``"GPL570"``); when given
            it is AND-ed onto the query as an ``[ACCN]`` term.
        organism: Optional organism name (e.g. ``"Homo sapiens"``); when given
            it is AND-ed onto the query as an ``[Organism]`` term.

    Returns:
        A list of dicts, one per record that could be fetched and parsed.
        Records that fail for any reason are reported on stdout and skipped,
        so the list may be shorter than the number of search hits.
    """
    import datetime

    # Apply the optional filters to the query; without this the two
    # parameters would be accepted but have no effect on the search.
    query = search_term
    if organism:
        query = f'({query}) AND "{organism}"[Organism]'
    if platform:
        query = f"({query}) AND {platform}[ACCN]"

    handle = Entrez.esearch(db="gds", term=query)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    ids = record["IdList"]
    data = []
    for gse_id in ids:
        # Best-effort per record: one bad record must not abort the whole run.
        try:
            summary_handle = Entrez.efetch(db="gds", id=gse_id, rettype="full", retmode="xml")
            try:
                summary_record = Entrez.read(summary_handle)
            finally:
                summary_handle.close()

            # NOTE(review): the key layout below is taken as-is from the
            # previous implementation; confirm it against a real `gds`
            # response (several keys look like Gene-database fields).
            entry = summary_record[0]
            series = entry["Series"]
            series_matrix_url = series["Data_Processed_Set"]["Data_Processed_Set_List"][0]["Data_Processed_Set_Series"]["Data_Processed_Set_Series_List"][0]["Data_Processed_Set_Series_Platform"]["URL"]
            dataset_info = {
                "gene_symbol": series["Gene Symbol"],
                "also_known_as": series["Also known as"],
                "gse_id": gse_id,
                "platform": series["Platform_Ref"]["ID"],
                "organism": series["Organism"]["Organism_Name"],
                "title": series["Title"],
                "summary": series["Summary"],
                "series_matrix_url": series_matrix_url,
                "accessed": datetime.datetime.now(),
                "updated": series["Updated On"],
                # The dict used to list "version" twice; only the later
                # Series_Version value ever survived, so it is kept here.
                "version": series["Series_Version"],
                "url": series["URL"],
                "source": "NCBI GEO",
                "accession": series["Series_ID"],
                "pubmed_ids": entry["Pubmed_IDs"].split(";"),
                "sample_count": len(series["Samples"]),
                # NOTE(review): this stores the entire Series mapping; a
                # specific status key was presumably intended -- confirm.
                "refseq_status": series,
                "orthologs": series["Orthologs"],
                "expression": series["Expression"],
                "gene_type": series["Gene Type"],
                "location": series["Location"],
                "exon_count": series["Exon Count"],
                "transcript_count": series["Transcript Count"],
                "expressions": series["Expressions"],
            }
            data.append(dataset_info)
        except Exception as e:
            print(f"Error fetching GSE {gse_id}: {e}")
    return data

def download_series_matrix(series_matrix_url, filename="series_matrix.txt", timeout=60):
    """Download a Series Matrix file over HTTP(S) and save it to disk.

    Args:
        series_matrix_url: URL of the file to download.
        filename: Local path the response body is written to.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        ``filename``, so callers can pass the saved path straight on.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    import requests

    print(f"Downloading Series Matrix from {series_matrix_url}")
    response = requests.get(series_matrix_url, timeout=timeout)
    # Fail before touching the filesystem if the server reported an error.
    response.raise_for_status()
    with open(filename, "wb") as file:
        file.write(response.content)
    print(f"Series Matrix file downloaded to {filename}")
    return filename

# Example usage: run the GEO search with the module-level `search_term`
# defined in the PubMed cell above.
# NOTE(review): `search_term` mentions "canine" while the organism argument
# is "Homo sapiens" -- confirm which one is intended.

geo_datasets = fetch_ncbi_geo_data(search_term, organism="Homo sapiens")


In [None]:
# Bare expression: shows the list returned by fetch_ncbi_geo_data as the cell output.
geo_datasets 


In [None]:
# --- Differential Gene Expression Analysis (Simplified Example) ---

def analyze_geo_data(series_matrix_file):
    """Run a differential-expression analysis on a downloaded Series Matrix.

    Args:
        series_matrix_file: Path of the tab-separated Series Matrix file to
            load. A falsy value falls back to ``"series_matrix.txt"``, the
            default output name of ``download_series_matrix``.

    Returns:
        The object returned by ``dds.results(...)`` for the contrast between
        the second and the first column of the design matrix.

    Raises:
        KeyError: If the loaded table has no ``"your_categorical_column"``
            column (a placeholder that must be replaced with a real one).
    """
    # NOTE(review): confirm that a `deseq2` package exposing these names is
    # installed in the target environment; `DeSeq` is imported but not used.
    from deseq2 import DESeqDataSetFromMatrix, DeSeq

    # Read the file the caller asked for instead of a hard-coded path.
    matrix_path = series_matrix_file or "series_matrix.txt"
    # GEO Series Matrix files prefix every metadata line with "!"; treating
    # it as a comment marker leaves only the expression table for parsing.
    data = pd.read_csv(matrix_path, sep="\t", index_col=0, comment="!")
    design_matrix = pd.get_dummies(data["your_categorical_column"])

    # NOTE(review): in Python `~0` is bitwise NOT (i.e. -1), so `design` is
    # `-1 + design_matrix.columns`, not an R-style "~0 + ..." formula.
    # TODO confirm what the DESeq wrapper expects here.
    dds = DESeqDataSetFromMatrix(
        countData=data.iloc[:, 1:],
        colData=design_matrix,
        design=~0 + design_matrix.columns)

    dds = dds.DESeq(dds)

    results = dds.results(contrast=[design_matrix.columns[1], design_matrix.columns[0]])

    # Diagnostic output for interactive use.
    print(results.info())
    print(results.head())
    print(results.summary())
    print(results.describe())

    return results


In [None]:

# --- FastAPI ---

app = FastAPI()


@app.get("/geo_data")
async def get_geo_data(search_term: str, platform: str = None, organism: str = None):
    """Search GEO, download each hit's Series Matrix and analyse it.

    Args:
        search_term: Entrez query string forwarded to ``fetch_ncbi_geo_data``.
        platform: Optional platform filter forwarded unchanged.
        organism: Optional organism filter forwarded unchanged.

    Returns:
        A ``JSONResponse`` holding one object per dataset: either
        ``{"gse_id", "analysis_results"}`` or, when that dataset failed,
        ``{"gse_id", "error"}``.
    """
    import json

    # NOTE(review): the helpers below do blocking network and disk I/O inside
    # an async handler; consider a plain `def` endpoint or a thread pool.
    geo_datasets = fetch_ncbi_geo_data(search_term, platform, organism)
    results = []
    for dataset in geo_datasets:
        # Best-effort per dataset: report the failure and keep going.
        try:
            # Name the target file explicitly instead of relying on the
            # download helper's return value.
            series_matrix_file = "series_matrix.txt"
            download_series_matrix(dataset["series_matrix_url"], series_matrix_file)
            # analyze_geo_data is a regular function, so it must not be awaited.
            analysis_results = analyze_geo_data(series_matrix_file)
            if isinstance(analysis_results, pd.DataFrame):
                # JSONResponse cannot serialize a DataFrame; to_json also
                # turns NaN into null, which plain JSON encoding rejects.
                analysis_results = json.loads(
                    analysis_results.reset_index().to_json(orient="records")
                )
            results.append({
                "gse_id": dataset["gse_id"],
                "analysis_results": analysis_results
            })
        except Exception as e:
            print(f"Error processing GSE {dataset['gse_id']}: {e}")
            results.append({"gse_id": dataset['gse_id'], "error": str(e)})

    return JSONResponse(results)


In [None]:

# --- Test the API ---
# Exercise the /geo_data route in-process through FastAPI's TestClient.
# Only `search_term` is supplied (URL-encoded "genomics AND cancer AND human"),
# so `platform` and `organism` take their None defaults.

client = TestClient(app)
response = client.get("/geo_data?search_term=genomics%20AND%20cancer%20AND%20human")
# Print the decoded JSON body returned by the endpoint.
print(response.json())


In [None]:

# Hold out 20% of the rows for evaluation. "your_target_column" is a
# placeholder for the label column: it is removed from the features and
# used on its own as the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="your_target_column"),
    data["your_target_column"],
    test_size=0.2,
)



In [None]:
# Fit a logistic-regression classifier (default settings) on the training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)


In [None]:
# Score the fitted model on the held-out split and report its accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")
