Enter description and fancy title here. Should include general directions as well. Make it *fancy*.

In [4]:
"""Receive user input for genes of interest to be analyzed. This section will initialize
a GUI, ask the user to input a list of genes to analyze, and check this list against a
dictionary of genes and IDs from Ensembl. A list of valid genes and IDs will be generated
and returned as a pair of values for future functions to utilize."""

# Import necessary modules and establish variables and dictionaries.
import tkinter as tk
from tkinter import simpledialog, messagebox
import pandas as pd

# Lookup table from gene symbol (upper case) to the Ensembl gene ID used to
# search the data files.
genedict = {
    "RIGI": "ENSG00000107201",
    "IRF7": "ENSG00000276561",
    "IFNB1": "ENSG00000171855",
    "TLR3": "ENSG00000164342",
    "IFNL1": "ENSG00000291872",
    "IL6": "ENSG00000136244",
    "SPRR2A": "ENSG00000241794",
    "SFN": "ENSG00000175793",
    "COA3": "ENSG00000183978",
    "VAMP8": "ENSG00000118640",
    "LAG3": "ENSG00000089692",
    "IL2": "ENSG00000109471",
}

# Genes entered by the user through the GUI; filled in by UserInputListGUI.add_item.
genelist = []

# The basis of a GUI; creates a window, sets parameters, and populates that window with buttons.
class UserInputListGUI:
    """Tkinter window that collects the user's genes of interest.

    Each entered gene is normalised (surrounding whitespace removed, upper-cased),
    shown in a list box, and appended to the module-level ``genelist``.
    ``check_and_close`` compares that list against the module-level ``genedict``
    before the window is destroyed.
    """

    def __init__(self, master, window_width=400, window_height=300):
        """Populate *master* with the list box and buttons and size the window.

        Args:
            master: Tk window that hosts the widgets.
            window_width: Window width in pixels.
            window_height: Window height in pixels.
        """
        self.master = master
        self.master.title("Gene of Interest Input List")

        self.listbox = tk.Listbox(self.master)
        self.listbox.pack(pady=10)

        add_button = tk.Button(self.master, text="Add Gene", command=self.add_item)
        add_button.pack(pady=5)

        close_button = tk.Button(self.master, text="Check and Close", command=self.check_and_close)
        close_button.pack(pady=10)

        self.master.geometry(f"{int(window_width)}x{int(window_height)}")

    def add_item(self):
        """Ask the user for one gene and add it to the list box and ``genelist``.

        Nothing is added when the dialog is cancelled or when the input is empty
        or consists only of whitespace.
        """
        GOI_input = simpledialog.askstring(
            "GoI Input",
            "If a gene contains Greek letters, enter them as their English "
            "equivalent and without special punctuation. For example, if you are "
            "looking for 'IFN-beta', enter 'IFNB'.\nPlease enter a gene:",
        )
        # askstring returns None when the dialog is cancelled.
        if not GOI_input:
            return

        # Strip stray spaces so that e.g. " il6 " still matches the "IL6" key in
        # genedict, and so that a whitespace-only entry is not stored as a gene.
        gene = GOI_input.strip().upper()
        if not gene:
            return

        genelist.append(gene)
        self.listbox.insert(tk.END, gene)

    def check_and_close(self):
        """Validate ``genelist`` against ``genedict`` and close the window.

        If the list is empty a warning is shown and the window stays open. If any
        entered gene is missing from ``genedict`` the user is asked whether to
        close anyway; answering "no" keeps the window open so more genes can be
        added.
        """
        if not genelist:
            messagebox.showwarning("Empty List", "The list is empty. Please add items before closing.")
            return

        invalid_items = [item for item in genelist if item not in genedict]

        if invalid_items:
            message = f"WARNING! Invalid genes: {', '.join(invalid_items)}\nDo you still want to close?"
            response = messagebox.askyesno("Invalid Genes", message)
            if not response:
                return

        self.master.destroy()

# Entry point for the GUI: builds the window, attaches the widgets, and runs the event loop.
def main():
    """Create the Tk root window, build the gene-input GUI in it, and block until it closes."""
    window = tk.Tk()
    # The button callbacks hold references to the GUI object, so no local name is needed.
    UserInputListGUI(window)
    window.mainloop()


if __name__ == "__main__":
    main()

# Converts all user-inputted genes to an ID from Ensembl as long as they are in the dictionary.
# Returns both the valid list of genes and IDs.
def convertID(genes=None, gene_ids=None):
    """Translate gene symbols into Ensembl IDs, dropping unknown symbols.

    Args:
        genes: Iterable of gene symbols to convert. Defaults to the module-level
            ``genelist``.
        gene_ids: Mapping of gene symbol to Ensembl ID. Defaults to the
            module-level ``genedict``.

    Returns:
        A tuple ``(validlist, convertlist)`` of two lists of equal length: the
        symbols found in the mapping, in input order, and their matching IDs.
        A symbol that appears more than once in the input is returned only
        once, so the same ID is not searched for (and its rows collected)
        twice downstream.
    """
    if genes is None:
        genes = genelist
    if gene_ids is None:
        gene_ids = genedict

    validlist = []
    convertlist = []
    seen = set()
    for gene in genes:
        if gene in seen or gene not in gene_ids:
            continue
        seen.add(gene)
        validlist.append(gene)
        convertlist.append(gene_ids[gene])
    return (validlist, convertlist)

# Unpack the pair returned by convertID(): the gene symbols found in genedict
# and their matching Ensembl IDs.
GOIlist, IDlist = convertID()

"""Check IDs against each set of data and isolate rows containing genes of interest."""
# Define a function to check if the entered values are included in a row from entered data
def checkID(data, IDs):
    """Collect every row of *data* in which any cell contains one of *IDs*.

    Each cell is compared as text, case-insensitively, and a match only requires
    the ID to appear somewhere inside the cell. The ID is treated as literal
    text, not as a regular expression.

    Args:
        data: pandas DataFrame to search.
        IDs: Iterable of ID strings to look for.

    Returns:
        A DataFrame with a fresh 0..n-1 index holding the matching rows, grouped
        in the order the IDs were given (a row that matches several IDs appears
        once per matching ID). When nothing matches, or *IDs* is empty, the
        result is empty but keeps the columns of *data* so later column
        selection still works.
    """
    # Convert to text once instead of once per row per ID.
    text = data.astype(str)

    per_id_matches = []
    for ID in IDs:
        # Column-wise substring test, then flag rows where any column matched.
        hits = text.apply(
            lambda col: col.str.contains(ID, case=False, na=False, regex=False)
        )
        per_id_matches.append(data[hits.any(axis=1)])

    if per_id_matches:
        matching_rows = pd.concat(per_id_matches, ignore_index=True)
    else:
        # No IDs to search for: return an empty frame that still has data's columns.
        matching_rows = data.iloc[0:0].copy()

    # Print the entire row(s) if matches are found
    if not matching_rows.empty:
        print(f"Matching row(s) for {IDs}:\n{matching_rows}")
    else:
        print(f"No matching rows found for {IDs}.")
    return matching_rows

# Define a function to reduce the matching rows to the columns needed for analysis
def consolidate(data, columns=None):
    """Return only the analysis columns of *data*.

    Args:
        data: pandas DataFrame of matching rows.
        columns: List of column names to keep. Defaults to the module-level
            ``paraInt`` list.

    Returns:
        A DataFrame containing just the requested columns, in the given order.

    Raises:
        KeyError: If any requested column is missing from *data*.
    """
    if columns is None:
        columns = paraInt
    return data[columns]

# Set parameters for analysis: the columns kept by consolidate()
paraInt = ['gene', 'n_reads', 'probability_modified', 'kmer', 'mod_ratio']

# Column names applied to every input CSV (both files share the same layout)
DATA_COLUMNS = ['ensembl_id1', 'ensembl_id2', 'transcript_id1', 'transcript_id2',
                'gene_full', 'gene', 'bp', 'biotype', 'transcript_position', 'n_reads',
                'probability_modified', 'kmer', 'mod_ratio']


def _load_dataset(path, label):
    """Read one CSV into a DataFrame, rename its columns, and print a preview.

    Args:
        path: Location of the CSV file.
        label: Heading printed above the first five rows.

    Returns:
        The loaded DataFrame with its columns renamed to ``DATA_COLUMNS``.
    """
    frame = pd.read_csv(path)
    frame.columns = DATA_COLUMNS

    # Display the first five rows of the DataFrame as a table
    print(f'{label}:')
    print(frame.head())
    return frame


# MOCK DATA
# Input the CSV file path; replace it with the path to your own mock-sample CSV file
csv_file_path = r'C:\Users\crues\Documents\Python\Project\Data\24Nmockhbec_0.9.csv'
mockDF = _load_dataset(csv_file_path, 'MOCK DATA')

# Run functions and get data for analysis
mock_complete_data = checkID(mockDF, IDlist)
mock_analysis = consolidate(mock_complete_data)

# INFECTED DATA
# Input the CSV file path; replace it with the path to your own infected-sample CSV file
csv_file_path = r'C:\Users\crues\Documents\Python\Project\Data\24npr8hbec_0.9.csv'
infDF = _load_dataset(csv_file_path, 'INFECTED DATA')

# Run functions and get data for analysis
inf_complete_data = checkID(infDF, IDlist)
inf_analysis = consolidate(inf_complete_data)

"""Use data from genes of interest to perform analysis and generate necessary figures."""

MOCK DATA:
          ensembl_id1         ensembl_id2        transcript_id1  \
0  ENST00000344843.11   ENSG00000242485.5  OTTHUMG00000002916.3   
1   ENST00000196061.4  ENSG00000083444.16  OTTHUMG00000002393.4   
2   ENST00000196061.4  ENSG00000083444.16  OTTHUMG00000002393.4   
3   ENST00000196061.4  ENSG00000083444.16  OTTHUMG00000002393.4   
4   ENST00000196061.4  ENSG00000083444.16  OTTHUMG00000002393.4   

         transcript_id2   gene_full    gene    bp         biotype  \
0  OTTHUMT00000008139.1  MRPL20-201  MRPL20   721  protein_coding   
1  OTTHUMT00000006865.1   PLOD1-201   PLOD1  2940  protein_coding   
2  OTTHUMT00000006865.1   PLOD1-201   PLOD1  2940  protein_coding   
3  OTTHUMT00000006865.1   PLOD1-201   PLOD1  2940  protein_coding   
4  OTTHUMT00000006865.1   PLOD1-201   PLOD1  2940  protein_coding   

   transcript_position  n_reads  probability_modified   kmer  mod_ratio  
0                  548       25              0.988229  GGACT   0.800000  
1                 2538 

'Use data from genes of interest to perform analysis and generate necessary figures.'