### Nanopore Data Analyzer

Analyze .csv files obtained from nanopore sequencing. Make sure the full path to each data file (including the file name itself) is entered in the first few lines of code. Additionally, follow all directions when entering genes of interest.

In [None]:
"""Recieve user input for genes of interest to be analyzed. This section will intialize
a GUI, ask the user to input a list of genes to analyze, and check this list against a
dictionary of genes and IDs from Ensembl. A list of valid genes and IDs will be generated
and returned as a pair of values for future functions to utilize."""

# Input the CSV file path for each data set, making sure to replace *** with path to your CSV file
mock_csv_file_path = r'***'

inf_csv_file_path = r'***'

# Import necessary modules
import tkinter as tk
from tkinter import simpledialog, messagebox
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Lookup table: gene symbol -> Ensembl gene ID
genedict = {
    "RIGI": "ENSG00000107201",
    "IRF7": "ENSG00000276561",
    "IFNB1": "ENSG00000171855",
    "TLR3": "ENSG00000164342",
    "IFNL1": "ENSG00000291872",
    "IL6": "ENSG00000136244",
    "SPRR2A": "ENSG00000241794",
    "SFN": "ENSG00000175793",
    "COA3": "ENSG00000183978",
    "VAMP8": "ENSG00000118640",
    "LAG3": "ENSG00000089692",
    "IL2": "ENSG00000109471",
}

# Starts empty; the GUI appends each gene the user enters
genelist = []

# Columns kept for the analysis step
paraInt = [
    'gene',
    'n_reads',
    'probability_modified',
    'kmer',
    'mod_ratio',
]

# Column names assigned, in order, to every CSV that is loaded
data_columns = [
    'ensembl_id1',
    'ensembl_id2',
    'transcript_id1',
    'transcript_id2',
    'gene_full',
    'gene',
    'bp',
    'biotype',
    'transcript_position',
    'n_reads',
    'probability_modified',
    'kmer',
    'mod_ratio',
]

# The basis of a GUI; creates a window, sets parameters, and populates that window with buttons.
class UserInputListGUI:
    """Tkinter window in which the user builds the list of genes to analyze.

    Accepted entries are appended to the module-level ``genelist`` and shown
    in the window's listbox. The window closes through "Check and Close",
    which first compares the list against ``genedict``.

    Args:
        master: The Tk root (or other top-level widget) that hosts the controls.
        window_width: Window width in pixels.
        window_height: Window height in pixels.
    """

    def __init__(self, master, window_width=325, window_height=450):
        self.master = master
        self.master.title("Gene of Interest Input List")

        self.listbox = tk.Listbox(self.master, height=20)
        self.listbox.pack(pady=10)

        add_button = tk.Button(self.master, text="Add Gene", command=self.add_item)
        add_button.pack(pady=5)

        close_button = tk.Button(self.master, text="Check and Close", command=self.check_and_close)
        close_button.pack(pady=10)

        self.master.geometry(f"{int(window_width)}x{int(window_height)}")

    def add_item(self):
        """Ask for one gene name and add it to ``genelist`` and the listbox.

        The entry is stripped of surrounding whitespace and upper-cased.
        Nothing is added when the dialog is cancelled, the entry is blank, or
        the gene is already in the list.
        """
        # The example must be a symbol that exists in genedict ('IFNB1'),
        # otherwise following the prompt yields an "invalid gene" warning.
        GOI_input = simpledialog.askstring(
            "GoI Input",
            "If a gene contains Greek letters, enter them as their English "
            "equivalent and without special punctuation. For example, if you are "
            "looking for 'IFN-beta', enter 'IFNB1'.\nPlease enter a gene:",
        )
        if not GOI_input:
            # Dialog cancelled (None) or left empty
            return

        gene = GOI_input.strip().upper()
        if not gene:
            # Whitespace-only entry
            return
        if gene in genelist:
            # A repeated gene would be searched twice and duplicate its rows
            return

        genelist.append(gene)
        self.listbox.insert(tk.END, gene)

    def check_and_close(self):
        """Validate ``genelist`` against ``genedict`` and close the window.

        An empty list shows a warning and keeps the window open. If any gene
        is not a key of ``genedict``, the user is asked whether to close
        anyway; answering "No" keeps the window open so more genes can be
        added.
        """
        if not genelist:
            messagebox.showwarning("No Genes Selected", "The list is empty. Please add genes before closing.")
            return

        invalid_items = [item for item in genelist if item not in genedict]
        if invalid_items:
            message = f"WARNING! Invalid genes: {', '.join(invalid_items)}\nDo you still want to close?"
            if not messagebox.askyesno("Invalid Genes", message):
                return

        self.master.destroy()

# Entry point for the gene-input GUI.
def main():
    """Create the Tk root, attach the gene-input GUI, and run the event loop."""
    window = tk.Tk()
    UserInputListGUI(window)
    window.mainloop()


if __name__ == "__main__":
    main()

# Translates the user's genes into Ensembl IDs, skipping any gene missing from the dictionary.
def convertID():
    """Return ``(genes, ids)`` for every entry of ``genelist`` found in ``genedict``.

    Both lists keep the order of ``genelist`` and line up element by element;
    entries that are not keys of ``genedict`` are left out of both.
    """
    recognized = []
    ensembl_ids = []
    for symbol in genelist:
        if symbol in genedict:
            recognized.append(symbol)
            ensembl_ids.append(genedict[symbol])
    return (recognized, ensembl_ids)

# Resolve the user's genes once; both lists are passed to checkID for each data set below
GOIlist, IDlist = convertID()

"""Check IDs against each set of data and isolate rows containing genes of interest."""
# Define a function to check if the entered values are included in a row from entered data
def checkID(data, IDs, genes):
    """Return the rows of ``data`` in which any cell contains one of ``IDs``.

    Every cell is compared as text and case-insensitively; an ID matches when
    it appears anywhere inside a cell. Rows are collected ID by ID in the
    order of ``IDs``, so a row that contains several IDs appears once per ID.
    A summary of the result is printed.

    Args:
        data: DataFrame to search.
        IDs: ID strings to look for (treated as literal text, not regex).
        genes: Gene names; used only in the printed summary.

    Returns:
        DataFrame of matching rows with a fresh 0..n-1 index. It always has
        the columns of ``data``, even when ``IDs`` is empty or nothing
        matches, so later column selection does not fail.
    """
    # Convert to text once, rather than once per row for every ID
    as_text = data.astype(str)

    frames = []
    for ID in IDs:
        hits = np.zeros(len(data), dtype=bool)
        for _, column in as_text.items():
            # regex=False: the ID is plain text, so no character is special
            found = column.str.contains(ID, case=False, na=False, regex=False)
            hits |= found.to_numpy(dtype=bool)
        frames.append(data[hits])

    if frames:
        matching_rows = pd.concat(frames, ignore_index=True)
    else:
        # No IDs to search for: keep the column layout, just with zero rows
        matching_rows = data.iloc[0:0].reset_index(drop=True)

    # Print the entire row(s) if matches are found
    if not matching_rows.empty:
        print(f"Matching row(s) for {', '.join(genes)}:\n{matching_rows}")
    else:
        print(f"No matching rows found for {', '.join(genes)}.")
    return matching_rows

# Narrow matched rows down to the columns needed for analysis
def consolidate(data):
    """Return ``data`` restricted to the columns listed in ``paraInt``."""
    return data[paraInt]

# MOCK DATA
# Read the mock CSV into a pandas DataFrame, then replace its column labels with
# data_columns (the file must therefore have exactly len(data_columns) columns)
mockDF = pd.read_csv(mock_csv_file_path)
mockDF.columns = data_columns

# Header for mock data
print('MOCK DATA:')

# Keep only the rows that mention one of the selected IDs, then only the analysis columns
mock_complete_data = checkID(mockDF, IDlist, GOIlist)
mock_analysis = consolidate(mock_complete_data)
print(f'\nGenerating consolidated data for mock samples...\n{mock_analysis}\n')

# INFECTED DATA
# Same steps as for the mock data: read the CSV and replace its column labels
infDF = pd.read_csv(inf_csv_file_path)
infDF.columns = data_columns

# Header for infected data
print('INFECTED DATA:')

# Keep only the rows that mention one of the selected IDs, then only the analysis columns
inf_complete_data = checkID(infDF, IDlist, GOIlist)
inf_analysis = consolidate(inf_complete_data)
print(f'\nGenerating consolidated data for infected samples...\n{inf_analysis}\n')

"""Use data from genes of interest to perform analysis and generate necessary figures."""

# Define a function to calculate number of modified transcripts
def calcModTran(data, label="mock"):
    """Compute ``n_reads * mod_ratio`` for every row of ``data``.

    Args:
        data: DataFrame with ``gene``, ``n_reads`` and ``mod_ratio`` columns.
        label: Suffix appended to every key. Defaults to ``"mock"``, which
            reproduces the keys this function has always generated.

    Returns:
        Dict mapping ``"<gene>_<row index>_<label>"`` to the product of that
        row's ``n_reads`` and ``mod_ratio``. The row index keeps keys unique
        when a gene occurs in several rows. An empty ``data`` gives ``{}``.
    """
    if data.empty:
        # Also covers a frame without the expected columns but with no rows
        return {}

    # One column-wise product instead of a Python-level multiply per row
    modified_reads = data['n_reads'] * data['mod_ratio']

    return {
        f"{gene}_{index}_{label}": reads
        for index, gene, reads in zip(data.index, data['gene'], modified_reads)
    }

# Modified-read counts for every selected row of each data set
mock_values = calcModTran(mock_analysis)
inf_values = calcModTran(inf_analysis)

# Extract keys from dictionaries
mock_keys = list(mock_values.keys())
inf_keys = list(inf_values.keys())

# Mock and infected rows are paired by position (first with first, and so on).
# NOTE(review): this presumes both files list the same sites in the same order - confirm.
n_pairs = min(len(mock_keys), len(inf_keys))
if len(mock_keys) != len(inf_keys):
    # Without trimming, the two bar series would have different lengths and the plot would fail
    print(f"WARNING: {len(mock_keys)} mock row(s) but {len(inf_keys)} infected row(s); "
          f"only the first {n_pairs} of each are compared.")

# Initialize a dictionary to store percentage differences
percentage_differences = {}

# Calculate percentage differences relative to the mean of each pair
for mock_key, inf_key in zip(mock_keys, inf_keys):
    mock_value = mock_values[mock_key]
    inf_value = inf_values[inf_key]

    mean_value = (inf_value + mock_value) / 2
    if mean_value == 0:
        # Presumably both counts are zero here (counts should not be negative),
        # so report no difference instead of dividing by zero
        percentage_diff = 0.0
    else:
        percentage_diff = abs((inf_value - mock_value) / mean_value) * 100

    # Modify the key to end with "_difference"
    new_key = mock_key.replace("_mock", "_difference")
    percentage_differences[new_key] = percentage_diff

# Extract data for plotting (paired rows only)
keys = mock_keys[:n_pairs]
index = np.arange(n_pairs)
mock_heights = [mock_values[key] for key in keys]
inf_heights = [inf_values[key] for key in inf_keys[:n_pairs]]

# Ensure that mock and infected pairs are next to each other
bar_width = 0.35
fig, ax = plt.subplots(figsize=(10, 6))

# Bar graph for mock and infected values (plain lists, not dict views)
bar1 = ax.bar(index, mock_heights, bar_width, label='Mock')
bar2 = ax.bar(index + bar_width, inf_heights, bar_width, label='Infected')

# Annotate each pair with its percentage difference, just above the taller bar
for i, (mock_val, inf_val, diff) in enumerate(zip(mock_heights, inf_heights, percentage_differences.values())):
    max_val = max(mock_val, inf_val)
    ax.annotate(f'{diff:.2f}%', (index[i] + bar_width / 2, max_val + 6), ha='center', va='center', color='red')

# Set labels and title
ax.set_xlabel('Transcripts')
ax.set_ylabel('Number of Modified Transcripts')
ax.set_title('Mock and Infected Transcript Comparison')
ax.legend()

# Replace "_mock" with "_transcripts" in the keys for xticklabels
xticklabels = [key.replace("_mock", "_transcripts") for key in keys]

# Centre one tick under each mock/infected pair and label it
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(xticklabels)

# Set xticklabels font size
ax.tick_params(axis='x', labelsize=8)

# Display the graph
plt.show()