### Author: Aman Virmani

This notebook implements the final algorithm we used to detect step-like changes in coverage within our exons (regions of interest). The detected steps are then sorted in descending order of magnitude and exported to a csv file for analysis of our results.

In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the inputs: the exonic regions of interest (only the columns used downstream),
# the log2 change in coverage table, and the individual hour 6 / hour 2 coverage files
df_roi = pd.read_csv('ChrROIs.csv')[['Chr','Start','End','AvgCoverage']]
df_full = pd.read_csv('Log2_change_in_coverage.csv')
df_h6 = pd.read_csv('hour_6.csv')
df_h2 = pd.read_csv('hour_2.csv')

In [3]:
# Algorithm to detect a "step"-like change in coverage, and record a statistic that
# reflects the severity of the "step" for each exonic region.
#
# For every region, each candidate position in the middle third is scored by
#     | mean(log2 change right of the position) - mean(log2 change left of the position) |
# and the position with the largest score is kept for that region.

# Column layout of the statistics table
step_columns = ['Start','End','Difference','Pos']

# Look the columns up once instead of on every loop iteration
log2_diff = df_full['log2Diff_6_minus_2']
position_ids = df_full['ID']
h6_values = df_h6['value']
h2_values = df_h2['value']

# Rows are gathered in a plain list and converted to a dataframe once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies it on every call. Building from lists also keeps Start/End/Pos as
# integers instead of coercing each row through a float Series.
step_records = []

for start,end in zip(df_roi['Start'],df_roi['End']):

    # If either the hour2 or hour6 data is 0 (0.1 since we added a constant), skip the region.
    # This is tested before the scan so skipped regions cost nothing.
    # NOTE(review): only the two endpoints of the region are inspected, not every
    # position in the range -- confirm this is the intended filter.
    if (h6_values.iloc[start] == 0.1 and h6_values.iloc[end] == 0.1) or (h2_values.iloc[start] == 0.1 and h2_values.iloc[end] == 0.1):
        continue

    # Defaults reported when no candidate position yields a valid difference
    max_info = [start,end,0,0]
    max_diff = float('-inf')

    # Bounds of the middle third of the region
    first_pos = int(((end - start)/3) + start)
    last_pos = int(end - ((end - start)/3))

    # For each position in the middle third, find the largest difference in average
    # log2 change in coverage between the right side and the left side of the position.
    # NOTE(review): the values of the 'ID' column are used as positional indices
    # below, which assumes ID equals the row position in df_full -- confirm.
    for pos in position_ids.iloc[first_pos:last_pos+1]:

        left_win_len = pos - start
        right_win_len = end - pos

        # A zero-length window (possible for very short regions) has no mean;
        # previously this produced a 0/0 result that could never be selected.
        if left_win_len == 0 or right_win_len == 0:
            continue

        left_avg = log2_diff.iloc[start:pos].sum()/left_win_len
        right_avg = log2_diff.iloc[pos+1:end+1].sum()/right_win_len

        diff = abs(right_avg - left_avg)

        if max_diff < diff:
            max_diff = diff
            max_info = [start,end,max_diff,pos]

    # Keep the start position, end position, maximum difference, and its position
    step_records.append(max_info)

# Dataframe holding one row of statistics per retained region
df_mag_step = pd.DataFrame(step_records, columns = step_columns)
        
    
    
        

In [4]:
# Rank the regions from the largest step difference to the smallest
df_mag_step_sorted = df_mag_step.sort_values(by='Difference', ascending=False)

In [5]:
# Write the ranked detections to a csv file (without the index column) for further analysis
df_mag_step_sorted.to_csv(path_or_buf='Sorted_isoform_detections.csv', index=False)