In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the 'Combined data' sheet of the SGAtools export, using its first
# column as the row index, and order the rows by ascending Score
# (sort_values puts NaN scores at the bottom by default).
# NOTE(review): the path below is only an extension with no file name -
# presumably a placeholder; point it at the real workbook before running.

data = (
    pd.read_excel(".xlsx", sheet_name='Combined data', index_col=0)
    .sort_values('Score')
)

# Preview the 20 lowest-scoring rows.
data.head(20)


In [None]:
# Show the last 20 rows of the Score-sorted table (nothing is re-read here).
# Note that some Score values in this view are 'NaN' - why would this be?
# (The table was sorted by Score, and sort_values places NaN last by default,
# which is why any missing Scores gather at the tail.)

data.tail(20)


In [None]:
# Blank out (set to NaN) the Score of every row whose 'Additional information'
# text contains 'JK', i.e. rows carrying a 'jackknife' assignment.
# Rows with no additional information (NaN) are treated as not flagged.
# The rows themselves are kept; only their Score is cleared, in place.

jackknife_flags = data['Additional information'].str.contains('JK', na=False)
idx = jackknife_flags.values
data.loc[idx, 'Score'] = np.nan

data.head(30)

In [None]:
# Blank out (set to NaN) the Score of every row that fails the p-Value filter:
#   1. rows whose p-Value is greater than 0.05
#   2. rows whose p-Value is missing (NaN)
# Only the Score column is modified; the p-Value column is left untouched,
# so both masks can be built up front.

p_values = data['p-Value']

for idx in (p_values > 0.05, p_values.isna()):
    data.loc[idx, 'Score'] = np.nan

data.head(30)


In [None]:
# Histogram (100 bins) of the Score values that survived filtering.
# Series.hist returns the Axes it drew on, so the labels are set on that
# object directly.

ax = data['Score'].hist(bins=100, alpha=0.5, color='firebrick')
ax.set_title("Distribution of S-scores")
ax.set_xlabel('S-score')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Standardise Score into a Z-score: subtract the mean and divide by the
# population standard deviation (ddof=0). pandas skips NaN when computing
# both statistics, and rows with a NaN Score get a NaN Z-score.

score = data['Score']
score_mean = score.mean()
score_std = score.std(ddof=0)

data['Z-score'] = (score - score_mean) / score_std

data.head(30)


In [None]:
# Histogram (100 bins) of the Z-scores, with dashed vertical guides at
# z = -2 and z = +2.
# NOTE: the guides are hard-coded; keep them in sync with the `cutoff`
# value chosen in the hit-calling cell if that value is changed.

data['Z-score'].hist(bins=100, alpha=0.5, color='firebrick')
plt.title("Distribution of Z-scores")
plt.ylabel('Frequency')
# The x-axis shows Z-scores, not raw S-scores (the label was previously
# copied unchanged from the S-score histogram).
plt.xlabel('Z-score')

plt.axvline(-2, color='black', linestyle='dashed', linewidth=1)
plt.axvline(2, color='black', linestyle='dashed', linewidth=1)

plt.show()

In [None]:
# Call genetic interactions from the Z-scores:
#   positive interactors: Z-score at least `cutoff` above the Z-score mean
#   negative interactors: Z-score at least `cutoff` below the Z-score mean
# Rows with a NaN Z-score fail both comparisons and land in neither set.
# The printed proportions are relative to ALL rows of `data`, including
# rows whose Score was blanked out by the earlier filters.

cutoff = 2  # specify desired z-score cutoff

z_scores = data['Z-score']
z_mean = z_scores.mean()

top_hits = data[z_scores >= z_mean + cutoff]
bottom_hits = data[z_scores <= z_mean - cutoff]

n_rows = len(data)
print(f"proportion of screen with positive interactions: {len(top_hits) / n_rows}")
print(f"proportion of screen with negative interactions: {len(bottom_hits) / n_rows}")


In [None]:
# Show up to 30 positive genetic interactors, strongest (highest Z-score) first.

(
    top_hits
    .sort_values(by='Z-score', ascending=False)
    .head(30)
)

In [None]:
# Show up to 30 negative genetic interactors, strongest (lowest Z-score) first.

(
    bottom_hits
    .sort_values(by='Z-score', ascending=True)
    .head(30)
)

In [None]:
# Export the array name, final SGA Score and Z-score to CSV (row index omitted).
# NOTE(review): this writes every row of `data`, not only the rows in
# top_hits / bottom_hits - confirm that exporting the full table is intended.
# NOTE(review): the output path is an empty string - presumably a placeholder;
# set it to the desired .csv file name before running.

export_columns = ['Array Name', 'Score', 'Z-score']
data[export_columns].to_csv("", index=False)
