In [18]:
import pandas as pd
import matplotlib.pyplot as plt

# Data

In [19]:
def load_varscan_results(path):
    """Read a VarScan results CSV and return it with a numeric `frequency` column.

    The `frequency` column is expected to hold strings that may carry '%'
    characters at either end; those characters are stripped and the
    remainder is converted to float. Missing values are kept as NaN rather
    than raising.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The table as read by `pd.read_csv`, with `frequency` as float.
    """
    table = pd.read_csv(path)
    # Vectorised string accessor instead of a per-element Python lambda.
    table['frequency'] = table['frequency'].str.strip('%').astype(float)
    return table


# Three control alignments plus the roommate sample, all parsed the same way.
df1 = load_varscan_results('raw_data/VarScan_results_alignment_1.csv')
df2 = load_varscan_results('raw_data/VarScan_results_alignment_2.csv')
df3 = load_varscan_results('raw_data/VarScan_results_alignment_3.csv')
df_roommate = load_varscan_results('raw_data/VarScan_results_roommate.csv')

# Calculate the average and standard deviation

In [23]:
# Mean and standard deviation of the variant frequencies in each control.
# The individual names are kept so that other cells can still refer to them.
av1, std1 = df1.frequency.mean(), df1.frequency.std()
av2, std2 = df2.frequency.mean(), df2.frequency.std()
av3, std3 = df3.frequency.mean(), df3.frequency.std()

# One loop replaces three copy-pasted print groups; the labels now use the
# correct English ordinals ("2nd", "3rd" instead of "2st", "3st").
for label, average, deviation in (
    ('1st control', av1, std1),
    ('2nd control', av2, std2),
    ('3rd control', av3, std3),
):
    print(label)
    print('Average:', average)
    print('Standard deviation:', deviation)

1st control
Average: 0.25649122807017544
Standard deviation: 0.07172594738880801
2st control
Average: 0.2369230769230769
Standard deviation: 0.05237640770866741
3st control
Average: 0.250327868852459
Standard deviation: 0.07803775182808968


In [24]:
# Stack the three control tables and summarise the pooled frequencies.
df = pd.concat([df1, df2, df3])
pooled_frequency = df['frequency']
av, std = pooled_frequency.mean(), pooled_frequency.std()

for caption, value in (('Average:', av), ('Standard deviation:', std)):
    print(caption, value)

Average: 0.24829411764705883
Standard deviation: 0.06898268773972734


In [28]:
# Flag roommate variants whose frequency lies more than N_SIGMA pooled-control
# standard deviations (`std`) away from the pooled-control mean (`av`).
N_SIGMA = 3
lower_bound = av - N_SIGMA * std
upper_bound = av + N_SIGMA * std

# Vectorised comparison; a NaN frequency compares False on both sides and is
# therefore not flagged, exactly as with the element-wise version.
roommate_frequency = df_roommate['frequency']
df_roommate['significant'] = (roommate_frequency < lower_bound) | (roommate_frequency > upper_bound)

# The column is already boolean, so it can be used directly as a row mask.
df_roommate[df_roommate['significant']]

Unnamed: 0,position,reference_base,alternative_base,frequency,significant
0,72,A,G,99.96,True
1,117,C,T,99.82,True
4,307,C,T,0.94,True
10,774,T,C,99.96,True
14,999,C,T,99.86,True
18,1260,A,C,99.94,True
20,1458,T,C,0.84,True


# Epitopes

In [38]:
# Protein positions assigned to each epitope, keyed by epitope label.
epitopes = {
    'A': [
        122, 124, 126, 130, 131, 132, 133, 135, 137, 138, 140, 142, 143, 144,
        145, 146, 150, 152, 168,
    ],
    'B': [
        128, 129, 155, 156, 157, 158, 159, 160, 163, 165, 186, 187, 188, 189,
        190, 192, 193, 194, 196, 197, 198,
    ],
    'C': [
        44, 45, 46, 47, 48, 50, 51, 53, 54, 273, 275, 276, 278, 279, 280, 294,
        297, 299, 300, 304, 305, 307, 308, 309, 310, 311, 312,
    ],
    'D': [
        96, 102, 103, 117, 121, 167, 170, 171, 172, 173, 174, 175, 176, 177,
        179, 182, 201, 203, 207, 208, 209, 212, 213, 214, 215, 216, 217, 218,
        219, 226, 227, 228, 229, 230, 238, 240, 242, 244, 246, 247, 248,
    ],
    'E': [
        57, 59, 62, 63, 67, 75, 78, 80, 81, 82, 83, 86, 87, 88, 91, 92, 94,
        109, 260, 261, 262, 265,
    ],
}


def find_epitope(x):
    """Return the label of the epitope that contains position `x`, or None.

    `x` is taken as a 1-based nucleotide position and mapped to the 1-based
    index of the triplet it falls in. The epitopes are scanned in dictionary
    order and the first one listing that index is returned; None means no
    epitope lists it.
    """
    codon_index = (x - 1) // 3 + 1
    return next(
        (label for label, residues in epitopes.items() if codon_index in residues),
        None,
    )

# Label every roommate variant with the epitope its position falls in
# (None where find_epitope reports no match), then display the table.
epitope_labels = df_roommate['position'].apply(find_epitope)
df_roommate['epitope'] = epitope_labels
df_roommate

Unnamed: 0,position,reference_base,alternative_base,frequency,significant,epitope
0,72,A,G,99.96,True,
1,117,C,T,99.82,True,
2,254,A,G,0.17,False,
3,276,A,G,0.17,False,E
4,307,C,T,0.94,True,D
5,340,T,C,0.17,False,
6,389,T,C,0.22,False,A
7,691,A,G,0.17,False,
8,722,A,G,0.2,False,
9,744,A,G,0.17,False,D


# Save result

In [40]:
# Write the annotated roommate table (significance flag and epitope label) to disk.
annotated_csv_path = 'raw_data/VarScan_results_roommate_sign_epits.csv'
df_roommate.to_csv(annotated_csv_path)