In [9]:
import sys
import os
import random
import numpy as np
import pandas as pd
from datetime import datetime
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

# ---------------------------
# 1) Parse Command-Line Args
# ---------------------------
# Example call: python script.py 100 my_data.csv my_saved_model.h5 20 8
# 1) samplesize = 100
# 2) infilename = 'my_data.csv'
# 3) targetmodel = 'my_saved_model.h5'
# 4) targetweight = 20  (Not used here, but remains as a parameter)
# 5) middle = 8
samplesize = 10000
infilename = "output/NV_nomask_covid300_10000.csv"
targetweight = 98
modelpath = "saved_model/lstm_cl7_98_covid19_position_20250308235700"     # Instead of building, we'll load this saved model

# ---------------------------
# 2) Constants and Setup
# ---------------------------
nucleotide_to_int = {'a': 0, 'c': 1, 'g': 2, 't': 3}
int_to_nucleotide = {0: 'a', 1: 'c', 2: 'g', 3: 't'}

num_classes = 4
seq_length = 16
NV_Len = 32  # Number of numeric features

# ---------------------------
# 3) Read CSV Data
# ---------------------------
dfm = pd.read_csv(infilename)

# Filter or sample data as needed
df_build = dfm.sample(samplesize, random_state=2008).reset_index(drop=True).copy()

#mask_train = dfm['mask1']
mask_train = 8
# Extract numeric features (32 columns)
featurelist = [
    'n_a', 'n_c', 'n_t', 'n_g',
    'mu_a', 'mu_c', 'mu_t', 'mu_g',
    'moment_a_2', 'moment_c_2', 'moment_t_2', 'moment_g_2',
    'moment_a_3', 'moment_c_3', 'moment_t_3', 'moment_g_3',
    'moment_a_4', 'moment_c_4', 'moment_t_4', 'moment_g_4',
    'moment_a_5', 'moment_c_5', 'moment_t_5', 'moment_g_5',
    'moment_a_6', 'moment_c_6', 'moment_t_6', 'moment_g_6',
    'moment_a_7', 'moment_c_7', 'moment_t_7', 'moment_g_7'
]
features = df_build[featurelist].copy()

# Reshape for LSTM: (samples, timesteps=32, features_per_timestep=1)
X_input = features.values.reshape((features.shape[0], features.shape[1], 1))

# ---------------------------
# 4) Load Existing Model
# ---------------------------
# If your model has a custom loss or custom layers, you need custom_objects dict
# For example, if you need `mask_loss`, define it and pass: custom_objects={'custom_loss': mask_loss(...)}
def mask_loss(mask_train, weight):
    def custom_loss(y_true, y_pred):
        # same as original definition if needed
        cross_entropy_loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
        return cross_entropy_loss  # For simplicity, or the full code from before
    return custom_loss

# Custom objects dictionary, only if your model needs it
custom_objects = {
    'custom_loss': mask_loss(mask_train, targetweight)
}

model = load_model(modelpath, custom_objects=custom_objects)

# ---------------------------
# 5) Predict
# ---------------------------
predictions = model.predict(X_input)  
# 'predictions' is shape (samplesize, seq_length, num_classes)

# Convert to integer-coded nucleotides
predicted_nucleotides = np.argmax(predictions, axis=-1)  
# Now shape: (samplesize, seq_length)

# ---------------------------
# 6) Convert Predictions Back to Nucleotides
# ---------------------------
predicted_sequences = [
    ''.join(int_to_nucleotide[nuc] for nuc in seq)
    for seq in predicted_nucleotides
]

# ---------------------------
# 7) Print or Save Results
# ---------------------------
for i, seq in enumerate(predicted_sequences):
    print(f"Sample {i} -> Predicted subsequence: {seq}")



df_results = pd.DataFrame({
    "id": df_build["id"],
    "Position": df_build["Position"],
    #"Maskpos_1": df_build["mask1"],
    #"Maskpos_2": df_build["mask2"],
    "Original_Sequence": df_build["Sequence"],
    "Predicted_Sequence": predicted_sequences
})
df_results["Overall_Accuracy"] = [
    sum(ori == pred for ori, pred in zip(original, predicted)) / len(original)
    for original, predicted in zip(df_results["Original_Sequence"], df_results["Predicted_Sequence"])
]
print(df_results.head(10))

output_file = "NV_nomask_2_COVID_prediction.csv"
df_results.to_csv("NV_nomask_2_COVID_prediction.csv")


Sample 0 -> Predicted subsequence: aaattaaattggcaaa
Sample 1 -> Predicted subsequence: gacattctctctctca
Sample 2 -> Predicted subsequence: ttaactttaatctcac
Sample 3 -> Predicted subsequence: aaattgtgggctcaat
Sample 4 -> Predicted subsequence: atctgtgtggctgtca
Sample 5 -> Predicted subsequence: tgcttgcactgatgac
Sample 6 -> Predicted subsequence: cattgttaacactcca
Sample 7 -> Predicted subsequence: actcaatttgcttttg
Sample 8 -> Predicted subsequence: gttgttgaatgtctta
Sample 9 -> Predicted subsequence: cggtagcacaccttgt
Sample 10 -> Predicted subsequence: ctatccccatgtgatt
Sample 11 -> Predicted subsequence: ttcttcacgacattgg
Sample 12 -> Predicted subsequence: ctgggtaggtacatgt
Sample 13 -> Predicted subsequence: attacagatgctgtag
Sample 14 -> Predicted subsequence: ggctgcgttatagctt
Sample 15 -> Predicted subsequence: attttggaggaataca
Sample 16 -> Predicted subsequence: aatgatatcctttcac
Sample 17 -> Predicted subsequence: cacttgcagatgctgg
Sample 18 -> Predicted subsequence: cttagtagaagttgaa
Sam

In [11]:
df_results

Unnamed: 0,id,Position,Original_Sequence,Predicted_Sequence,Overall_Accuracy
0,Eta|hCoV-19/Canada/ABPHL-07105/2021,977,aaattaaattggcaaa,aaattaaattggcaaa,1.0
1,Zeta|hCoV-19/Brazil/PB-FIOCRUZ-29646/2020,7804,gacattctctctctca,gacattctctctctca,1.0
2,Kappa|hCoV-19/Germany/MV-RKI-I-138459/2021,29623,ttaactttaatctcac,ttaactttaatctcac,1.0
3,Omicron|hCoV-19/France/HDF-IPP05865/2022,11854,aaattgtgggctcaat,aaattgtgggctcaat,1.0
4,Kappa|hCoV-19/India/KA-RFNB-10512/2021,37,atctgtgtggctgtca,atctgtgtggctgtca,1.0
...,...,...,...,...,...
9995,Delta|hCoV-19/USA/NY-MSHSPSP-PV33493/2021,1142,aatgtgcctttcaact,aatgtgcctttcaact,1.0
9996,Delta|hCoV-19/Wales/MILK-2D2FACE/2021,24299,ggcaaaattcaagact,ggcaaaattcaagact,1.0
9997,Gamma|hCoV-19/Spain/CT-HUVH-E22723/2021,14184,acaaagccttacatta,acaaagccttacatta,1.0
9998,Alpha|hCoV-19/Germany/NW-RKI-I-146006/2021,26625,tcctctggctgttatg,tcctctggctgttatg,1.0


In [48]:
for i, seq in enumerate(predicted_sequences):
    print(f"Sample {i} -> Predicted subsequence: {seq}")



df_results = pd.DataFrame({
    "Position": df_build["targetpos"],
    "Record_Num": df_build["id"],
    "Original_Sequence": df_build["subseq"],
    "masked_og_seq" : df_build["maskseq"],
    "Predicted_Sequence": predicted_sequences
})

print(df_results.head(10))

output_file = "HIV_predictions.csv"
df_results.to_csv("HIV_predictions.csv")


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [19]:
df_results = pd.DataFrame({
    "Position": df_build["targetpos"],
    "Record_Num": df_build["id"],
    "Original_Sequence": df_build["subseq"],
    "Predicted_Sequence": predicted_sequences
})


df_results.to_csv("HIVpredictions.csv")


In [6]:
df_build

Unnamed: 0.1,Unnamed: 0,id,startpos,maskpos,targetpos,subseq,maskseq,n_a,n_c,n_t,...,moment_t_6,moment_g_6,moment_a_7,moment_c_7,moment_t_7,moment_g_7,pos_a,pos_c,pos_g,pos_t
0,148553,MK115290,9059,16,13,cagttggacccccaaa,cagttggaccccNaaa,5.25,5.25,2.25,...,11.880903,2.753992,-50.471244,-16.487984,20.882172,2.671894,"[2, 8, 14, 15, 16]","[1, 9, 10, 11, 12]","[3, 6, 7]","[4, 5, 13]"
1,57436,AF286238,4,16,12,tcgaagcgaagtacca,tcgaagcgaagNacca,6.25,4.25,1.25,...,216.255005,1.828590,0.832662,-58.867382,0.000000,-0.190982,"[4, 5, 9, 10, 13, 16]","[2, 7, 14, 15]","[3, 6, 8, 11]","[1, 12]"
2,90505,JQ316127,1579,16,5,gtattgccctatcagc,gtatNgccctatcagc,3.25,5.25,4.25,...,6.567249,296.768399,-122.169716,1.531735,0.303146,430.250722,"[3, 11, 14]","[7, 8, 9, 13, 16]","[1, 6, 15]","[2, 4, 5, 10, 12]"
3,65568,MK303350,5940,16,12,caagcacaaatgatag,caagcacaaatNatag,8.25,3.25,2.25,...,0.276245,1131.501073,0.057865,0.254329,0.003227,-52.872398,"[2, 3, 6, 8, 9, 10, 13, 15]","[1, 5, 7]","[4, 16]","[11, 12, 14]"
4,145156,MH897917,6741,16,11,cgatttccggataaaa,cgatttccggNtaaaa,5.25,3.25,4.25,...,4.099389,19.787065,-57.684677,-12.287030,5.066673,-30.234087,"[3, 13, 14, 15, 16]","[1, 7, 8]","[2, 9, 10]","[4, 5, 6, 11, 12]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155533,101001,MF373131,6872,16,8,gacaataggagaaata,gacaataNgagaaata,9.25,1.25,2.25,...,203.352556,56.089312,-0.072027,0.000000,13.008142,-95.861896,"[2, 4, 5, 7, 10, 12, 13, 14, 16]",[3],"[1, 9, 11]","[6, 8, 15]"
155534,144749,MH897912,6014,16,2,caattcgaccgagtcg,cNattcgaccgagtcg,3.25,5.25,3.25,...,82.856590,7.719815,-3.958981,-1.950136,162.016656,-5.838140,"[3, 8, 12]","[1, 6, 9, 10, 15]","[7, 11, 13, 16]","[2, 4, 5, 14]"
155535,110822,MF373175,5357,16,3,aaaataggaaactgtt,aaNataggaaactgtt,7.25,1.25,4.25,...,30.853112,11.005265,-0.026205,0.000000,-51.285933,12.577646,"[1, 2, 4, 6, 9, 10, 11]",[12],"[7, 8, 14]","[3, 5, 13, 15, 16]"
155536,45060,FJ388910,1135,16,1,aggggggcaataagaa,Nggggggcaataagaa,6.25,1.25,1.25,...,122.070312,2.318020,-2.709518,0.000000,0.000000,2.613394,"[9, 10, 12, 13, 15, 16]",[8],"[2, 3, 4, 5, 6, 7, 14]","[1, 11]"


In [14]:
df_results = pd.DataFrame({
    "id": df_build["id"],
    "Position": df_build["maskpos"],
    "Maskpos": df_build["targetpos"],
    "Original_Sequence": df_build["subseq"].str.upper(),
    "Predicted_Sequence": predicted_sequences
})

In [34]:
df_results['Position'].value_counts(dropna=False)

Position
16    155538
Name: count, dtype: int64

In [36]:
df_results['Target_nu'] = df_results.apply(lambda row: row['Original_Sequence'][row['Maskpos'] - 1], axis=1)
df_results['Predicted_nu'] = df_results.apply(lambda row: row['Predicted_Sequence'][row['Maskpos'] - 1], axis=1)
df_results['correct'] = np.where(df_results['Target_nu'] == df_results['Predicted_nu'],1,0)
df_results

Unnamed: 0,id,Position,Maskpos,Original_Sequence,Predicted_Sequence,Target_nu,Predicted_nu,correct
0,MK115290,16,13,CAGTTGGACCCCCAAA,CAGTTGGACCCCAAAA,C,A,0
1,AF286238,16,12,TCGAAGCGAAGTACCA,TCGAAGCGAAGCACCA,T,C,0
2,JQ316127,16,5,GTATTGCCCTATCAGC,GTATTGCCCTATCAGC,T,T,1
3,MK303350,16,12,CAAGCACAAATGATAG,CAAGCACAAATTATAG,G,T,0
4,MH897917,16,11,CGATTTCCGGATAAAA,CGATTTCCGGATAAAA,A,A,1
...,...,...,...,...,...,...,...,...
155533,MF373131,16,8,GACAATAGGAGAAATA,GACAATAGGAGAAATA,G,G,1
155534,MH897912,16,2,CAATTCGACCGAGTCG,CTATTCGACCGAGTCG,A,T,0
155535,MF373175,16,3,AAAATAGGAAACTGTT,AAAATAGGAAACTGTT,A,A,1
155536,FJ388910,16,1,AGGGGGGCAATAAGAA,TGGGGGGCAATAAGAA,A,T,0


In [38]:
df_results[['Maskpos', 'correct']].value_counts(dropna=False)

Maskpos  correct
1        0          6500
2        0          6453
3        0          6418
16       0          6315
11       0          6297
5        0          6290
7        0          6281
8        0          6268
4        0          6266
10       0          6264
6        0          6259
12       0          6249
14       0          6247
15       0          6218
9        0          6211
13       0          6193
15       1          3559
13       1          3555
14       1          3515
9        1          3506
12       1          3486
16       1          3478
10       1          3459
6        1          3446
8        1          3443
11       1          3433
7        1          3427
4        1          3425
5        1          3405
3        1          3269
2        1          3227
1        1          3176
Name: count, dtype: int64

In [5]:
df_results[(df_results['id'] == 'MK115290') & (df_results['Position'] == 9072)]

Unnamed: 0,id,Position,Maskpos,Original_Sequence,Predicted_Sequence,Target_nu,Predicted_nu,correct
0,MK115290,9072,13,CAGTTGGACCCCCAAA,CAGTTGGACCCCAAAA,C,A,0


In [46]:
df_results[(df_results['id'] == 'MK115290') & (df_results['Position'] == 16)][['Predicted_nu']].value_counts(dropna=False)

Predicted_nu
A               27
C                8
G                7
T                4
Name: count, dtype: int64

In [7]:
predicted_sequences

['TCAACAAAGCCTTATA',
 'TGTGTAATGCTGAAAA',
 'TAAGAGGCAAGAAATT',
 'CTGAAAGGGGGGTGGG',
 'ATAATACTCATACGGA',
 'ATGAAACTAAATGGGT',
 'TATATAACAGGGCCAT',
 'TGTAAATCTCAGCCAC',
 'ACACTGCCACTGATAT',
 'AAATGGATAACAACTT',
 'ATTGCCAGTATCATCA',
 'AATAATACTAGAGGGA',
 'AAACGGTTCCACCCTT',
 'ATCTCCCGTTATATTA',
 'ATAATAACATTTGGTA',
 'ATTGTTACTACAGCTT',
 'TACAAATAGCACCAAT',
 'AAAACAGGCAACTGTA',
 'AGAAAAAAATTTTAAT',
 'CCTGGATGCGGGGAAA',
 'TTAGAAACTATGGAAA',
 'TAGATGAGAGGCAGCA',
 'CCATGCCACTTACCCA',
 'AGACGCCTGACATTTA',
 'GCATTTGATCCGGAAG',
 'ACGACTCTGAGACTGG',
 'AGTACCACTAGTAGAA',
 'TGATGAAATGAATGAA',
 'TCCTACTAGTAGCGAG',
 'AGACAGTAGAAGGAGA',
 'TTGGACTCTCTTTGTG',
 'GAACACAACCCTAGAA',
 'ACCGGCGGCGCCCAGC',
 'CTCAAGTAAAGAAATA',
 'TCTATTTGTATAAGAC',
 'TCCCATCCTTTGTATA',
 'CCAGCATGTTTGAAGA',
 'GGATGGGGAGAAGAGG',
 'CTATTACTAATTTCAC',
 'TTAGCTCTCCCAGTAG',
 'TAAGTACATCCTGACC',
 'TAATGGTGGGAACTAC',
 'GCATTGGACAAGCACT',
 'AGCTGACGCCCGGCAA',
 'TATTACACTAGCTGAA',
 'CAGGGTAGATATACAA',
 'TACTAGAGAGACAAAT',
 'TAGCTACAATA

In [23]:
infilename

'out.csv'

AttributeError: module 'pandas' has no attribute 'read'

## check performance

In [5]:
covid_check = pd.read_csv("COVID_predictions.csv")
hiv_check = pd.read_csv("HIV_predictions.csv")

In [7]:
covid_check

Unnamed: 0.1,Unnamed: 0,id,Position,Maskpos,Original_Sequence,Predicted_Sequence,Target_nu,Predicted_nu,correct
0,0,Delta|hCoV-19/England/QEUH-20A8462/2021,10960,11,ACCACTGGTTATTACT,CCTTCCCTTACGGGGA,A,C,0
1,1,Delta|hCoV-19/England/MILK-2CE551F/2021,1696,10,TGAAAGGTTCGGATTA,CCTTCCCTTCAGTAGG,C,C,1
2,2,Iota|hCoV-19/USA/RI-CDC-QDX24520530/2021,6032,15,AATTTGCTGATGATCT,TCTCCGTCCTAAAAAA,C,A,0
3,3,Beta|hCoV-19/Turkey/HSGM-7845/2021,29696,8,TGTGGAAAGATCGACC,CCTTCCCTTATGGTGG,A,T,0
4,4,Alpha|hCoV-19/England/ALDP-EE9E8D/2021,25152,6,TTTTACAGCTGGCTTG,CCTTCCCTAATTTTTT,C,C,1
...,...,...,...,...,...,...,...,...,...
6604,6604,Iota|hCoV-19/USA/NY-CDC-LC0023793/2021,28336,8,AATTAACGCCAATAGC,CCTTCCCTTATGGTGG,G,T,0
6605,6605,Epsilon|hCoV-19/USA/CA-CDPH-UC578/2021,23856,6,GTGCTCTAACTGGAAT,CCTTCCCTAATTTTTT,C,C,1
6606,6606,Eta|hCoV-19/Canada/ABPHL-12092/2021,5744,11,TAAAGAAACTCTGTAT,CCTTCCCTTACGGGGA,C,C,1
6607,6607,Eta|hCoV-19/England/HSLL-155A782/2021,27360,6,GCTACGTGTGAGCTTT,CCTTCCCTAATTTTTT,G,C,0


In [9]:
hiv_check

Unnamed: 0.1,Unnamed: 0,id,Position,Maskpos,Original_Sequence,Predicted_Sequence,Target_nu,Predicted_nu,correct
0,0,MK115290,9072,13,CAGTTGGACCCCCAAA,CAGTTGGACCCCAAAA,C,A,0
1,1,AF286238,16,12,TCGAAGCGAAGTACCA,TCGAAGCGAAGCACCA,T,C,0
2,2,JQ316127,1584,5,GTATTGCCCTATCAGC,GTATTGCCCTATCAGC,T,T,1
3,3,MK303350,5952,12,CAAGCACAAATGATAG,CAAGCACAAATTATAG,G,T,0
4,4,MH897917,6752,11,CGATTTCCGGATAAAA,CGATTTCCGGATAAAA,A,A,1
...,...,...,...,...,...,...,...,...,...
155533,155533,MF373131,6880,8,GACAATAGGAGAAATA,GACAATAGGAGAAATA,G,G,1
155534,155534,MH897912,6016,2,CAATTCGACCGAGTCG,CTATTCGACCGAGTCG,A,T,0
155535,155535,MF373175,5360,3,AAAATAGGAAACTGTT,AAAATAGGAAACTGTT,A,A,1
155536,155536,FJ388910,1136,1,AGGGGGGCAATAAGAA,TGGGGGGCAATAAGAA,A,T,0


In [115]:
covid_check[['Maskpos','correct']].value_counts().sort_index()

Maskpos  correct
1        0          314
         1           97
2        0          283
         1          128
3        0          283
         1          128
4        0          283
         1          128
5        0          314
         1           97
6        0          314
         1           97
7        0          314
         1           97
8        0          284
         1          129
9        0          284
         1          129
10       0          316
         1           97
11       0          316
         1           97
12       0          332
         1           82
13       0          332
         1           82
14       0          332
         1           83
15       0          334
         1           84
16       0          314
         1          105
Name: count, dtype: int64

In [None]:
# Reshape with pivot_table() for a pivot table structure
pivot_table = pd.pivot_table(df, values='Value', index='Category', columns='Subcategory', aggfunc='sum')
print("\nPivot Table:\n", pivot_table)

# Handle missing values (optional)
pivot_table_filled = pd.pivot_table(df, values='Value', index='Category', columns='Subcategory', aggfunc='sum', fill_value=0)
print("\nPivot Table with Filled Values:\n", pivot_table_filled)

# Multiple aggregation functions
pivot_table_multi = pd.pivot_table(df, values='Value', index='Category', columns='Subcategory', aggfunc=[np.sum, np.mean])
print("\nPivot Table with Multiple Aggregation:\n", pivot_table_multi)


In [117]:
grouped_COVID = covid_check.groupby(['id', 'Position', 'Target_nu', 'Predicted_nu'])['correct'].count().reset_index()
grouped_COVID

Unnamed: 0,id,Position,Target_nu,Predicted_nu,correct
0,Alpha|hCoV-19/Belgium/Jessa_11-2119-002221/2021,21536,A,A,4
1,Alpha|hCoV-19/Belgium/Jessa_11-2119-002221/2021,21536,A,C,6
2,Alpha|hCoV-19/Belgium/Jessa_11-2119-002221/2021,21536,A,G,1
3,Alpha|hCoV-19/Belgium/Jessa_11-2119-002221/2021,21536,A,T,5
4,Alpha|hCoV-19/Bulgaria/21BG-EU_002140_Pl20/2021,16,C,A,4
...,...,...,...,...,...
1658,Zeta|hCoV-19/USA/CO-CDC-ASC210029043/2021,6064,G,T,5
1659,Zeta|hCoV-19/USA/WA-UW-21041469819/2021,2160,G,A,4
1660,Zeta|hCoV-19/USA/WA-UW-21041469819/2021,2160,G,C,6
1661,Zeta|hCoV-19/USA/WA-UW-21041469819/2021,2160,G,G,1


In [119]:
# Reshape with pivot_table() for a pivot table structure
pivot_table = pd.pivot_table(grouped_COVID, values='correct', index=['id','Position','Target_nu'], columns='Predicted_nu', aggfunc='sum').reset_index()
print("\nPivot Table:\n", pivot_table)


Pivot Table:
 Predicted_nu                                               id  Position  \
0             Alpha|hCoV-19/Belgium/Jessa_11-2119-002221/2021     21536   
1             Alpha|hCoV-19/Bulgaria/21BG-EU_002140_Pl20/2021        16   
2             Alpha|hCoV-19/Canada/QC-1nDJXQ-2120103813A/2021     21280   
3               Alpha|hCoV-19/Canada/QC-1nIRE-U41918299Q/2021     13312   
4                       Alpha|hCoV-19/Denmark/DCGC-93213/2021     23136   
..                                                        ...       ...   
414                          Zeta|hCoV-19/Brazil/SP-1007/2021     29872   
415                  Zeta|hCoV-19/Brazil/SP-757331-NBD14/2021      8816   
416                           Zeta|hCoV-19/Japan/IC-0493/2020     15120   
417                 Zeta|hCoV-19/USA/CO-CDC-ASC210029043/2021      6064   
418                   Zeta|hCoV-19/USA/WA-UW-21041469819/2021      2160   

Predicted_nu Target_nu    A    C    G    T  
0                    A  4.0  6.0  1.0  

In [121]:
pivot_table['max'] = pivot_table[["A", "C", "T", "G"]].max(axis=1)
pivot_table['N'] = pivot_table[["A", "C", "T", "G"]].sum(axis=1)
pivot_table

Predicted_nu,id,Position,Target_nu,A,C,G,T,max,N
0,Alpha|hCoV-19/Belgium/Jessa_11-2119-002221/2021,21536,A,4.0,6.0,1.0,5.0,6.0,16.0
1,Alpha|hCoV-19/Bulgaria/21BG-EU_002140_Pl20/2021,16,C,4.0,6.0,1.0,5.0,6.0,16.0
2,Alpha|hCoV-19/Canada/QC-1nDJXQ-2120103813A/2021,21280,T,4.0,6.0,1.0,5.0,6.0,16.0
3,Alpha|hCoV-19/Canada/QC-1nIRE-U41918299Q/2021,13312,C,4.0,6.0,1.0,5.0,6.0,16.0
4,Alpha|hCoV-19/Denmark/DCGC-93213/2021,23136,T,4.0,6.0,1.0,5.0,6.0,16.0
...,...,...,...,...,...,...,...,...,...
414,Zeta|hCoV-19/Brazil/SP-1007/2021,29872,C,1.0,,1.0,,1.0,2.0
415,Zeta|hCoV-19/Brazil/SP-757331-NBD14/2021,8816,T,4.0,6.0,1.0,5.0,6.0,16.0
416,Zeta|hCoV-19/Japan/IC-0493/2020,15120,T,4.0,6.0,1.0,5.0,6.0,16.0
417,Zeta|hCoV-19/USA/CO-CDC-ASC210029043/2021,6064,G,4.0,6.0,1.0,5.0,6.0,16.0


In [123]:
nu = ['A', 'C', 'T', 'G']
for n in nu:
    pivot_table[f'{n}_max'] = np.where(pivot_table[n] == pivot_table['max'], n, '')


In [125]:
# Apply the function to each row
pivot_table['E'] = pivot_table.apply(lambda row: [row['A_max'], row['C_max'], row['T_max'], row['G_max']], axis=1)
pivot_table['F'] = pivot_table['E'].apply(lambda row: [x for x in row if x != ''])
pivot_table['final'] = pivot_table['F'].apply(lambda row: np.random.choice(row))
pivot_table['correct'] = np.where(pivot_table['Target_nu'] == pivot_table['final'], 1,0)
pivot_table

Predicted_nu,id,Position,Target_nu,A,C,G,T,max,N,A_max,C_max,T_max,G_max,E,F,final,correct
0,Alpha|hCoV-19/Belgium/Jessa_11-2119-002221/2021,21536,A,4.0,6.0,1.0,5.0,6.0,16.0,,C,,,"[, C, , ]",[C],C,0
1,Alpha|hCoV-19/Bulgaria/21BG-EU_002140_Pl20/2021,16,C,4.0,6.0,1.0,5.0,6.0,16.0,,C,,,"[, C, , ]",[C],C,1
2,Alpha|hCoV-19/Canada/QC-1nDJXQ-2120103813A/2021,21280,T,4.0,6.0,1.0,5.0,6.0,16.0,,C,,,"[, C, , ]",[C],C,0
3,Alpha|hCoV-19/Canada/QC-1nIRE-U41918299Q/2021,13312,C,4.0,6.0,1.0,5.0,6.0,16.0,,C,,,"[, C, , ]",[C],C,1
4,Alpha|hCoV-19/Denmark/DCGC-93213/2021,23136,T,4.0,6.0,1.0,5.0,6.0,16.0,,C,,,"[, C, , ]",[C],C,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414,Zeta|hCoV-19/Brazil/SP-1007/2021,29872,C,1.0,,1.0,,1.0,2.0,A,,,G,"[A, , , G]","[A, G]",G,0
415,Zeta|hCoV-19/Brazil/SP-757331-NBD14/2021,8816,T,4.0,6.0,1.0,5.0,6.0,16.0,,C,,,"[, C, , ]",[C],C,0
416,Zeta|hCoV-19/Japan/IC-0493/2020,15120,T,4.0,6.0,1.0,5.0,6.0,16.0,,C,,,"[, C, , ]",[C],C,0
417,Zeta|hCoV-19/USA/CO-CDC-ASC210029043/2021,6064,G,4.0,6.0,1.0,5.0,6.0,16.0,,C,,,"[, C, , ]",[C],C,0


In [127]:
pivot_table[['N','correct']].value_counts()

N     correct
16.0  0          314
      1           97
2.0   0            3
9.0   0            2
1.0   0            1
3.0   1            1
5.0   0            1
Name: count, dtype: int64

In [129]:
pivot_table.to_csv("COVID_RESULT", index=False)