# Load your predicted values

In [3]:
import pandas as pd

# Path to your predictions file, in csv format. It needs at least these columns:
#   - 'mutant'
#   - 'mutated_sequence' (the key the Spearman correlation cell merges on)
#   - one of ['Activity_cutinases', 'Activity_proteingym', 'Expression', 'Stability']
your_uploaded_data = ''

# An unset path would otherwise reach pd.read_csv('') and fail with an
# unhelpful "No such file or directory: ''" error.
if not your_uploaded_data:
  raise ValueError('Set `your_uploaded_data` to the path of your predictions CSV before running this cell.')

predicted_data = pd.read_csv(your_uploaded_data)

In [4]:
# @title Spearman correlation
# Compares the uploaded predictions (`predicted_data`) with the experimental
# measurements of the selected assay and prints the Spearman correlation:
# one value per protein for 'Activity_proteingym', a single value otherwise.

# choose dataset to compare
experimental_assay = 'Activity_cutinases' # @param ["Activity_proteingym", "Expression", "Stability", "Activity_cutinases"]
experimental_data = pd.read_csv(f'colab_datasets_{experimental_assay}.csv')

merge_key = 'mutated_sequence'

# Fail early with a readable message instead of an opaque KeyError raised
# from inside the merge or the correlation lookup.
missing_columns = [col for col in (merge_key, experimental_assay) if col not in predicted_data.columns]
if missing_columns:
  raise KeyError(f'predicted_data is missing required column(s): {missing_columns}')

# Experimental columns needed for the selected assay.
experimental_columns = ['mutant', 'mutated_sequence', experimental_assay]
if experimental_assay == 'Activity_proteingym':
  experimental_columns += ['protein', 'mean_qtmscore_petases']

# pd.merge appends the suffixes only to non-key columns that exist in BOTH
# frames. Resolve each experimental column's post-merge name from the real
# overlap, so the lookups below work whether or not the uploaded file also
# carries columns such as 'protein' or 'mean_qtmscore_petases'.
shared_columns = (set(experimental_columns) & set(predicted_data.columns)) - {merge_key}
actual_name = {
    col: (col + '_actual' if col in shared_columns else col)
    for col in experimental_columns
}
actual_col = actual_name[experimental_assay]
# The assay column was validated above, so it is always shared and suffixed.
predicted_col = experimental_assay + '_predicted'

# merge predicted and ground truth dataframes
# NOTE(review): duplicated `mutated_sequence` values in either file yield one
# row per pairing in the merge; confirm the key is unique in both inputs.
merged_df = pd.merge(experimental_data[experimental_columns],
                      predicted_data,
                      on=merge_key,
                      suffixes=('_actual', '_predicted'))

# Without this check an empty merge would silently report NaN.
if merged_df.empty:
  raise ValueError(f'No rows of predicted_data match the experimental data on {merge_key!r}.')

if experimental_assay == 'Activity_proteingym':
  # calculate spearman correlation with pandas corr() by protein
  grouped = merged_df.groupby(actual_name['protein'])
  data = {
      'protein': [],
      'spearman_correlation': [],
      'mean_qtmscore': []
  }

  for protein_name, group_df in grouped:
      correlation = group_df[actual_col].corr(group_df[predicted_col], method='spearman')
      data['protein'].append(protein_name)
      data['spearman_correlation'].append(correlation)
      # The first row's value is used as the protein's mean qTM-score.
      data['mean_qtmscore'].append(group_df[actual_name['mean_qtmscore_petases']].iloc[0])

  df = pd.DataFrame.from_dict(data).sort_values(by='mean_qtmscore', ascending=False)
  print(df)

else:
  # calculate spearman correlation with pandas corr()
  correlation = merged_df[actual_col].corr(merged_df[predicted_col], method='spearman')
  print(f'Spearman correlation of {experimental_assay}: ', correlation)



Spearman correlation of Activity_cutinases:  0.9999010587486867


In [2]:
# @title Creating datasets (NOT RUN)
# Reads 'cutinase_fitness.csv', adds a 'Fitness' column (control-subtracted
# average absorbance divided by Bradford absorbance), writes the table sorted
# by descending Fitness to 'ranked_cutinase_fitness.csv' and prints the top 10.
import pandas as pd

# Load the dataframe
df = pd.read_csv('cutinase_fitness.csv')

# Display the first few rows and column info to understand the data
print(df.head())
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only added a stray "None" line to the output.
df.info()

# Calculate Fitness (Specific Activity)
# Rows whose Bradford absorbance is not strictly positive (zero, negative or
# missing) get a Fitness of 0, which also avoids dividing by zero.
absorbance = df['Average Absorbance (control subtracted)']
bradford = df['Bradford Absorbance']
df['Fitness'] = (absorbance / bradford).where(bradford > 0, 0)

# Sort by Fitness descending
df_sorted = df.sort_values(by='Fitness', ascending=False)

# Select relevant columns for display
display_cols = ['Name', 'Fitness', 'Average Absorbance (control subtracted)', 'Bradford Absorbance', 'Sequence']
top_sequences = df_sorted[display_cols].head(10)

# Save the sorted data to a new CSV
output_filename = 'ranked_cutinase_fitness.csv'
df_sorted.to_csv(output_filename, index=False)

print(top_sequences)
print(f"Top Fitness Value: {df_sorted['Fitness'].max()}")
print(f"Lowest Fitness Value: {df_sorted['Fitness'].min()}")

                                            Sequence    Name     Phase  \
0  MAENPYERGPDPTEASIEASRGPFAIAQVSVPSGSGSGFGGGTIYY...   1_426  II - ASR   
1  MAENPYERGPDPTEASIEASRGPFAIAQVTVPSGSGSGFGGGTIYY...   1_427  II - ASR   
2  MAANPYERGPDPTESSLEASSGPFSVSQTSVSRLSVSGFGGGTIYY...   1_533  II - ASR   
3  MQANPYQRGPDPTESSLEASSGPFSVSTTSVSRLSVSGFGGGTIYY...   1_534  II - ASR   
4  MAENPYERGPDPTEASIEASRGPFAISQVSVPSGSGSGFGGGTIYY...  10_435  II - ASR   

   Rep 1 A260 nm (a.u.) (control subtracted)  \
0                                     -0.001   
1                                      0.000   
2                                      0.184   
3                                      0.011   
4                                      0.002   

   Rep 2 A260 nm (a.u.) (control subtracted)  \
0                                     -0.003   
1                                     -0.001   
2                                      0.247   
3                                      0.018   
4                         