In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Descriptors import ExactMolWt

In [111]:
df = pd.read_csv("glucose_dry_impcols.csv")#39_Formose reaction_MeOH.csv
print(df.columns)

Index(['Peak Number', 'Mass', 'Rel. Abundance', 'Kendrick Mass', 'Nom. Mass',
       'KMD', 'Molecular Formula'],
      dtype='object')


In [112]:
# first get rid of empty lines in the mass list by replacing with ''
df.replace('', np.nan, inplace=True)
# also, some 'Mass' values are not numbers
df.dropna(subset=['Mass'], inplace=True)
# now replace NaNs with '' to avoid weird errors
df.fillna('', inplace=True)
df.shape
df.head()


Unnamed: 0,Peak Number,Mass,Rel. Abundance,Kendrick Mass,Nom. Mass,KMD,Molecular Formula
0,1,514.2844,22.4919,513.7102,514.0,290.0,
1,2,226.06495,0.9022,225.8125,226.0,187.0,
2,3,240.0806,1.0122,239.8125,240.0,187.0,
3,4,436.2997,11.7966,435.8125,436.0,187.0,
4,5,464.331,13.2206,463.8126,464.0,187.0,


In [113]:
# make a list of exact mass and relative abundance.
mass_list = []
rel_abundance = []
for i in range(len(df)):
    # allow entire spectrum for this one
    if float(df['Mass'].iloc[i]) < 250 and "No Hit" not in df['Molecular Formula'].iloc[i]:
        mass_list.append(float(df['Mass'].iloc[i]))
        rel_abundance.append(float(df['Rel. Abundance'].iloc[i]))
# now, "renormalize" the relative abundance.
highest = max(rel_abundance)
norm_factor = 100.0/highest
normalized_abun = []
for ab in rel_abundance:
    normalized_abun.append(norm_factor*ab)
print(f'{len(mass_list)} items in {mass_list}')


125 items in [226.06495, 240.0806, 208.05439, 222.07004, 238.06495, 242.05176, 216.03616, 224.0412, 236.07758, 213.96383, 198.02561, 225.0616, 239.07724, 235.04595, 249.06159, 227.20167, 241.21731, 193.03539, 207.05104, 221.06668, 235.08233, 179.05613, 223.02482, 237.04047, 241.03539, 195.05104, 223.04595, 237.0616, 205.01426, 219.02991, 233.04555, 217.03538, 231.05103, 245.06668, 195.02992, 209.04556, 223.06121, 177.01936, 191.035, 205.05065, 233.08194, 213.04047, 227.05612, 241.07177, 247.02481, 207.02991, 221.04556, 235.0612, 205.03539, 219.05104, 233.06669, 247.08233, 225.04047, 239.05612, 243.0299, 231.0299, 245.04555, 235.02482, 249.04046, 161.04557, 243.05104, 189.01935, 203.035, 217.05064, 245.08194, 177.04049, 179.03501, 193.05065, 207.0663, 227.06584, 241.08149, 199.04009, 227.07138, 241.08702, 247.0612, 227.03499, 241.05064, 183.02992, 197.04556, 211.06121, 225.07686, 175.04009, 189.05574, 217.08702, 201.04048, 215.05612, 243.08741, 219.06629, 181.05066, 195.0663, 209.08195,

In [115]:
# formose MOD output
# ../main/formose/formose_output.txt
data_mod = pd.read_csv('../main/glucose/glucose_degradation_output_10mar.txt', sep='\t', names=['Generation', 'SMILES'])
sim_masses = []
for i in range(len(formose_mod)):
    row = formose_mod.iloc[i]
    mol = MolFromSmiles(row['SMILES'])
    mol_wt = ExactMolWt(mol)
    sim_masses.append(mol_wt)
data_mod['Mol Wt'] = sim_masses

ValueError: Length of values (192004) does not match length of index (48403)

In [None]:
unique_sim_masses = list(set(sim_masses))
unique_mass_freq = [sim_masses.count(mass) for mass in unique_sim_masses]
highest_freq = max(unique_mass_freq)

norm_freq = [100*(freq/highest_freq) for freq in unique_mass_freq]

print('Unique masses:',len(unique_sim_masses))
print('Frequency of each mass', unique_mass_freq)

In [None]:
print(unique_sim_masses)

In [None]:
from matplotlib import rc

# Use LaTeX and CMU Serif font.
rc('text', usetex=True)
rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})

In [None]:
# for some flexibility, create a container for the figure
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(6, 12), sharex=True) # create a figure object
#ax = fig.add_subplot(111) # create an axis object

# first, draw the experimental spectrum
axes[0].vlines(x=mass_list, ymin=0, ymax=normalized_abun, color='cornflowerblue')

# now the CNRN
axes[1].vlines(x=unique_sim_masses, ymin=0, ymax=norm_freq, color='deeppink')

#plt.bar(mass_list, rel_abundance, width=0.5)
axes[0].set_yscale('log')
axes[1].set_yscale('log')
axes[0].set_ylim([0.875, 125])
axes[1].set_ylim([0.875, 125])
plt.gca().invert_yaxis()
plt.xlim(155, 205)
plt.xlabel('Exact Mass')
#plt.ylabel('Normalized Abundance')
plt.tight_layout()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('glucose_mirror_plot.jpg', dpi=300)
plt.show()