In [None]:
from collections import Counter

import numpy as np
import pandas as pd

In [None]:
def _load_sequences(csv_path):
    """Read a sequence table and normalise its ``'sequence'`` column.

    Newline characters are removed from every sequence and all bases are
    lower-cased; a ``'seq_length'`` column holding the cleaned length is
    then added.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file that has a ``'sequence'`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``'sequence'`` cleaned and ``'seq_length'``
        added.

    Raises
    ------
    ValueError
        If any ``'sequence'`` entry is missing.
    """
    table = pd.read_csv(csv_path)
    # Fail here rather than let NaN flow silently through the .str accessors.
    if table['sequence'].isna().any():
        raise ValueError(f"{csv_path}: 'sequence' column contains missing values")
    # clean sequence entry and make all bases lowercase
    # (regex=False: '\n' is a literal newline, not a pattern)
    table['sequence'] = table['sequence'].str.replace('\n', '', regex=False).str.lower()
    table['seq_length'] = table['sequence'].str.len()
    return table


# Load and clean both datasets with the same routine.
full = _load_sequences('ncrna.csv')
subset = _load_sequences('ncrna_subset.csv')

In [68]:
# Summary statistics of the cleaned sequence lengths in the full dataset.
full['seq_length'].describe()

count    8.489773e+06
mean     1.145850e+02
std      2.952030e+02
min      2.500000e+01
25%      6.000000e+01
50%      6.000000e+01
75%      9.400000e+01
max      1.079900e+04
Name: seq_length, dtype: float64

In [69]:
# Summary statistics of the cleaned sequence lengths in the subset.
subset['seq_length'].describe()

count    100000.000000
mean        114.071680
std         294.451839
min          30.000000
25%          60.000000
50%          60.000000
75%          94.000000
max        9265.000000
Name: seq_length, dtype: float64

In [75]:
def _add_gc_content(df):
    """Add ``g_count``, ``c_count`` and ``GC_content`` columns to ``df`` in place.

    ``g_count`` and ``c_count`` are the per-row occurrences of the lower-case
    letters ``'g'`` and ``'c'`` in ``df['sequence']``. ``GC_content`` is their
    sum divided by ``df['seq_length']``, expressed as a percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``'sequence'`` and ``'seq_length'`` columns.

    Returns
    -------
    None
        The frame is modified in place; nothing is returned so that calling
        this as the last statement of a cell produces no output.
    """
    df['g_count'] = df['sequence'].str.count('g')
    df['c_count'] = df['sequence'].str.count('c')
    df['GC_content'] = ((df['g_count'] + df['c_count']) / df['seq_length']) * 100


# Get GC content of each dataset
_add_gc_content(full)
_add_gc_content(subset)

In [78]:
# Summary statistics of GC content (percent) in the full dataset.
# Left as a bare last expression so the notebook displays it itself.
full['GC_content'].describe()

count    8.489773e+06
mean     5.168625e+01
std      8.321846e+00
min      8.276644e+00
25%      4.767699e+01
50%      5.333333e+01
75%      5.666667e+01
max      8.939394e+01
Name: GC_content, dtype: float64


In [79]:
# Summary statistics of GC content (percent) in the subset.
# Left as a bare last expression so the notebook displays it itself.
subset['GC_content'].describe()

count    100000.000000
mean         51.666095
std           8.297734
min          13.717026
25%          47.663551
50%          53.333333
75%          56.666667
max          87.142857
Name: GC_content, dtype: float64


In [64]:
def entropy(df):
    """Return the Shannon entropy, in bits, of the pooled base composition.

    Every character of every string in ``df['sequence']`` is tallied across
    the whole frame, the tallies are normalised to proportions ``p``, and
    ``-sum(p * log2(p))`` is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``'sequence'`` column of strings. A ``'seq_length'``
        column is not required: the denominator is the number of characters
        actually counted, which equals the sum of the sequence lengths.

    Returns
    -------
    numpy.float64
        Entropy in bits; ``0.0`` when the frame holds no characters at all.
    """
    # Tally characters literally in a single pass. Handing each symbol to
    # ``Series.str.count`` would interpret it as a regular expression, so a
    # '.' would match every character and '+', '*' or '?' would raise.
    base_counts = Counter()
    for sequence in df['sequence']:
        base_counts.update(sequence)

    total = sum(base_counts.values())
    if total == 0:
        # No symbols observed: avoid 0/0 and report zero entropy.
        return np.float64(0.0)

    # Every tallied symbol occurs at least once, so no proportion is zero
    # and log2 is always finite.
    base_props = np.fromiter(base_counts.values(), dtype=float) / total
    return -np.sum(base_props * np.log2(base_props))


In [66]:
# Base-composition entropy (log base 2) pooled over all sequences in the subset.
entropy(subset)

np.float64(2.0079234374642563)

In [67]:
# Base-composition entropy (log base 2) pooled over all sequences in the full dataset.
entropy(full)

np.float64(2.0100741662326946)