In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# list has sublist which has site and number of sample at each site

data = [['A1', 28], ['A2', 32], ['A3', 1], ['A4', 0],
        ['A5', 10], ['A6', 22], ['A7', 30], ['A8', 19],
        ['B1', 145], ['B2', 27], ['B3', 36], ['B4', 25],
        ['B5', 9], ['B6', 38], ['B7', 21], ['B8', 12],
        ['C1', 122], ['C2', 87], ['C3', 36], ['C4', 3],
        ['D1', 0], ['D2', 5], ['D3', 55], ['D4', 62],
        ['D5', 98], ['D6', 32]]

# Basic Python 1

In [3]:
# How many sites are there ?
site_lst = [sublist[0] for sublist in data]
len(list(set(site_lst)))
# How many birds were counted at the 7th site?
data[7][1]
# How many birds were counted at the last site?
data[-1][1]
# What is the total number of birds counted across all sites?
pop_lst = [sublist[1] for sublist in data]
total_pop = sum(pop_lst)
total_pop
# What is the average number of birds seen on a site?
avg_lst = [sublist[1]/total_pop for sublist in data]
print(avg_lst)
# What is the total number of birds counted on sites with codes beginning with C? 
c_pop_lst = [sublist[1] for sublist in data if sublist[0].startswith('C')]
sum(c_pop_lst)

26

19

32

955

[0.02931937172774869, 0.033507853403141365, 0.0010471204188481676, 0.0, 0.010471204188481676, 0.023036649214659685, 0.031413612565445025, 0.019895287958115182, 0.1518324607329843, 0.028272251308900525, 0.03769633507853403, 0.02617801047120419, 0.009424083769633508, 0.039790575916230364, 0.02198952879581152, 0.012565445026178011, 0.12774869109947645, 0.09109947643979058, 0.03769633507853403, 0.0031413612565445027, 0.0, 0.005235602094240838, 0.05759162303664921, 0.06492146596858639, 0.10261780104712041, 0.033507853403141365]


248

# Basic Python 2

In [4]:
def read_data(filename):
    df = pd.read_csv(filename)
    return df

In [7]:
df = read_data(r'houseelf_earlength_dna_data.csv')

In [8]:
df

Unnamed: 0,id,earlength,dnaseq
0,17A,5.1,CCGCATCTTGACTTAACTGACATATTACCATAGATGACTAGCCATG...
1,24P,7.5,GCTATGACTTGCTTAGCTACGTATGAAGGAAGAAACTTTTGTGTAT...
2,09Q,12.2,CCGCCGATTGATACAGGGGACGGTGACGTCGTCATAGATTCGGCAC...
3,65Y,9.9,GCAGGAGAAGTTCTTAACCTTCTCGTAGGACGTCAACCTATTCTTT...
4,19N,10.0,TCTTCATCCTTATCAAAGTTTGGAGTCAATGATCAGGATTATTGCC...
5,92K,14.6,ACCGATGGACAATGATTCGGGTAGCACCAGGAGTCCGTAGCGCGTG...
6,33W,8.2,CAGCTTGACTCGGTCTGTTAGGCCACGATTACGTGAGTTAGGGCTC...
7,98C,17.8,CTGCATGCTAGGTTGACACGCCTGCACTGCTCGAAGAAAATATGCG...
8,75G,9.4,CTTATTTAGATAACATGATTAGCCGAAGTTGTACGGGATATCCACC...
9,88Q,11.3,GATTGCTCGCACATGAGCAAAACGGTAGAGCGTCACTTTCAGCCCT...


In [9]:
def get_ear_size(df):
    if df['earlength'] > 10:
        return 'large'
    else:
        return 'small'

In [10]:
ans_df = pd.DataFrame()
ans_df['id'] = df['id']
ans_df['earsize'] = df.apply(get_ear_size,axis=1)

In [11]:
ans_df

Unnamed: 0,id,earsize
0,17A,small
1,24P,small
2,09Q,large
3,65Y,small
4,19N,small
5,92K,large
6,33W,small
7,98C,large
8,75G,small
9,88Q,large


In [13]:
def get_gc_percent(df):
    
    obj_dnaseq = df['dnaseq']
    g_num = obj_dnaseq.count('G')
    c_num = obj_dnaseq.count('C')
    total_length = len(obj_dnaseq)
    gc_dict = {'G_percent' : g_num/total_length * 100, 'C_percent' : c_num/total_length * 100}
    return gc_dict
    

In [14]:
ans_df['gc_perc'] = df.apply(get_gc_percent,axis=1)
ans_df

Unnamed: 0,id,earsize,gc_perc
0,17A,small,"{'G_percent': 20.0, 'C_percent': 21.0}"
1,24P,small,"{'G_percent': 22.0, 'C_percent': 17.0}"
2,09Q,large,"{'G_percent': 30.0, 'C_percent': 27.0}"
3,65Y,small,"{'G_percent': 19.0, 'C_percent': 21.0}"
4,19N,small,"{'G_percent': 17.0, 'C_percent': 19.0}"
5,92K,large,"{'G_percent': 33.0, 'C_percent': 28.9999999999..."
6,33W,small,"{'G_percent': 24.0, 'C_percent': 28.0000000000..."
7,98C,large,"{'G_percent': 32.0, 'C_percent': 31.0}"
8,75G,small,"{'G_percent': 20.0, 'C_percent': 27.0}"
9,88Q,large,"{'G_percent': 27.0, 'C_percent': 25.0}"


In [20]:
def get_avg_gc_percent(df):
    
    small_df = df[df['earsize'] == 'small']
    large_df = df[df['earsize'] == 'large']
    
    small_g_avg_perc = small_df['gc_perc'].apply(lambda x : x['G_percent']).mean()
    small_c_avg_perc = small_df['gc_perc'].apply(lambda x : x['C_percent']).mean()
    
    large_g_avg_perc = large_df['gc_perc'].apply(lambda x : x['G_percent']).mean()
    large_c_avg_perc = large_df['gc_perc'].apply(lambda x : x['C_percent']).mean()
    
    return small_g_avg_perc,small_c_avg_perc,large_g_avg_perc,large_c_avg_perc

In [21]:
small_g_avg_perc,small_c_avg_perc,large_g_avg_perc,large_c_avg_perc = get_avg_gc_percent(ans_df)

In [23]:
print(f'Avg G percent for small ear : {small_g_avg_perc}\nAvg C percent for small ear : {small_c_avg_perc}')
print(f'Avg G percent for large ear : {large_g_avg_perc}\nAvg G percent for large ear : {large_c_avg_perc}')

Avg G percent for small ear : 20.333333333333332
Avg C percent for small ear : 22.166666666666668
Avg G percent for large ear : 30.5
Avg G percent for large ear : 28.0


In [24]:
ans_df.to_csv('granger_analysis.csv')