# Identify introductions from trees

## Analysis

In [1]:
import dendropy
import pandas as pd
from tqdm.autonotebook import tqdm
tqdm.pandas()
import collections
from itertools import combinations

  from tqdm.autonotebook import tqdm


In [2]:
def remove_continent(x):
    if '|' in x['country']:
        return x['country']
    else:
        if len(x['country'].split(' / ')) > 1:
            return x['country'].split(' / ')[1]
        else:
            return x['country']

### Calculate distances between countries

In [3]:
country_lat_long = pd.read_csv('../datasets/gavinr_world-countries-centroids-0.2.0/dist/countries.csv')
country_lat_long

Unnamed: 0,longitude,latitude,COUNTRY,ISO,COUNTRYAFF,AFF_ISO
0,-170.700732,-14.305712,American Samoa,AS,United States,US
1,-61.984936,15.421124,United States Minor Outlying Islands,UM,United States,US
2,-161.465133,-17.435370,Cook Islands,CK,New Zealand,NZ
3,-145.255376,-13.526317,French Polynesia,PF,France,FR
4,-169.868781,-19.052309,Niue,NU,New Zealand,NZ
...,...,...,...,...,...,...
244,145.684687,15.090138,Northern Mariana Islands,MP,United States,US
245,134.566640,7.507586,Palau,PW,Palau,PW
246,98.969888,61.452643,Russian Federation,RU,Russian Federation,RU
247,-2.942967,40.030385,Spain,ES,Spain,ES


In [4]:
country_lat_long.drop(columns=['ISO', 'COUNTRYAFF', 'AFF_ISO'], inplace=True)
country_lat_long

Unnamed: 0,longitude,latitude,COUNTRY
0,-170.700732,-14.305712,American Samoa
1,-61.984936,15.421124,United States Minor Outlying Islands
2,-161.465133,-17.435370,Cook Islands
3,-145.255376,-13.526317,French Polynesia
4,-169.868781,-19.052309,Niue
...,...,...,...
244,145.684687,15.090138,Northern Mariana Islands
245,134.566640,7.507586,Palau
246,98.969888,61.452643,Russian Federation
247,-2.942967,40.030385,Spain


In [5]:
country_lat_long[country_lat_long['COUNTRY'] == 'Ireland']

Unnamed: 0,longitude,latitude,COUNTRY
86,-8.258827,53.306147,Ireland


In [6]:
country_lat_long.sort_values(by='COUNTRY').tail(100).head(50)

Unnamed: 0,longitude,latitude,COUNTRY
106,35.208577,-17.52523,Mozambique
228,97.086915,19.777377,Myanmar
107,18.164513,-21.908582,Namibia
218,166.929376,-0.522102,Nauru
197,84.13389,28.300921,Nepal
162,5.474577,52.137515,Netherlands
219,166.275945,-21.25389,New Caledonia
94,56.866654,-43.657479,New Zealand
52,-85.016088,12.893567,Nicaragua
133,8.868632,17.081054,Niger


Countries not in dataset or that need to be renamed to match
```
USA
Taiwan
Democratic Republic of the Congo
Hong Kong
Russia
The Bahamas
U.S. Virgin Islands
Reunion
Canary Islands
Palestine
Brunei
Crimea
Republic of the Congo
Cote d'Ivoire
```

#### Rename countries to match

In [7]:
country_lat_long.loc[country_lat_long['COUNTRY'] == 'United States', 'COUNTRY'] = 'USA'
country_lat_long.loc[country_lat_long['COUNTRY'] == 'Russian Federation', 'COUNTRY'] = 'Russia'
country_lat_long.loc[country_lat_long['COUNTRY'] == 'Congo', 'COUNTRY'] = 'Democratic Republic of the Congo'
country_lat_long.loc[country_lat_long['COUNTRY'] == 'Congo DRC', 'COUNTRY'] = 'Republic of the Congo'
country_lat_long.loc[country_lat_long['COUNTRY'] == 'Bahamas', 'COUNTRY'] = 'The Bahamas'
country_lat_long.loc[country_lat_long['COUNTRY'] == 'US Virgin Islands', 'COUNTRY'] = 'U.S. Virgin Islands'
country_lat_long.loc[country_lat_long['COUNTRY'] == 'Brunei Darussalam', 'COUNTRY'] = 'Brunei'
country_lat_long.loc[country_lat_long['COUNTRY'] == 'Côte d\'Ivoire', 'COUNTRY'] = 'Cote d\'Ivoire'

Countries not in dataset
```
Taiwan
Hong Kong
Reunion
Canary Islands
Palestine
Crimea
```

#### Add missing countries

In [182]:
#for country in strainCountryAll['country'].unique():
#    if len(country.split(' / ')) > 1:
#        if country.split(' / ')[1] not in country_lat_long['COUNTRY'].tolist():
#            print(country.split(' / ')[1])

Taiwan
Hong Kong
Reunion
Canary Islands
Palestine
Crimea


In [8]:
additional_countries = pd.DataFrame([{'longitude': 120.982, 'latitude': 23.973861, 'COUNTRY': 'Taiwan'},
                        {'longitude': 114.1372, 'latitude': 22.3453, 'COUNTRY': 'Hong Kong'},
                        {'longitude': 55.5325, 'latitude': -21.114444, 'COUNTRY': 'Reunion'},
                        {'longitude': -15.745798, 'latitude': 28.47707, 'COUNTRY': 'Canary Islands'},
                        {'longitude': 35, 'latitude': 31.4, 'COUNTRY': 'Palestine'},
                        {'longitude': 34.6, 'latitude': 45.25, 'COUNTRY': 'Crimea'},
                        {'longitude': -1.464854, 'latitude': 52.561928, 'COUNTRY': 'England'},
                        {'longitude': -4.183963, 'latitude': 56.816738, 'COUNTRY': 'Scotland'},
                        {'longitude': -3.766409, 'latitude': 52.33022, 'COUNTRY': 'Wales'},
                        {'longitude': 20.85, 'latitude': 42.55, 'COUNTRY': 'Kosovo'}])

country_lat_long = pd.concat([country_lat_long, additional_countries], ignore_index = True, axis = 0)
country_lat_long

Unnamed: 0,longitude,latitude,COUNTRY
0,-170.700732,-14.305712,American Samoa
1,-61.984936,15.421124,United States Minor Outlying Islands
2,-161.465133,-17.435370,Cook Islands
3,-145.255376,-13.526317,French Polynesia
4,-169.868781,-19.052309,Niue
...,...,...,...
254,34.600000,45.250000,Crimea
255,-1.464854,52.561928,England
256,-4.183963,56.816738,Scotland
257,-3.766409,52.330220,Wales


#### Calculate distance between countries using Haversine formula

In [9]:
# https://stackoverflow.com/a/4913653
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

In [10]:
country_lat_long['distance_from_ireland'] = country_lat_long.apply(lambda x: haversine(x['longitude'],
                                          x['latitude'],
                                          country_lat_long[country_lat_long['COUNTRY'] == 'Ireland']['longitude'].values[0],
                                          country_lat_long[country_lat_long['COUNTRY'] == 'Ireland']['latitude'].values[0]),
                       axis=1)
country_lat_long

Unnamed: 0,longitude,latitude,COUNTRY,distance_from_ireland
0,-170.700732,-14.305712,American Samoa,15412.146132
1,-61.984936,15.421124,United States Minor Outlying Islands,6266.570625
2,-161.465133,-17.435370,Cook Islands,15402.208170
3,-145.255376,-13.526317,French Polynesia,14206.711886
4,-169.868781,-19.052309,Niue,15891.064102
...,...,...,...,...
254,34.600000,45.250000,Crimea,3182.289574
255,-1.464854,52.561928,England,462.610475
256,-4.183963,56.816738,Scotland,468.558618
257,-3.766409,52.330220,Wales,320.735659


In [11]:
country_lat_long.sort_values(by='distance_from_ireland').head(10)

Unnamed: 0,longitude,latitude,COUNTRY,distance_from_ireland
86,-8.258827,53.306147,Ireland,0.0
87,-4.532995,54.228553,Isle of Man,265.445406
257,-3.766409,52.33022,Wales,320.735659
89,-3.156996,54.900832,United Kingdom,376.805773
255,-1.464854,52.561928,England,462.610475
256,-4.183963,56.816738,Scotland,468.558618
85,-2.576393,49.458708,Guernsey,581.490166
88,-2.12916,49.215397,Jersey,623.125265
162,5.474577,52.137515,Netherlands,932.527741
159,4.67501,50.618214,Belgium,933.661614


In [12]:
country_lat_long.rename(columns={'COUNTRY': 'country'}, inplace=True)

In [13]:
country_lat_long = country_lat_long[country_lat_long['country'] != 'United Kingdom']

### Timepoint A

In [14]:
tree1 = dendropy.Tree.get(path="../outputFiles/timepoint1_all_pastml/named.tree_timepoint1.rooted.nwk", schema="newick")
tree1

<Tree object at 0x110313730>

In [15]:
strainCountryTimepoint1 = pd.read_csv('../outputFiles/timepoint1.strainCountry.all.txt')
strainCountryTimepoint1

Unnamed: 0,strain,country
0,EPI_ISL_489144,England
1,EPI_ISL_489033,Scotland
2,EPI_ISL_488811,England
3,EPI_ISL_488949,Scotland
4,EPI_ISL_489023,Scotland
...,...,...
103311,EPI_ISL_955999,North America / USA
103312,EPI_ISL_955987,North America / USA
103313,EPI_ISL_429770,Europe / Luxembourg
103314,EPI_ISL_429777,Europe / Luxembourg


In [16]:
strainCountryTimepoint1['strain'] = strainCountryTimepoint1['strain'].str.replace('_',' ')

In [17]:
strainCountryTimepoint1.describe()

Unnamed: 0,strain,country
count,103316,103316
unique,103316,156
top,EPI ISL 489144,North America / USA
freq,1,27699


In [18]:
strainCountryTimepoint1[strainCountryTimepoint1['country'].isin(['Republic of Ireland','Northern Ireland'])].groupby('country').count()

Unnamed: 0_level_0,strain
country,Unnamed: 1_level_1
Northern Ireland,633
Republic of Ireland,714


In [19]:
def find_strain_nodes(node, into_ireland_nodes, between_ireland_nodes, out_of_ireland_nodes):
    
    # Increment the appropriate counter for the strain's country
    node_country = str(node.annotations["country"])
    node_label = str(node.label)
    
    for child in node.child_nodes():
        
        child_country = str(child.annotations["country"])
        
        if node_country not in ["country='Republic of Ireland'", "country='Northern Ireland'"] and child_country in ["country='Republic of Ireland'", "country='Northern Ireland'"]:
            into_ireland_nodes.append({'node': node_label, 'parent_country': node_country, 'country': child_country})
        elif node_country in ["country='Republic of Ireland'", "country='Northern Ireland'"] and child_country not in ["country='Republic of Ireland'", "country='Northern Ireland'"]:
            out_of_ireland_nodes.append({'node': node_label, 'parent_country': node_country, 'country': child_country})
        elif node_country in ["country='Republic of Ireland'", "country='Northern Ireland'"] and child_country in ["country='Republic of Ireland'", "country='Northern Ireland'"] and node_country != child_country:
            between_ireland_nodes.append({'node': node_label, 'parent_country': node_country, 'country': child_country})
        
        find_strain_nodes(child, into_ireland_nodes, between_ireland_nodes, out_of_ireland_nodes)
        
def process_tree(tree):
    into_ireland_nodes = []
    between_ireland_nodes = []
    out_of_ireland_nodes = []
    
    find_strain_nodes(tree.seed_node, into_ireland_nodes, between_ireland_nodes, out_of_ireland_nodes)
    
    return {
        'into_ireland_nodes': into_ireland_nodes,
        'between_ireland_nodes': between_ireland_nodes,
        'out_of_ireland_nodes': out_of_ireland_nodes
    }

In [20]:
timepoint1 = process_tree(tree1)

In [18]:
### OLD CODE THAT ONLY CONSIDERS ONE NODE ABOVE

# def find_strain_nodes(strainCountry, tree):
#     roi_count = 0
#     ni_count = 0
#     into_ireland_nodes = []
#     between_ireland_nodes = []
# 
#     for strain in tqdm(strainCountry[strainCountry['country'].isin(['Republic of Ireland','Northern Ireland'])]['strain'].to_list()):
#         node = tree.find_node_with_taxon_label(strain)
#         
#         if not node:
#             continue
# 
#         # Increment the appropriate counter for the strain's country
#         if str(node.annotations["country"]) == "country='Republic of Ireland'":
#             roi_count += 1
#         elif str(node.annotations["country"]) == "country='Northern Ireland'":
#             ni_count += 1
# 
#         # Check the parent node's country
#         parent_node = node.parent_node
#         parent_country = str(parent_node.annotations["country"])
#         if parent_country not in ["country='Republic of Ireland'", "country='Northern Ireland'"]:
#             into_ireland_nodes.append({'node': str(parent_node.label), 'parent_country': parent_country, 'country': str(node.annotations["country"])})
#         elif parent_country != str(node.annotations["country"]):
#             between_ireland_nodes.append({'node': str(parent_node.label), 'parent_country': parent_country, 'country': str(node.annotations["country"])})
# 
#     return {
#         'roi_count': roi_count,
#         'ni_count': ni_count,
#         'into_ireland_nodes': into_ireland_nodes,
#         'between_ireland_nodes': between_ireland_nodes
#     }

In [19]:
#timepoint1 = find_strain_nodes(strainCountryTimepoint1, tree1)

  0%|          | 0/1347 [00:00<?, ?it/s]

In [21]:
len(timepoint1['into_ireland_nodes'])

196

In [22]:
len(list({v['node']:v for v in timepoint1['into_ireland_nodes']}.values()))

170

In [23]:
len(timepoint1['between_ireland_nodes'])

3

In [24]:
len(list({v['node']:v for v in timepoint1['between_ireland_nodes']}.values()))

3

In [25]:
len(timepoint1['out_of_ireland_nodes'])

100

In [26]:
len(list({v['node']:v for v in timepoint1['out_of_ireland_nodes']}.values()))

33

In [27]:
timepoint1['between_ireland_nodes']

[{'node': 'node 777619',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 777473',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 767334',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"}]

In [28]:
timepoint1['into_ireland_nodes'] = list({v['node']:v for v in timepoint1['into_ireland_nodes']}.values())

In [29]:
timepoint1['into_ireland_nodes'] = list(map(lambda x: {'node': x['node'].replace(' ','_'), 'country': x['country'][9:-1], 'parent_country': x['parent_country'][9:-1]}, timepoint1['into_ireland_nodes']))

In [30]:
timepoint1['into_ireland_nodes_without_ambiguousinto_ireland_nodes_without_ambiguous'] = list(filter(lambda x: "|" not in x['parent_country'], timepoint1['into_ireland_nodes']))

In [31]:
len(timepoint1['into_ireland_nodes_without_ambiguousinto_ireland_nodes_without_ambiguous'])

168

In [34]:
timepoint_1_country_counter = collections.Counter(list(map(lambda x: x['country'], timepoint1['into_ireland_nodes'])))
timepoint_1_country_frequency = timepoint_1_country_counter.most_common()
timepoint_1_country_frequency

[('Republic of Ireland', 103), ('Northern Ireland', 67)]

In [35]:
timepoint_1_roi_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Republic of Ireland', timepoint1['into_ireland_nodes']))))
timepoint_1_roi_parent_country_frequency = timepoint_1_roi_parent_country_counter.most_common()
timepoint_1_roi_parent_country_frequency

[('England', 61),
 ('North America / USA', 12),
 ('Europe / Netherlands', 4),
 ('Europe / Spain', 4),
 ('Asia / China', 3),
 ('Europe / Germany', 3),
 ('Europe / France', 3),
 ('Asia / Japan', 2),
 ('Europe / Sweden', 1),
 ('Europe / Portugal', 1),
 ('Europe / Italy', 1),
 ('Europe / Belgium', 1),
 ('Asia / Bangladesh', 1),
 ('Wales', 1),
 ('Europe / Switzerland', 1),
 ('Scotland', 1),
 ('Oceania / New Zealand', 1),
 ('South America / Chile|Republic of Ireland|England', 1),
 ('Europe / Denmark', 1)]

In [36]:
timepoint_1_ni_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Northern Ireland', timepoint1['into_ireland_nodes']))))
timepoint_1_ni_parent_country_frequency = timepoint_1_ni_parent_country_counter.most_common()
timepoint_1_ni_parent_country_frequency

[('England', 39),
 ('Scotland', 7),
 ('North America / USA', 5),
 ('Europe / France', 2),
 ('Asia / Japan', 2),
 ('Europe / Germany', 2),
 ('Europe / Spain', 2),
 ('Europe / Sweden', 1),
 ('Europe / Switzerland', 1),
 ('South America / Chile', 1),
 ('Oceania / Australia', 1),
 ('Europe / Netherlands', 1),
 ('Asia / China', 1),
 ('Wales', 1),
 ('Europe / Germany|Europe / Austria', 1)]

In [37]:
timepoint_1_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], timepoint1['into_ireland_nodes'])))
timepoint_1_parent_country_frequency = timepoint_1_parent_country_counter.most_common()
timepoint_1_parent_country_frequency

[('England', 100),
 ('North America / USA', 17),
 ('Scotland', 8),
 ('Europe / Spain', 6),
 ('Europe / France', 5),
 ('Europe / Germany', 5),
 ('Europe / Netherlands', 5),
 ('Asia / China', 4),
 ('Asia / Japan', 4),
 ('Europe / Sweden', 2),
 ('Europe / Switzerland', 2),
 ('Wales', 2),
 ('Europe / Portugal', 1),
 ('Europe / Italy', 1),
 ('Europe / Belgium', 1),
 ('South America / Chile', 1),
 ('Asia / Bangladesh', 1),
 ('Oceania / Australia', 1),
 ('Oceania / New Zealand', 1),
 ('South America / Chile|Republic of Ireland|England', 1),
 ('Europe / Germany|Europe / Austria', 1),
 ('Europe / Denmark', 1)]

In [38]:
timepoint_1_parent_country_frequency_df = pd.DataFrame(timepoint_1_parent_country_frequency, columns=['country','Both'])
timepoint_1_parent_country_frequency_df

Unnamed: 0,country,Both
0,England,100
1,North America / USA,17
2,Scotland,8
3,Europe / Spain,6
4,Europe / France,5
5,Europe / Germany,5
6,Europe / Netherlands,5
7,Asia / China,4
8,Asia / Japan,4
9,Europe / Sweden,2


In [37]:
timepoint_1_parent_country_frequency_df = pd.merge(timepoint_1_parent_country_frequency_df, pd.DataFrame(timepoint_1_roi_parent_country_frequency, columns=['country','RoI']), how="left", on="country")
timepoint_1_parent_country_frequency_df = pd.merge(timepoint_1_parent_country_frequency_df, pd.DataFrame(timepoint_1_ni_parent_country_frequency, columns=['country','NI']), how="left", on="country")
timepoint_1_parent_country_frequency_df[['RoI', 'NI']] = timepoint_1_parent_country_frequency_df[['RoI', 'NI']].fillna(value=0).astype(int)

In [38]:
timepoint_1_parent_country_frequency_df['country'] = timepoint_1_parent_country_frequency_df.apply(remove_continent, axis=1)

In [40]:
timepoint_1_parent_country_frequency_df

Unnamed: 0,country,Both,RoI,NI
0,England,100,61,39
1,USA,17,12,5
2,Scotland,8,1,7
3,Spain,6,4,2
4,France,5,3,2
5,Germany,5,3,2
6,Netherlands,5,4,1
7,China,4,3,1
8,Japan,4,2,2
9,Sweden,2,1,1


In [41]:
timepoint_1_parent_country_frequency_df.to_csv('../outputFiles/timepoint_1_parent_country_frequency.csv', index=False)

In [42]:
num_country_tips = strainCountryTimepoint1.groupby('country').count().reset_index()
num_country_tips.rename(columns={'strain': 'num_tips'}, inplace=True)
num_country_tips

Unnamed: 0,country,num_tips
0,Africa / Algeria,3
1,Africa / Benin,12
2,Africa / Botswana,1
3,Africa / Burkina Faso,6
4,Africa / Cameroon,9
...,...,...
151,South America / Peru,361
152,South America / Suriname,3
153,South America / Uruguay,32
154,South America / Venezuela,8


In [43]:
num_country_tips['country'] = num_country_tips.apply(remove_continent, axis=1)
num_country_tips

Unnamed: 0,country,num_tips
0,Algeria,3
1,Benin,12
2,Botswana,1
3,Burkina Faso,6
4,Cameroon,9
...,...,...
151,Peru,361
152,Suriname,3
153,Uruguay,32
154,Venezuela,8


In [44]:
timepoint_1_parent_country_frequency_df = pd.merge(num_country_tips, timepoint_1_parent_country_frequency_df, how="left", on=['country']).fillna(value=0)
timepoint_1_parent_country_frequency_df

Unnamed: 0,country,num_tips,Both,RoI,NI
0,Algeria,3,0.0,0.0,0.0
1,Benin,12,0.0,0.0,0.0
2,Botswana,1,0.0,0.0,0.0
3,Burkina Faso,6,0.0,0.0,0.0
4,Cameroon,9,0.0,0.0,0.0
...,...,...,...,...,...
151,Peru,361,0.0,0.0,0.0
152,Suriname,3,0.0,0.0,0.0
153,Uruguay,32,0.0,0.0,0.0
154,Venezuela,8,0.0,0.0,0.0


In [45]:
timepoint_1_parent_country_frequency_df[~timepoint_1_parent_country_frequency_df['country'].isin(country_lat_long['country'])]

Unnamed: 0,country,num_tips,Both,RoI,NI
115,United Kingdom,1,0.0,0.0,0.0
134,Northern Ireland,633,0.0,0.0,0.0
139,Republic of Ireland,714,0.0,0.0,0.0


In [46]:
timepoint_1_parent_country_frequency_df = pd.merge(country_lat_long, timepoint_1_parent_country_frequency_df, how="inner", on=['country'])
timepoint_1_parent_country_frequency_df

Unnamed: 0,longitude,latitude,country,distance_from_ireland,num_tips,Both,RoI,NI
0,-90.312193,15.820879,Guatemala,8079.118206,16,0.0,0.0,0.0
1,-103.120439,23.643948,Mexico,8231.394213,823,0.0,0.0,0.0
2,-93.566635,62.365872,Canada,4765.573313,8352,0.0,0.0,0.0
3,-64.443152,-37.605645,Argentina,11458.955064,425,0.0,0.0,0.0
4,-81.255450,-34.328238,Chile,12001.453231,563,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...
148,35.000000,31.400000,Palestine,4213.049700,15,0.0,0.0,0.0
149,34.600000,45.250000,Crimea,3182.289574,1,0.0,0.0,0.0
150,-1.464854,52.561928,England,462.610475,15402,100.0,61.0,39.0
151,-4.183963,56.816738,Scotland,468.558618,3898,8.0,1.0,7.0


In [47]:
timepoint_1_parent_country_frequency_df.sort_values(by='country').tail(20)

Unnamed: 0,longitude,latitude,country,distance_from_ireland,num_tips,Both,RoI,NI
97,17.27729,62.465412,Sweden,1802.000528,1212,2.0,1.0,1.0
78,8.286929,46.736781,Switzerland,1384.606007,1388,2.0,1.0,1.0
144,120.982,23.973861,Taiwan,10132.293495,115,0.0,0.0,0.0
132,101.068606,13.596475,Thailand,10031.273226,278,0.0,0.0,0.0
13,-75.94757,23.971981,The Bahamas,6425.67683,4,0.0,0.0,0.0
126,125.668174,-8.79884,Timor-Leste,13583.781233,7,0.0,0.0,0.0
63,0.899086,8.660743,Togo,5032.243545,22,0.0,0.0,0.0
64,9.655876,34.086362,Tunisia,2561.328971,26,0.0,0.0,0.0
73,35.095572,39.090972,Turkey,3620.542031,296,0.0,0.0,0.0
30,-64.822372,17.997412,U.S. Virgin Islands,6213.87522,19,0.0,0.0,0.0


In [48]:
timepoint_1_parent_country_frequency_df.to_csv('../outputFiles/timepoint_1_parent_country_frequency_distance.csv', index=False)

In [49]:
timepoint1['into_ireland_nodes'][:5]

[{'node': 'node_1550',
  'country': 'Republic of Ireland',
  'parent_country': 'Asia / China'},
 {'node': 'node_146091',
  'country': 'Northern Ireland',
  'parent_country': 'England'},
 {'node': 'node_167432',
  'country': 'Republic of Ireland',
  'parent_country': 'North America / USA'},
 {'node': 'node_169047',
  'country': 'Northern Ireland',
  'parent_country': 'England'},
 {'node': 'node_169296',
  'country': 'Republic of Ireland',
  'parent_country': 'England'}]

In [50]:
timepoint_1_parent_country_frequency_df[timepoint_1_parent_country_frequency_df['Both'] > 0][['country','Both']].to_csv('../outputFiles/timepoint_1_country_frequency.csv', index=False)

In [51]:
def find_irish_clusters(node, list_to_append):
    for child in node.child_nodes():
        if child.taxon is not None:
            if 'Ireland' in str(child.annotations["country"]):
                list_to_append.append(str(child.taxon.label))
        if 'Ireland' in str(child.annotations["country"]):
            find_irish_clusters(child, list_to_append)

In [52]:
def find_clusters(node, list_to_append):
    for child in node.child_nodes():
        if child.taxon is not None:
            #if 'Ireland' in str(child.annotations["country"]):
            list_to_append.append(str(child.taxon.label))
        #if 'Ireland' in str(child.annotations["country"]):
        find_clusters(child, list_to_append)

In [53]:
cluster_num = 0
clusters_timepoint_a = []
for intro in tqdm(timepoint1['into_ireland_nodes']):
    cluster_num += 1
    descendant_tips = []
    find_irish_clusters(tree1.find_node_with_label(intro['node'].replace('_',' ')), descendant_tips)
    for tip in list(set(descendant_tips)):
        clusters_timepoint_a.append({'tip': tip.replace(' ','_'), 'cluster': cluster_num, 'timepoint': 'A'})

  0%|          | 0/170 [00:00<?, ?it/s]

In [54]:
clusters_timepoint_a_df = pd.DataFrame(clusters_timepoint_a)
clusters_timepoint_a_df

Unnamed: 0,tip,cluster,timepoint
0,EPI_ISL_671375,1,A
1,EPI_ISL_441714,2,A
2,EPI_ISL_644320,3,A
3,EPI_ISL_585106,4,A
4,EPI_ISL_11191573,5,A
...,...,...,...
1344,EPI_ISL_605065,169,A
1345,EPI_ISL_848117,170,A
1346,EPI_ISL_644284,170,A
1347,EPI_ISL_644341,170,A


In [55]:
len(clusters_timepoint_a_df['tip'].unique())

1347

In [56]:
clusters_timepoint_a_df.groupby('cluster').count().describe()

Unnamed: 0,tip,timepoint
count,170.0,170.0
mean,7.935294,7.935294
std,49.380319,49.380319
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,3.0,3.0
max,638.0,638.0


In [57]:
clusters_timepoint_a_df.to_csv('../outputFiles/clusters/clusters_timepoint_A.csv', index=False)

In [58]:
clusters_timepoint_a_df.groupby('cluster').count().sort_values(by='tip',ascending=False)

Unnamed: 0_level_0,tip,timepoint
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
67,638,638
165,65,65
162,54,54
77,45,45
102,31,31
...,...,...
68,1,1
65,1,1
64,1,1
63,1,1


In [59]:
clusters_timepoint_a_df.groupby('cluster').count().reset_index().rename(columns={'tip':'irish_samples'})[['cluster','irish_samples']].to_csv('../outputFiles/clusters/clusters_timepoint_A_size.csv', index=False)

In [60]:
clusters_timepoint_a_df[clusters_timepoint_a_df['cluster']==72]

Unnamed: 0,tip,cluster,timepoint
857,EPI_ISL_489324,72,A
858,EPI_ISL_501014,72,A
859,EPI_ISL_585107,72,A
860,EPI_ISL_448972,72,A


In [61]:
def is_parent_in_timepoint_list(node_id, tree, timepoint_list):
    node = tree.find_node_with_label(node_id)

    if node and node.parent_node:
        #print('up level')
        if str(node.parent_node.label) in list(map(lambda x: x['node'], timepoint_list)):
            parent_clusters.append(str(node.parent_node.label))
        is_parent_in_timepoint_list(node.parent_node, tree, timepoint_list)

In [62]:
for intro in tqdm(timepoint1['into_ireland_nodes']):
    parent_clusters = []
    is_parent_in_timepoint_list(intro['node'].replace('_',' '), tree1, timepoint1['into_ireland_nodes'])
    if len(parent_clusters) > 0:
        print(intro['node'])
        print(parent_clusters)

  0%|          | 0/170 [00:00<?, ?it/s]

In [63]:
cluster_num = 0
clusters_timepoint_a_full = []
for intro in tqdm(timepoint1['into_ireland_nodes']):
    cluster_num += 1
    descendant_tips = []
    find_clusters(tree1.find_node_with_label(intro['node'].replace('_',' ')), descendant_tips)
    for tip in list(set(descendant_tips)):
        clusters_timepoint_a_full.append({'tip': tip.replace(' ','_'), 'cluster': cluster_num, 'timepoint': 'A'})

  0%|          | 0/170 [00:00<?, ?it/s]

In [64]:
clusters_timepoint_a_full_df = pd.DataFrame(clusters_timepoint_a_full)
clusters_timepoint_a_full_df

Unnamed: 0,tip,cluster,timepoint
0,EPI_ISL_424261,1,A
1,EPI_ISL_463503,1,A
2,EPI_ISL_603780,1,A
3,EPI_ISL_468570,1,A
4,EPI_ISL_438194,1,A
...,...,...,...
324809,EPI_ISL_461828,170,A
324810,EPI_ISL_439410,170,A
324811,EPI_ISL_440177,170,A
324812,EPI_ISL_466783,170,A


In [65]:
len(clusters_timepoint_a_full_df['tip'].unique())

103315

In [66]:
clusters_timepoint_a_full_df.groupby('cluster').count().describe()

Unnamed: 0,tip,timepoint
count,170.0,170.0
mean,1910.670588,1910.670588
std,10930.661579,10930.661579
min,1.0,1.0
25%,4.0,4.0
50%,14.5,14.5
75%,128.5,128.5
max,103315.0,103315.0


In [67]:
clusters_timepoint_a_full_df['parent_cluster'] = pd.NA

In [68]:
cluster_pairs_overlap = []

for cluster_pair in combinations(clusters_timepoint_a_full_df['cluster'].unique().tolist(), 2):
    tip_list_1 = clusters_timepoint_a_full_df[clusters_timepoint_a_full_df['cluster'] == cluster_pair[0]]['tip'].tolist()
    tip_list_2 = clusters_timepoint_a_full_df[clusters_timepoint_a_full_df['cluster'] == cluster_pair[1]]['tip'].tolist()
    
    if len(list(set(tip_list_1 + tip_list_2))) != len(tip_list_1 + tip_list_2):
        #print(len(tip_list_1))
        #print(len(tip_list_2))
        #print(len(list(set(tip_list_1 + tip_list_2))))
        #print(len(tip_list_1 + tip_list_2))
        #print(cluster_pair)
        if len(tip_list_1) > len(tip_list_2):
            cluster_pairs_overlap.append({'parent': cluster_pair[0], 'child': cluster_pair[1]})
        elif len(tip_list_1) < len(tip_list_2):
            cluster_pairs_overlap.append({'parent': cluster_pair[1], 'child': cluster_pair[0]})
        else:
            print("cluster size the same")

In [69]:
cluster_pairs_overlap

[{'parent': 165, 'child': 1},
 {'parent': 131, 'child': 2},
 {'parent': 165, 'child': 2},
 {'parent': 3, 'child': 4},
 {'parent': 3, 'child': 5},
 {'parent': 3, 'child': 6},
 {'parent': 3, 'child': 7},
 {'parent': 3, 'child': 8},
 {'parent': 3, 'child': 9},
 {'parent': 3, 'child': 10},
 {'parent': 3, 'child': 11},
 {'parent': 3, 'child': 12},
 {'parent': 3, 'child': 13},
 {'parent': 3, 'child': 14},
 {'parent': 3, 'child': 15},
 {'parent': 3, 'child': 16},
 {'parent': 3, 'child': 17},
 {'parent': 3, 'child': 18},
 {'parent': 3, 'child': 19},
 {'parent': 3, 'child': 20},
 {'parent': 3, 'child': 21},
 {'parent': 3, 'child': 22},
 {'parent': 3, 'child': 23},
 {'parent': 3, 'child': 24},
 {'parent': 3, 'child': 25},
 {'parent': 3, 'child': 26},
 {'parent': 3, 'child': 27},
 {'parent': 3, 'child': 28},
 {'parent': 3, 'child': 29},
 {'parent': 3, 'child': 30},
 {'parent': 3, 'child': 31},
 {'parent': 3, 'child': 32},
 {'parent': 3, 'child': 33},
 {'parent': 3, 'child': 34},
 {'parent': 3, 'c

### Timepoint B

In [70]:
tree2 = dendropy.Tree.get(path="../outputFiles/timepoint2_all_pastml/named.tree_timepoint2.rooted.nwk", schema="newick")
tree2

<Tree object at 0x168f7f520>

In [71]:
strainCountryTimepoint2 = pd.read_csv('../outputFiles/timepoint2.strainCountry.all.txt')
strainCountryTimepoint2

Unnamed: 0,strain,country
0,EPI_ISL_402125,Asia / China
1,EPI_ISL_9388908,Asia / India
2,EPI_ISL_9388909,Asia / India
3,EPI_ISL_9388907,Asia / India
4,EPI_ISL_791809,North America / USA
...,...,...
31925,EPI_ISL_829314,Europe / Iceland
31926,EPI_ISL_645965,England
31927,EPI_ISL_645964,England
31928,EPI_ISL_1311504,Europe / Netherlands


In [72]:
strainCountryTimepoint2['strain'] = strainCountryTimepoint2['strain'].str.replace('_',' ')

In [73]:
strainCountryTimepoint2.describe()

Unnamed: 0,strain,country
count,31930,31930
unique,31930,51
top,EPI ISL 402125,England
freq,1,16351


In [74]:
strainCountryTimepoint2[strainCountryTimepoint2['country'].isin(['Republic of Ireland','Northern Ireland'])].groupby('country').count()

Unnamed: 0_level_0,strain
country,Unnamed: 1_level_1
Northern Ireland,296
Republic of Ireland,215


In [76]:
timepoint2 = process_tree(tree2)

In [77]:
len(timepoint2['into_ireland_nodes'])

62

In [78]:
len(list({v['node']:v for v in timepoint2['into_ireland_nodes']}.values()))

51

In [79]:
len(timepoint2['between_ireland_nodes'])

1

In [80]:
len(list({v['node']:v for v in timepoint2['between_ireland_nodes']}.values()))

1

In [81]:
timepoint2['between_ireland_nodes']

[{'node': 'node 105046',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"}]

In [82]:
timepoint2['into_ireland_nodes'] = list({v['node']:v for v in timepoint2['into_ireland_nodes']}.values())
timepoint2['into_ireland_nodes']

[{'node': 'node 122113',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 126355',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 126391',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 126231',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 127087',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 127155',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 108178',
  'parent_country': "country='Wales'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 108015',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 107837',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'

In [83]:
timepoint2['into_ireland_nodes'] = list(map(lambda x: {'node': x['node'].replace(' ','_'), 'country': x['country'][9:-1], 'parent_country': x['parent_country'][9:-1]}, timepoint2['into_ireland_nodes']))

In [84]:
timepoint2['into_ireland_nodes_without_ambiguous'] = list(filter(lambda x: "|" not in x['parent_country'], timepoint2['into_ireland_nodes']))

In [85]:
len(timepoint2['into_ireland_nodes_without_ambiguous'])

50

In [86]:
timepoint_2_country_counter = collections.Counter(list(map(lambda x: x['country'], timepoint2['into_ireland_nodes'])))
timepoint_2_country_frequency = timepoint_2_country_counter.most_common()
timepoint_2_country_frequency

[('Northern Ireland', 37), ('Republic of Ireland', 14)]

In [87]:
timepoint_2_roi_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Republic of Ireland', timepoint2['into_ireland_nodes']))))
timepoint_2_roi_parent_country_frequency = timepoint_2_roi_parent_country_counter.most_common()
timepoint_2_roi_parent_country_frequency

[('England', 9),
 ('Scotland', 3),
 ('Europe / Latvia|Europe / Norway|Europe / Lithuania|Europe / Iceland', 1),
 ('Europe / Spain', 1)]

In [88]:
timepoint_2_ni_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Northern Ireland', timepoint2['into_ireland_nodes']))))
timepoint_2_ni_parent_country_frequency = timepoint_2_ni_parent_country_counter.most_common()
timepoint_2_ni_parent_country_frequency

[('England', 33), ('Wales', 2), ('Scotland', 1), ('Europe / Spain', 1)]

In [89]:
timepoint_2_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], timepoint2['into_ireland_nodes'])))
timepoint_2_parent_country_frequency = timepoint_2_parent_country_counter.most_common()
timepoint_2_parent_country_frequency

[('England', 42),
 ('Scotland', 4),
 ('Wales', 2),
 ('Europe / Spain', 2),
 ('Europe / Latvia|Europe / Norway|Europe / Lithuania|Europe / Iceland', 1)]

In [90]:
timepoint_2_parent_country_frequency_df = pd.DataFrame(timepoint_2_parent_country_frequency, columns=['country','Both'])
timepoint_2_parent_country_frequency_df

Unnamed: 0,country,Both
0,England,42
1,Scotland,4
2,Wales,2
3,Europe / Spain,2
4,Europe / Latvia|Europe / Norway|Europe / Lithu...,1


In [91]:
timepoint_2_parent_country_frequency_df = pd.merge(timepoint_2_parent_country_frequency_df, pd.DataFrame(timepoint_2_roi_parent_country_frequency, columns=['country','RoI']), how="left", on="country")
timepoint_2_parent_country_frequency_df = pd.merge(timepoint_2_parent_country_frequency_df, pd.DataFrame(timepoint_2_ni_parent_country_frequency, columns=['country','NI']), how="left", on="country")
timepoint_2_parent_country_frequency_df[['RoI', 'NI']] = timepoint_2_parent_country_frequency_df[['RoI', 'NI']].fillna(value=0).astype(int)

In [92]:
timepoint_2_parent_country_frequency_df['country'] = timepoint_2_parent_country_frequency_df.apply(remove_continent, axis=1)

In [93]:
timepoint_2_parent_country_frequency_df

Unnamed: 0,country,Both,RoI,NI
0,England,42,9,33
1,Scotland,4,3,1
2,Wales,2,0,2
3,Spain,2,1,1
4,Europe / Latvia|Europe / Norway|Europe / Lithu...,1,1,0


In [94]:
timepoint_2_parent_country_frequency_df.to_csv('../outputFiles/timepoint_2_parent_country_frequency.csv', index=False)

In [95]:
num_country_tipsTimepoint2 = strainCountryTimepoint2.groupby('country').count().reset_index()
num_country_tipsTimepoint2.rename(columns={'strain': 'num_tips'}, inplace=True)
num_country_tipsTimepoint2

Unnamed: 0,country,num_tips
0,Africa / Cameroon,2
1,Africa / Senegal,4
2,Africa / Tunisia,11
3,Asia / China,1
4,Asia / Hong Kong,12
5,Asia / India,3
6,Asia / Japan,6
7,Asia / Malaysia,1
8,Asia / Singapore,5
9,Asia / South Korea,11


In [96]:
num_country_tipsTimepoint2['country'] = num_country_tipsTimepoint2.apply(remove_continent, axis=1)
num_country_tipsTimepoint2

Unnamed: 0,country,num_tips
0,Cameroon,2
1,Senegal,4
2,Tunisia,11
3,China,1
4,Hong Kong,12
5,India,3
6,Japan,6
7,Malaysia,1
8,Singapore,5
9,South Korea,11


In [97]:
timepoint_2_parent_country_frequency_df = pd.merge(num_country_tipsTimepoint2, timepoint_2_parent_country_frequency_df, how="left", on=['country']).fillna(value=0)
timepoint_2_parent_country_frequency_df

Unnamed: 0,country,num_tips,Both,RoI,NI
0,Cameroon,2,0.0,0.0,0.0
1,Senegal,4,0.0,0.0,0.0
2,Tunisia,11,0.0,0.0,0.0
3,China,1,0.0,0.0,0.0
4,Hong Kong,12,0.0,0.0,0.0
5,India,3,0.0,0.0,0.0
6,Japan,6,0.0,0.0,0.0
7,Malaysia,1,0.0,0.0,0.0
8,Singapore,5,0.0,0.0,0.0
9,South Korea,11,0.0,0.0,0.0


In [98]:
timepoint_2_parent_country_frequency_df[~timepoint_2_parent_country_frequency_df['country'].isin(country_lat_long['country'])]

Unnamed: 0,country,num_tips,Both,RoI,NI
43,Northern Ireland,296,0.0,0.0,0.0
47,Republic of Ireland,215,0.0,0.0,0.0


In [99]:
timepoint_2_parent_country_frequency_df = pd.merge(country_lat_long, timepoint_2_parent_country_frequency_df, how="inner", on=['country'])
timepoint_2_parent_country_frequency_df

Unnamed: 0,longitude,latitude,country,distance_from_ireland,num_tips,Both,RoI,NI
0,-93.566635,62.365872,Canada,4765.573313,17,0.0,0.0,0.0
1,-74.114162,-8.522718,Peru,9222.655323,1,0.0,0.0,0.0
2,-64.74585,32.315067,Bermuda,5000.562477,4,0.0,0.0,0.0
3,-80.175902,8.422312,Panama,8060.228802,1,0.0,0.0,0.0
4,-5.345549,36.140227,Gibraltar,1922.179956,143,0.0,0.0,0.0
5,-14.610875,14.228861,Senegal,4381.010193,4,0.0,0.0,0.0
6,-19.05683,65.123609,Iceland,1445.97914,1772,0.0,0.0,0.0
7,56.866654,-43.657479,New Zealand,12433.900405,12,0.0,0.0,0.0
8,12.948474,6.294168,Cameroon,5569.428751,2,0.0,0.0,0.0
9,9.655876,34.086362,Tunisia,2561.328971,11,0.0,0.0,0.0


In [100]:
timepoint_2_parent_country_frequency_df.to_csv('../outputFiles/timepoint_2_parent_country_frequency_distance.csv', index=False)

In [101]:
timepoint_2_parent_country_frequency_df[timepoint_2_parent_country_frequency_df['Both'] > 0][['country','Both']].to_csv('../outputFiles/timepoint_2_country_frequency.csv', index=False)

In [102]:
for intro in tqdm(timepoint2['into_ireland_nodes']):
    parent_clusters = []
    is_parent_in_timepoint_list(intro['node'].replace('_',' '), tree2, timepoint2['into_ireland_nodes'])
    if len(parent_clusters) > 0:
        print(intro['node'])
        print(parent_clusters)

  0%|          | 0/51 [00:00<?, ?it/s]

In [103]:
cluster_num = 0
clusters_timepoint_b = []
for intro in tqdm(timepoint2['into_ireland_nodes']):
    cluster_num += 1
    descendant_tips = []
    find_irish_clusters(tree2.find_node_with_label(intro['node'].replace('_',' ')), descendant_tips)
    for tip in list(set(descendant_tips)):
        clusters_timepoint_b.append({'tip': tip.replace(' ','_'), 'cluster': cluster_num, 'timepoint': 'B'})

  0%|          | 0/51 [00:00<?, ?it/s]

In [104]:
clusters_timepoint_b_df = pd.DataFrame(clusters_timepoint_b)
clusters_timepoint_b_df

Unnamed: 0,tip,cluster,timepoint
0,EPI_ISL_573760,1,B
1,EPI_ISL_639881,2,B
2,EPI_ISL_764406,3,B
3,EPI_ISL_680227,4,B
4,EPI_ISL_680244,4,B
...,...,...,...
506,EPI_ISL_531194,50,B
507,EPI_ISL_531532,50,B
508,EPI_ISL_532897,51,B
509,EPI_ISL_532643,51,B


In [105]:
len(clusters_timepoint_b_df['tip'].unique())

511

In [106]:
clusters_timepoint_b_df.groupby('cluster').count().describe()

Unnamed: 0,tip,timepoint
count,51.0,51.0
mean,10.019608,10.019608
std,26.126225,26.126225
min,1.0,1.0
25%,1.0,1.0
50%,2.0,2.0
75%,5.0,5.0
max,130.0,130.0


In [107]:
clusters_timepoint_b_df.to_csv('../outputFiles/clusters/clusters_timepoint_B.csv', index=False)

In [108]:
clusters_timepoint_b_df.groupby('cluster').count().sort_values(by='tip',ascending=False)

Unnamed: 0_level_0,tip,timepoint
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
39,130,130
36,128,128
37,64,64
31,21,21
9,21,21
15,14,14
48,13,13
40,12,12
12,10,10
32,9,9


In [109]:
clusters_timepoint_b_df.groupby('cluster').count().reset_index().rename(columns={'tip':'irish_samples'})[['cluster','irish_samples']].to_csv('../outputFiles/clusters/clusters_timepoint_B_size.csv', index=False)

### Timepoint C

In [110]:
tree3 = dendropy.Tree.get(path="../outputFiles/timepoint3_all_pastml/named.tree_timepoint3.rooted.nwk", schema="newick")
tree3

<Tree object at 0x173de7eb0>

In [111]:
strainCountryTimepoint3 = pd.read_csv('../outputFiles/timepoint3.strainCountry.all.txt')
strainCountryTimepoint3

Unnamed: 0,strain,country
0,EPI_ISL_402125,Asia / China
1,EPI_ISL_606424,England
2,EPI_ISL_606375,England
3,EPI_ISL_629120,England
4,EPI_ISL_629311,England
...,...,...
184321,EPI_ISL_1205970,England
184322,EPI_ISL_1206020,England
184323,EPI_ISL_1206022,England
184324,EPI_ISL_1206024,England


In [112]:
strainCountryTimepoint3['strain'] = strainCountryTimepoint3['strain'].str.replace('_',' ')

In [113]:
strainCountryTimepoint3.describe()

Unnamed: 0,strain,country
count,184326,184326
unique,184326,137
top,EPI ISL 402125,England
freq,1,97587


In [114]:
strainCountryTimepoint3[strainCountryTimepoint3['country'].isin(['Republic of Ireland','Northern Ireland'])].groupby('country').count()

Unnamed: 0_level_0,strain
country,Unnamed: 1_level_1
Northern Ireland,1695
Republic of Ireland,2850


In [115]:
timepoint3 = process_tree(tree3)

In [116]:
len(timepoint3['into_ireland_nodes'])

278

In [117]:
len(list({v['node']:v for v in timepoint3['into_ireland_nodes']}.values()))

251

In [118]:
len(timepoint3['between_ireland_nodes'])

45

In [119]:
len(list({v['node']:v for v in timepoint3['between_ireland_nodes']}.values()))

45

In [120]:
timepoint3['between_ireland_nodes']

[{'node': 'node 194596',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 234097',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 248580',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 269304',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 269206',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 270788',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 238024',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 239536',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 240448',

In [121]:
timepoint_3_between_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], timepoint3['between_ireland_nodes'])))
timepoint_3_between_country_frequency = timepoint_3_between_country_counter.most_common()
timepoint_3_between_country_frequency

[("country='Republic of Ireland'", 23), ("country='Northern Ireland'", 22)]

In [122]:
timepoint3['into_ireland_nodes'] = list({v['node']:v for v in timepoint3['into_ireland_nodes']}.values())
timepoint3['into_ireland_nodes']

[{'node': 'node 182523',
  'parent_country': "country='Europe / Turkey'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 183623',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 185011',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 187858',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 189547',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 192369',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 188909',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 192502',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 188026',
  'parent_country': "country='England'",
  'country': "country='Republic of Ir

In [123]:
timepoint3['into_ireland_nodes'] = list(map(lambda x: {'node': x['node'].replace(' ','_'), 'country': x['country'][9:-1], 'parent_country': x['parent_country'][9:-1]}, timepoint3['into_ireland_nodes']))

In [124]:
timepoint3['into_ireland_nodes_without_ambiguous'] = list(filter(lambda x: "|" not in x['parent_country'], timepoint3['into_ireland_nodes']))

In [125]:
len(timepoint3['into_ireland_nodes_without_ambiguous'])

249

In [126]:
timepoint_3_country_counter = collections.Counter(list(map(lambda x: x['country'], timepoint3['into_ireland_nodes'])))
timepoint_3_country_frequency = timepoint_3_country_counter.most_common()
timepoint_3_country_frequency

[('Republic of Ireland', 169), ('Northern Ireland', 82)]

In [127]:
timepoint_3_roi_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Republic of Ireland', timepoint3['into_ireland_nodes']))))
timepoint_3_roi_parent_country_frequency = timepoint_3_roi_parent_country_counter.most_common()
timepoint_3_roi_parent_country_frequency

[('England', 133),
 ('Europe / Germany', 6),
 ('North America / USA', 5),
 ('Europe / France', 4),
 ('Europe / Sweden', 4),
 ('Europe / Poland', 3),
 ('Europe / Switzerland', 3),
 ('Europe / Spain', 2),
 ('Scotland', 2),
 ('Europe / Turkey', 1),
 ('Europe / Belgium', 1),
 ('Asia / India', 1),
 ('Europe / Croatia', 1),
 ('Europe / Italy', 1),
 ('Europe / Portugal', 1),
 ('Asia / Iraq', 1)]

In [128]:
timepoint_3_ni_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Northern Ireland', timepoint3['into_ireland_nodes']))))
timepoint_3_ni_parent_country_frequency = timepoint_3_ni_parent_country_counter.most_common()
timepoint_3_ni_parent_country_frequency

[('England', 73),
 ('Europe / Germany', 3),
 ('Europe / Poland', 1),
 ('Europe / Bulgaria', 1),
 ('North America / USA', 1),
 ('Asia / United Arab Emirates|North America / Canada', 1),
 ('Europe / Germany|England', 1),
 ('Scotland', 1)]

In [129]:
timepoint_3_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], timepoint3['into_ireland_nodes'])))
timepoint_3_parent_country_frequency = timepoint_3_parent_country_counter.most_common()
timepoint_3_parent_country_frequency

[('England', 206),
 ('Europe / Germany', 9),
 ('North America / USA', 6),
 ('Europe / Poland', 4),
 ('Europe / France', 4),
 ('Europe / Sweden', 4),
 ('Europe / Switzerland', 3),
 ('Scotland', 3),
 ('Europe / Spain', 2),
 ('Europe / Turkey', 1),
 ('Europe / Belgium', 1),
 ('Asia / India', 1),
 ('Europe / Bulgaria', 1),
 ('Europe / Croatia', 1),
 ('Europe / Italy', 1),
 ('Europe / Portugal', 1),
 ('Asia / United Arab Emirates|North America / Canada', 1),
 ('Europe / Germany|England', 1),
 ('Asia / Iraq', 1)]

In [130]:
timepoint_3_parent_country_frequency_df = pd.DataFrame(timepoint_3_parent_country_frequency, columns=['country','Both'])
timepoint_3_parent_country_frequency_df

Unnamed: 0,country,Both
0,England,206
1,Europe / Germany,9
2,North America / USA,6
3,Europe / Poland,4
4,Europe / France,4
5,Europe / Sweden,4
6,Europe / Switzerland,3
7,Scotland,3
8,Europe / Spain,2
9,Europe / Turkey,1


In [131]:
timepoint_3_parent_country_frequency_df = pd.merge(timepoint_3_parent_country_frequency_df, pd.DataFrame(timepoint_3_roi_parent_country_frequency, columns=['country','RoI']), how="left", on="country")
timepoint_3_parent_country_frequency_df = pd.merge(timepoint_3_parent_country_frequency_df, pd.DataFrame(timepoint_3_ni_parent_country_frequency, columns=['country','NI']), how="left", on="country")
timepoint_3_parent_country_frequency_df[['RoI', 'NI']] = timepoint_3_parent_country_frequency_df[['RoI', 'NI']].fillna(value=0).astype(int)

In [132]:
timepoint_3_parent_country_frequency_df['country'] = timepoint_3_parent_country_frequency_df.apply(remove_continent, axis=1)

In [133]:
timepoint_3_parent_country_frequency_df

Unnamed: 0,country,Both,RoI,NI
0,England,206,133,73
1,Germany,9,6,3
2,USA,6,5,1
3,Poland,4,3,1
4,France,4,4,0
5,Sweden,4,4,0
6,Switzerland,3,3,0
7,Scotland,3,2,1
8,Spain,2,2,0
9,Turkey,1,1,0


In [134]:
timepoint_3_parent_country_frequency_df.to_csv('../outputFiles/timepoint_3_parent_country_frequency.csv', index=False)

In [135]:
num_country_tipsTimepoint3 = strainCountryTimepoint3.groupby('country').count().reset_index()
num_country_tipsTimepoint3.rename(columns={'strain': 'num_tips'}, inplace=True)
num_country_tipsTimepoint3

Unnamed: 0,country,num_tips
0,Africa / Angola,8
1,Africa / Benin,34
2,Africa / Burkina Faso,1
3,Africa / Cabo Verde,3
4,Africa / Central African Republic,3
...,...,...
132,South America / French Guiana,2
133,South America / Paraguay,1
134,South America / Peru,3
135,South America / Suriname,1


In [136]:
num_country_tipsTimepoint3['country'] = num_country_tipsTimepoint3.apply(remove_continent, axis=1)
num_country_tipsTimepoint3

Unnamed: 0,country,num_tips
0,Angola,8
1,Benin,34
2,Burkina Faso,1
3,Cabo Verde,3
4,Central African Republic,3
...,...,...
132,French Guiana,2
133,Paraguay,1
134,Peru,3
135,Suriname,1


In [137]:
timepoint_3_parent_country_frequency_df = pd.merge(num_country_tipsTimepoint3, timepoint_3_parent_country_frequency_df, how="left", on=['country']).fillna(value=0)
timepoint_3_parent_country_frequency_df

Unnamed: 0,country,num_tips,Both,RoI,NI
0,Angola,8,0.0,0.0,0.0
1,Benin,34,0.0,0.0,0.0
2,Burkina Faso,1,0.0,0.0,0.0
3,Cabo Verde,3,0.0,0.0,0.0
4,Central African Republic,3,0.0,0.0,0.0
...,...,...,...,...,...
132,French Guiana,2,0.0,0.0,0.0
133,Paraguay,1,0.0,0.0,0.0
134,Peru,3,0.0,0.0,0.0
135,Suriname,1,0.0,0.0,0.0


In [138]:
timepoint_3_parent_country_frequency_df[~timepoint_3_parent_country_frequency_df['country'].isin(country_lat_long['country'])]

Unnamed: 0,country,num_tips,Both,RoI,NI
118,Northern Ireland,1695,0.0,0.0,0.0
123,Republic of Ireland,2850,0.0,0.0,0.0


In [139]:
timepoint_3_parent_country_frequency_df = pd.merge(country_lat_long, timepoint_3_parent_country_frequency_df, how="inner", on=['country'])
timepoint_3_parent_country_frequency_df

Unnamed: 0,longitude,latitude,country,distance_from_ireland,num_tips,Both,RoI,NI
0,-103.120439,23.643948,Mexico,8231.394213,21,0.0,0.0,0.0
1,-93.566635,62.365872,Canada,4765.573313,3249,0.0,0.0,0.0
2,-64.443152,-37.605645,Argentina,11458.955064,5,0.0,0.0,0.0
3,-81.255450,-34.328238,Chile,12001.453231,17,0.0,0.0,0.0
4,-82.338976,-1.298065,Ecuador,9076.012306,11,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
130,35.000000,31.400000,Palestine,4213.049700,9,0.0,0.0,0.0
131,-1.464854,52.561928,England,462.610475,97587,206.0,133.0,73.0
132,-4.183963,56.816738,Scotland,468.558618,6242,3.0,2.0,1.0
133,-3.766409,52.330220,Wales,320.735659,5863,0.0,0.0,0.0


In [140]:
timepoint_3_parent_country_frequency_df.to_csv('../outputFiles/timepoint_3_parent_country_frequency_distance.csv', index=False)

In [141]:
timepoint_3_parent_country_frequency_df[timepoint_3_parent_country_frequency_df['Both'] > 0][['country','Both']].to_csv('../outputFiles/timepoint_3_country_frequency.csv', index=False)

In [142]:
for intro in tqdm(timepoint3['into_ireland_nodes']):
    parent_clusters = []
    is_parent_in_timepoint_list(intro['node'].replace('_',' '), tree3, timepoint3['into_ireland_nodes'])
    if len(parent_clusters) > 0:
        print(intro['node'])
        print(parent_clusters)

  0%|          | 0/251 [00:00<?, ?it/s]

In [143]:
cluster_num = 0
clusters_timepoint_c = []
for intro in tqdm(timepoint3['into_ireland_nodes']):
    cluster_num += 1
    descendant_tips = []
    find_irish_clusters(tree3.find_node_with_label(intro['node'].replace('_',' ')), descendant_tips)
    for tip in list(set(descendant_tips)):
        clusters_timepoint_c.append({'tip': tip.replace(' ','_'), 'cluster': cluster_num, 'timepoint': 'C'})

  0%|          | 0/251 [00:00<?, ?it/s]

In [144]:
clusters_timepoint_c_df = pd.DataFrame(clusters_timepoint_c)
clusters_timepoint_c_df

Unnamed: 0,tip,cluster,timepoint
0,EPI_ISL_909909,1,C
1,EPI_ISL_1092976,1,C
2,EPI_ISL_1190098,2,C
3,EPI_ISL_1054522,2,C
4,EPI_ISL_1264986,2,C
...,...,...,...
4540,EPI_ISL_1190249,249,C
4541,EPI_ISL_1051891,249,C
4542,EPI_ISL_1115719,249,C
4543,EPI_ISL_1092887,250,C


In [145]:
len(clusters_timepoint_c_df['tip'].unique())

4545

In [146]:
clusters_timepoint_c_df.groupby('cluster').count().describe()

Unnamed: 0,tip,timepoint
count,251.0,251.0
mean,18.10757,18.10757
std,58.920967,58.920967
min,1.0,1.0
25%,1.0,1.0
50%,2.0,2.0
75%,10.0,10.0
max,527.0,527.0


In [147]:
clusters_timepoint_c_df.to_csv('../outputFiles/clusters/clusters_timepoint_C.csv', index=False)

In [148]:
clusters_timepoint_c_df.groupby('cluster').count().sort_values(by='tip',ascending=False)

Unnamed: 0_level_0,tip,timepoint
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
156,527,527
157,465,465
142,377,377
83,314,314
108,192,192
...,...,...
93,1,1
94,1,1
174,1,1
173,1,1


In [149]:
clusters_timepoint_c_df.groupby('cluster').count().reset_index().rename(columns={'tip':'irish_samples'})[['cluster','irish_samples']].to_csv('../outputFiles/clusters/clusters_timepoint_C_size.csv', index=False)

### Timepoint D

In [150]:
tree4 = dendropy.Tree.get(path="../outputFiles/timepoint4_all_pastml/named.tree_timepoint4.rooted.nwk", schema="newick")
tree4

<Tree object at 0x173de72e0>

In [151]:
strainCountryTimepoint4 = pd.read_csv('../outputFiles/timepoint4.strainCountry.all.txt')
strainCountryTimepoint4

Unnamed: 0,strain,country
0,EPI_ISL_402125,Asia / China
1,EPI_ISL_6949234,North America / USA
2,EPI_ISL_5387777,North America / USA
3,EPI_ISL_6948687,North America / USA
4,EPI_ISL_6948551,North America / USA
...,...,...
513949,EPI_ISL_3283578,England
513950,EPI_ISL_3283579,England
513951,EPI_ISL_3283580,England
513952,EPI_ISL_3283565,England


In [152]:
strainCountryTimepoint4['strain'] = strainCountryTimepoint4['strain'].str.replace('_',' ')

In [153]:
strainCountryTimepoint4.describe()

Unnamed: 0,strain,country
count,513954,513954
unique,513954,162
top,EPI ISL 402125,England
freq,1,152100


In [154]:
strainCountryTimepoint4[strainCountryTimepoint4['country'].isin(['Republic of Ireland','Northern Ireland'])].groupby('country').count()

Unnamed: 0_level_0,strain
country,Unnamed: 1_level_1
Northern Ireland,3147
Republic of Ireland,4923


In [217]:
into_ireland_nodesTimepoint4 = []
between_ireland_nodesTimepoint4 = []
roi_countTimepoint4 = 0
ni_countTimepoint4 = 0
for strain in tqdm(strainCountryTimepoint4[strainCountryTimepoint4['country'].isin(['Republic of Ireland','Northern Ireland'])]['strain'].to_list()):
    node = tree4.find_node_with_taxon_label(strain)
    
    if node:
        if str(node.annotations["country"]) == "country='Republic of Ireland'":
            roi_countTimepoint4 += 1
        elif str(node.annotations["country"]) == "country='Northern Ireland'":
            ni_countTimepoint4 += 1
        
        if str(node.parent_node.annotations["country"]) != "country='Republic of Ireland'" and str(node.parent_node.annotations["country"]) != "country='Northern Ireland'":
            into_ireland_nodesTimepoint4.append({'node': str(node.parent_node.label), 'parent_country': str(node.parent_node.annotations["country"]), 'country': str(node.annotations["country"])})
        elif str(node.parent_node.annotations["country"]) == "country='Republic of Ireland'" and str(node.annotations["country"]) == "country='Northern Ireland'":
            between_ireland_nodesTimepoint4.append({'node': str(node.parent_node.label), 'parent_country': str(node.parent_node.annotations["country"]), 'country': str(node.annotations["country"])})
        elif str(node.annotations["country"]) == "country='Republic of Ireland'" and str(node.parent_node.annotations["country"]) == "country='Northern Ireland'":
            between_ireland_nodesTimepoint4.append({'node': str(node.parent_node.label), 'parent_country': str(node.parent_node.annotations["country"]), 'country': str(node.annotations["country"])})

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8070/8070 [17:58:56<00:00,  8.02s/it]


In [155]:
timepoint4 = process_tree(tree4)

In [156]:
len(timepoint4['into_ireland_nodes'])

1257

In [157]:
len(list({v['node']:v for v in timepoint4['into_ireland_nodes']}.values()))

1132

In [158]:
len(timepoint4['between_ireland_nodes'])

104

In [159]:
len(list({v['node']:v for v in timepoint4['between_ireland_nodes']}.values()))

104

In [160]:
timepoint4['between_ireland_nodes']

[{'node': 'node 823559',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 829236',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 829239',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 829241',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 829230',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 829909',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 829953',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 835927',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 876781',

In [161]:
timepoint_4_between_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], timepoint4['between_ireland_nodes'])))
timepoint_4_between_country_frequency = timepoint_4_between_country_counter.most_common()
timepoint_4_between_country_frequency

[("country='Republic of Ireland'", 52), ("country='Northern Ireland'", 52)]

In [162]:
timepoint4['into_ireland_nodes'] = list({v['node']:v for v in timepoint4['into_ireland_nodes']}.values())
timepoint4['into_ireland_nodes']

[{'node': 'node 823471',
  'parent_country': "country='Asia / India'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 823563',
  'parent_country': "country='Asia / India'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 823229',
  'parent_country': "country='Asia / India'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 826014',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 823767',
  'parent_country': "country='Asia / India'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 823768',
  'parent_country': "country='North America / USA'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 828485',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 828543',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 828570',
  'parent_country': "country='England'",
  'country

In [163]:
timepoint4['into_ireland_nodes'] = list(map(lambda x: {'node': x['node'].replace(' ','_'), 'country': x['country'][9:-1], 'parent_country': x['parent_country'][9:-1]}, timepoint4['into_ireland_nodes']))

In [164]:
timepoint4['into_ireland_nodes_without_ambiguous'] = list(filter(lambda x: "|" not in x['parent_country'], timepoint4['into_ireland_nodes']))

In [165]:
len(timepoint4['into_ireland_nodes_without_ambiguous'])

1119

In [166]:
timepoint_4_country_counter = collections.Counter(list(map(lambda x: x['country'], timepoint4['into_ireland_nodes'])))
timepoint_4_country_frequency = timepoint_4_country_counter.most_common()
timepoint_4_country_frequency

[('Northern Ireland', 709), ('Republic of Ireland', 423)]

In [167]:
timepoint_4_roi_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Republic of Ireland', timepoint4['into_ireland_nodes']))))
timepoint_4_roi_parent_country_frequency = timepoint_4_roi_parent_country_counter.most_common()
timepoint_4_roi_parent_country_frequency

[('England', 223),
 ('Asia / India', 33),
 ('Europe / Spain', 23),
 ('Scotland', 19),
 ('Europe / France', 13),
 ('North America / USA', 12),
 ('Europe / Netherlands', 11),
 ('Europe / Sweden', 10),
 ('Europe / Denmark', 10),
 ('Europe / Germany', 8),
 ('Europe / Greece', 6),
 ('Europe / Italy', 6),
 ('Europe / Russia', 5),
 ('Europe / Switzerland', 4),
 ('Republic of Ireland|England', 4),
 ('Wales', 3),
 ('Europe / Belgium', 3),
 ('Africa / Nigeria', 3),
 ('Europe / Turkey', 2),
 ('Europe / Portugal', 2),
 ('Europe / Iceland', 2),
 ('Africa / Kenya', 1),
 ('Europe / Lithuania', 1),
 ('Europe / Belgium|Europe / Italy', 1),
 ('Asia / Hong Kong', 1),
 ('Oceania / Australia', 1),
 ('Europe / Denmark|Europe / Netherlands|Republic of Ireland', 1),
 ('Europe / Belgium|Europe / France', 1),
 ('Europe / Denmark|Europe / Italy', 1),
 ('Asia / Japan', 1),
 ('Africa / Uganda', 1),
 ('Africa / Namibia', 1),
 ('Asia / Kyrgyzstan', 1),
 ('Europe / Croatia', 1),
 ('Asia / Israel', 1),
 ('Africa / Zim

In [168]:
timepoint_4_ni_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Northern Ireland', timepoint4['into_ireland_nodes']))))
timepoint_4_ni_parent_country_frequency = timepoint_4_ni_parent_country_counter.most_common()
timepoint_4_ni_parent_country_frequency

[('England', 503),
 ('Scotland', 31),
 ('Europe / Spain', 28),
 ('Europe / France', 21),
 ('Europe / Denmark', 18),
 ('Europe / Netherlands', 17),
 ('Europe / Sweden', 14),
 ('Europe / Germany', 13),
 ('Wales', 12),
 ('Asia / India', 9),
 ('Europe / Russia', 6),
 ('Europe / Portugal', 6),
 ('Europe / Switzerland', 4),
 ('Europe / Italy', 4),
 ('Africa / Nigeria', 3),
 ('North America / USA', 2),
 ('Europe / Greece', 2),
 ('Asia / South Korea', 2),
 ('Africa / Ghana', 2),
 ('North America / Mexico', 2),
 ('Europe / Belgium', 1),
 ('Asia / Indonesia', 1),
 ('Africa / Uganda', 1),
 ('Europe / Croatia', 1),
 ('Europe / Austria', 1),
 ('Wales|Northern Ireland|England', 1),
 ('Europe / Norway', 1),
 ('Asia / Israel', 1),
 ('Europe / France|Northern Ireland', 1),
 ('Europe / Slovenia', 1)]

In [169]:
timepoint_4_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], timepoint4['into_ireland_nodes'])))
timepoint_4_parent_country_frequency = timepoint_4_parent_country_counter.most_common()
timepoint_4_parent_country_frequency

[('England', 726),
 ('Europe / Spain', 51),
 ('Scotland', 50),
 ('Asia / India', 42),
 ('Europe / France', 34),
 ('Europe / Netherlands', 28),
 ('Europe / Denmark', 28),
 ('Europe / Sweden', 24),
 ('Europe / Germany', 21),
 ('Wales', 15),
 ('North America / USA', 14),
 ('Europe / Russia', 11),
 ('Europe / Italy', 10),
 ('Europe / Greece', 8),
 ('Europe / Switzerland', 8),
 ('Europe / Portugal', 8),
 ('Africa / Nigeria', 6),
 ('Europe / Belgium', 4),
 ('Republic of Ireland|England', 4),
 ('Europe / Turkey', 2),
 ('Africa / Uganda', 2),
 ('Europe / Croatia', 2),
 ('Asia / South Korea', 2),
 ('Asia / Israel', 2),
 ('Africa / Ghana', 2),
 ('North America / Mexico', 2),
 ('Europe / Iceland', 2),
 ('Africa / Kenya', 1),
 ('Europe / Lithuania', 1),
 ('Europe / Belgium|Europe / Italy', 1),
 ('Asia / Hong Kong', 1),
 ('Oceania / Australia', 1),
 ('Europe / Denmark|Europe / Netherlands|Republic of Ireland', 1),
 ('Europe / Belgium|Europe / France', 1),
 ('Europe / Denmark|Europe / Italy', 1),
 (

In [170]:
timepoint_4_parent_country_frequency_df = pd.DataFrame(timepoint_4_parent_country_frequency, columns=['country','Both'])
timepoint_4_parent_country_frequency_df

Unnamed: 0,country,Both
0,England,726
1,Europe / Spain,51
2,Scotland,50
3,Asia / India,42
4,Europe / France,34
5,Europe / Netherlands,28
6,Europe / Denmark,28
7,Europe / Sweden,24
8,Europe / Germany,21
9,Wales,15


In [171]:
timepoint_4_parent_country_frequency_df = pd.merge(timepoint_4_parent_country_frequency_df, pd.DataFrame(timepoint_4_roi_parent_country_frequency, columns=['country','RoI']), how="left", on="country")
timepoint_4_parent_country_frequency_df = pd.merge(timepoint_4_parent_country_frequency_df, pd.DataFrame(timepoint_4_ni_parent_country_frequency, columns=['country','NI']), how="left", on="country")
timepoint_4_parent_country_frequency_df[['RoI', 'NI']] = timepoint_4_parent_country_frequency_df[['RoI', 'NI']].fillna(value=0).astype(int)

In [172]:
timepoint_4_parent_country_frequency_df['country'] = timepoint_4_parent_country_frequency_df.apply(remove_continent, axis=1)

In [173]:
timepoint_4_parent_country_frequency_df

Unnamed: 0,country,Both,RoI,NI
0,England,726,223,503
1,Spain,51,23,28
2,Scotland,50,19,31
3,India,42,33,9
4,France,34,13,21
5,Netherlands,28,11,17
6,Denmark,28,10,18
7,Sweden,24,10,14
8,Germany,21,8,13
9,Wales,15,3,12


In [174]:
timepoint_4_parent_country_frequency_df.to_csv('../outputFiles/timepoint_4_parent_country_frequency.csv', index=False)

In [175]:
num_country_tipsTimepoint4 = strainCountryTimepoint4.groupby('country').count().reset_index()
num_country_tipsTimepoint4.rename(columns={'strain': 'num_tips'}, inplace=True)
num_country_tipsTimepoint4

Unnamed: 0,country,num_tips
0,Africa / Algeria,10
1,Africa / Angola,18
2,Africa / Benin,18
3,Africa / Botswana,76
4,Africa / Burundi,57
...,...,...
157,South America / Paraguay,8
158,South America / Peru,83
159,South America / Suriname,3
160,South America / Venezuela,2


In [176]:
num_country_tipsTimepoint4['country'] = num_country_tipsTimepoint4.apply(remove_continent, axis=1)
num_country_tipsTimepoint4

Unnamed: 0,country,num_tips
0,Algeria,10
1,Angola,18
2,Benin,18
3,Botswana,76
4,Burundi,57
...,...,...
157,Paraguay,8
158,Peru,83
159,Suriname,3
160,Venezuela,2


In [177]:
timepoint_4_parent_country_frequency_df = pd.merge(num_country_tipsTimepoint4, timepoint_4_parent_country_frequency_df, how="left", on=['country']).fillna(value=0)
timepoint_4_parent_country_frequency_df

Unnamed: 0,country,num_tips,Both,RoI,NI
0,Algeria,10,0.0,0.0,0.0
1,Angola,18,0.0,0.0,0.0
2,Benin,18,0.0,0.0,0.0
3,Botswana,76,0.0,0.0,0.0
4,Burundi,57,0.0,0.0,0.0
...,...,...,...,...,...
157,Paraguay,8,0.0,0.0,0.0
158,Peru,83,0.0,0.0,0.0
159,Suriname,3,0.0,0.0,0.0
160,Venezuela,2,0.0,0.0,0.0


In [178]:
timepoint_4_parent_country_frequency_df[~timepoint_4_parent_country_frequency_df['country'].isin(country_lat_long['country'])]

Unnamed: 0,country,num_tips,Both,RoI,NI
137,Northern Ireland,3147,0.0,0.0,0.0
145,Republic of Ireland,4923,0.0,0.0,0.0


In [179]:
timepoint_4_parent_country_frequency_df = pd.merge(country_lat_long, timepoint_4_parent_country_frequency_df, how="inner", on=['country'])
timepoint_4_parent_country_frequency_df

Unnamed: 0,longitude,latitude,country,distance_from_ireland,num_tips,Both,RoI,NI
0,-145.255376,-13.526317,French Polynesia,14206.711886,1,0.0,0.0,0.0
1,-90.312193,15.820879,Guatemala,8079.118206,80,0.0,0.0,0.0
2,-103.120439,23.643948,Mexico,8231.394213,3673,2.0,0.0,2.0
3,-93.566635,62.365872,Canada,4765.573313,8793,0.0,0.0,0.0
4,-64.443152,-37.605645,Argentina,11458.955064,83,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
155,34.600000,45.250000,Crimea,3182.289574,12,0.0,0.0,0.0
156,-1.464854,52.561928,England,462.610475,152100,726.0,223.0,503.0
157,-4.183963,56.816738,Scotland,468.558618,20683,50.0,19.0,31.0
158,-3.766409,52.330220,Wales,320.735659,9969,15.0,3.0,12.0


In [180]:
timepoint_4_parent_country_frequency_df.to_csv('../outputFiles/timepoint_4_parent_country_frequency_distance.csv', index=False)

In [181]:
timepoint_4_parent_country_frequency_df[timepoint_4_parent_country_frequency_df['Both'] > 0][['country','Both']].to_csv('../outputFiles/timepoint_4_country_frequency.csv', index=False)

In [183]:
for intro in tqdm(timepoint4['into_ireland_nodes']):
    parent_clusters = []
    is_parent_in_timepoint_list(intro['node'].replace('_',' '), tree4, timepoint4['into_ireland_nodes'])
    if len(parent_clusters) > 0:
        print(intro['node'])
        print(parent_clusters)

  0%|          | 0/1132 [00:00<?, ?it/s]

In [184]:
cluster_num = 0
clusters_timepoint_d = []
for intro in tqdm(timepoint4['into_ireland_nodes']):
    cluster_num += 1
    descendant_tips = []
    find_irish_clusters(tree4.find_node_with_label(intro['node'].replace('_',' ')), descendant_tips)
    for tip in list(set(descendant_tips)):
        clusters_timepoint_d.append({'tip': tip.replace(' ','_'), 'cluster': cluster_num, 'timepoint': 'D'})

  0%|          | 0/1132 [00:00<?, ?it/s]

In [185]:
clusters_timepoint_d_df = pd.DataFrame(clusters_timepoint_d)
clusters_timepoint_d_df

Unnamed: 0,tip,cluster,timepoint
0,EPI_ISL_2363379,1,D
1,EPI_ISL_2533637,1,D
2,EPI_ISL_2193937,1,D
3,EPI_ISL_2612914,1,D
4,EPI_ISL_2601122,1,D
...,...,...,...
8089,EPI_ISL_2984348,1131,D
8090,EPI_ISL_3247631,1131,D
8091,EPI_ISL_3205303,1131,D
8092,EPI_ISL_1976611,1132,D


In [186]:
len(clusters_timepoint_d_df['tip'].unique())

8070

In [187]:
clusters_timepoint_d_df.groupby('cluster').count().describe()

Unnamed: 0,tip,timepoint
count,1132.0,1132.0
mean,7.150177,7.150177
std,43.302265,43.302265
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,2.0,2.0
max,970.0,970.0


In [188]:
clusters_timepoint_d_df.to_csv('../outputFiles/clusters/clusters_timepoint_D.csv', index=False)

In [189]:
clusters_timepoint_d_df.groupby('cluster').count().sort_values(by='tip',ascending=False)

Unnamed: 0_level_0,tip,timepoint
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
613,970,970
13,795,795
1049,330,330
225,284,284
701,246,246
...,...,...
577,1,1
579,1,1
580,1,1
141,1,1


In [190]:
clusters_timepoint_d_df.groupby('cluster').count().reset_index().rename(columns={'tip':'irish_samples'})[['cluster','irish_samples']].to_csv('../outputFiles/clusters/clusters_timepoint_D_size.csv', index=False)

### Timepoint E

In [191]:
tree5 = dendropy.Tree.get(path="../outputFiles/timepoint5_all_pastml/named.tree_timepoint5.rooted.nwk", schema="newick")
tree5

<Tree object at 0x1e159b2e0>

In [192]:
strainCountryTimepoint5 = pd.read_csv('../outputFiles/timepoint5.strainCountry.all.txt')
strainCountryTimepoint5

Unnamed: 0,strain,country
0,EPI_ISL_402125,Asia / China
1,EPI_ISL_11246938,Africa / Nigeria
2,EPI_ISL_11229467,Europe / Austria
3,EPI_ISL_8542937,Asia / India
4,EPI_ISL_8542934,Asia / India
...,...,...
1313963,EPI_ISL_9681542,England
1313964,EPI_ISL_9681866,England
1313965,EPI_ISL_9681987,England
1313966,EPI_ISL_9681523,England


In [193]:
strainCountryTimepoint5['strain'] = strainCountryTimepoint5['strain'].str.replace('_',' ')

In [194]:
strainCountryTimepoint5.describe()

Unnamed: 0,strain,country
count,1313968,1313968
unique,1313968,164
top,EPI ISL 402125,North America / USA
freq,1,470629


In [195]:
strainCountryTimepoint5[strainCountryTimepoint5['country'].isin(['Republic of Ireland','Northern Ireland'])].groupby('country').count()

Unnamed: 0_level_0,strain
country,Unnamed: 1_level_1
Northern Ireland,3949
Republic of Ireland,7888


In [196]:
timepoint5 = process_tree(tree5)

In [197]:
len(timepoint5['into_ireland_nodes'])

2118

In [198]:
len(list({v['node']:v for v in timepoint5['into_ireland_nodes']}.values()))

1975

In [199]:
len(timepoint5['between_ireland_nodes'])

71

In [200]:
len(list({v['node']:v for v in timepoint5['between_ireland_nodes']}.values()))

71

In [201]:
timepoint5['between_ireland_nodes']

[{'node': 'node 423746',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 423723',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 423724',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 604844',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 606013',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 600527',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 600520',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 602001',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 601994',

In [202]:
timepoint_5_between_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], timepoint5['between_ireland_nodes'])))
timepoint_5_between_country_frequency = timepoint_5_between_country_counter.most_common()
timepoint_5_between_country_frequency

[("country='Republic of Ireland'", 38), ("country='Northern Ireland'", 33)]

In [203]:
timepoint5['into_ireland_nodes'] = list({v['node']:v for v in timepoint5['into_ireland_nodes']}.values())
timepoint5['into_ireland_nodes']

[{'node': 'node 386844',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 387076',
  'parent_country': "country='North America / USA'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 755871',
  'parent_country': "country='Scotland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 387082',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 427232',
  'parent_country': "country='North America / USA'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 427177',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 427660',
  'parent_country': "country='North America / USA'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 427711',
  'parent_country': "country='North America / USA'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 428564',
  'parent_country': "country='North A

In [204]:
timepoint5['into_ireland_nodes'] = list(map(lambda x: {'node': x['node'].replace(' ','_'), 'country': x['country'][9:-1], 'parent_country': x['parent_country'][9:-1]}, timepoint5['into_ireland_nodes']))

In [205]:
timepoint5['into_ireland_nodes_without_ambiguous'] = list(filter(lambda x: "|" not in x['parent_country'], timepoint5['into_ireland_nodes']))

In [206]:
len(timepoint5['into_ireland_nodes_without_ambiguous'])

1958

In [207]:
timepoint_5_country_counter = collections.Counter(list(map(lambda x: x['country'], timepoint5['into_ireland_nodes'])))
timepoint_5_country_frequency = timepoint_5_country_counter.most_common()
timepoint_5_country_frequency

[('Republic of Ireland', 1334), ('Northern Ireland', 641)]

In [208]:
timepoint_5_roi_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Republic of Ireland', timepoint5['into_ireland_nodes']))))
timepoint_5_roi_parent_country_frequency = timepoint_5_roi_parent_country_counter.most_common()
timepoint_5_roi_parent_country_frequency

[('England', 595),
 ('North America / USA', 380),
 ('Europe / France', 105),
 ('Europe / Spain', 34),
 ('Europe / Germany', 32),
 ('Europe / Poland', 21),
 ('North America / Canada', 21),
 ('Scotland', 14),
 ('South America / Brazil', 10),
 ('Europe / Croatia', 8),
 ('Europe / Belgium', 8),
 ('Europe / Denmark', 8),
 ('Europe / Switzerland', 7),
 ('Oceania / Australia', 7),
 ('Wales', 7),
 ('Europe / Netherlands', 6),
 ('Asia / India', 5),
 ('Asia / Japan', 5),
 ('Europe / Italy', 4),
 ('Africa / South Africa', 4),
 ('Europe / Turkey', 3),
 ('Europe / Portugal', 3),
 ('Europe / Latvia', 3),
 ('Europe / Norway', 3),
 ('Republic of Ireland|Europe / Poland', 2),
 ('South America / Peru', 2),
 ('Republic of Ireland|Europe / Germany', 2),
 ('Europe / Czech Republic', 2),
 ('Europe / Lithuania', 2),
 ('Europe / Slovenia', 2),
 ('Europe / Romania', 2),
 ('Republic of Ireland|Europe / Switzerland', 1),
 ('Asia / Bangladesh', 1),
 ('Asia / Sri Lanka', 1),
 ('England|Republic of Ireland', 1),
 (

In [209]:
timepoint_5_ni_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Northern Ireland', timepoint5['into_ireland_nodes']))))
timepoint_5_ni_parent_country_frequency = timepoint_5_ni_parent_country_counter.most_common()
timepoint_5_ni_parent_country_frequency

[('England', 401),
 ('North America / USA', 137),
 ('Scotland', 34),
 ('Europe / France', 18),
 ('Europe / Germany', 9),
 ('Wales', 6),
 ('North America / Canada', 5),
 ('Europe / Spain', 5),
 ('Europe / Poland', 4),
 ('Asia / India', 4),
 ('South America / Brazil', 3),
 ('Africa / South Africa', 3),
 ('Europe / Italy', 2),
 ('Asia / Japan', 2),
 ('Europe / Slovakia', 1),
 ('Europe / Romania', 1),
 ('Scotland|Northern Ireland', 1),
 ('Europe / Slovenia', 1),
 ('Europe / Portugal', 1),
 ('Europe / Croatia', 1),
 ('Europe / Switzerland|Northern Ireland', 1),
 ('Europe / Bulgaria', 1)]

In [210]:
timepoint_5_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], timepoint5['into_ireland_nodes'])))
timepoint_5_parent_country_frequency = timepoint_5_parent_country_counter.most_common()
timepoint_5_parent_country_frequency

[('England', 996),
 ('North America / USA', 517),
 ('Europe / France', 123),
 ('Scotland', 48),
 ('Europe / Germany', 41),
 ('Europe / Spain', 39),
 ('North America / Canada', 26),
 ('Europe / Poland', 25),
 ('South America / Brazil', 13),
 ('Wales', 13),
 ('Europe / Croatia', 9),
 ('Asia / India', 9),
 ('Europe / Belgium', 8),
 ('Europe / Denmark', 8),
 ('Europe / Switzerland', 7),
 ('Asia / Japan', 7),
 ('Oceania / Australia', 7),
 ('Africa / South Africa', 7),
 ('Europe / Netherlands', 6),
 ('Europe / Italy', 6),
 ('Europe / Portugal', 4),
 ('Europe / Turkey', 3),
 ('Europe / Romania', 3),
 ('Europe / Slovenia', 3),
 ('Europe / Latvia', 3),
 ('Europe / Norway', 3),
 ('Republic of Ireland|Europe / Poland', 2),
 ('South America / Peru', 2),
 ('Republic of Ireland|Europe / Germany', 2),
 ('Europe / Czech Republic', 2),
 ('Europe / Lithuania', 2),
 ('Europe / Bulgaria', 2),
 ('Republic of Ireland|Europe / Switzerland', 1),
 ('Europe / Slovakia', 1),
 ('Asia / Bangladesh', 1),
 ('Asia / 

In [211]:
timepoint_5_parent_country_frequency_df = pd.DataFrame(timepoint_5_parent_country_frequency, columns=['country','Both'])
timepoint_5_parent_country_frequency_df

Unnamed: 0,country,Both
0,England,996
1,North America / USA,517
2,Europe / France,123
3,Scotland,48
4,Europe / Germany,41
...,...,...
56,Asia / Thailand,1
57,Europe / Austria,1
58,Europe / Slovakia|Europe / Poland,1
59,Africa / Reunion,1


In [212]:
timepoint_5_parent_country_frequency_df = pd.merge(timepoint_5_parent_country_frequency_df, pd.DataFrame(timepoint_5_roi_parent_country_frequency, columns=['country','RoI']), how="left", on="country")
timepoint_5_parent_country_frequency_df = pd.merge(timepoint_5_parent_country_frequency_df, pd.DataFrame(timepoint_5_ni_parent_country_frequency, columns=['country','NI']), how="left", on="country")
timepoint_5_parent_country_frequency_df[['RoI', 'NI']] = timepoint_5_parent_country_frequency_df[['RoI', 'NI']].fillna(value=0).astype(int)

In [213]:
timepoint_5_parent_country_frequency_df['country'] = timepoint_5_parent_country_frequency_df.apply(remove_continent, axis=1)

In [214]:
timepoint_5_parent_country_frequency_df

Unnamed: 0,country,Both,RoI,NI
0,England,996,595,401
1,USA,517,380,137
2,France,123,105,18
3,Scotland,48,14,34
4,Germany,41,32,9
...,...,...,...,...
56,Thailand,1,1,0
57,Austria,1,1,0
58,Europe / Slovakia|Europe / Poland,1,1,0
59,Reunion,1,1,0


In [215]:
timepoint_5_parent_country_frequency_df.to_csv('../outputFiles/timepoint_5_parent_country_frequency.csv', index=False)

In [216]:
num_country_tipsTimepoint5 = strainCountryTimepoint5.groupby('country').count().reset_index()
num_country_tipsTimepoint5.rename(columns={'strain': 'num_tips'}, inplace=True)
num_country_tipsTimepoint5

Unnamed: 0,country,num_tips
0,Africa / Algeria,1
1,Africa / Angola,23
2,Africa / Botswana,726
3,Africa / Cabo Verde,133
4,Africa / Cameroon,4
...,...,...
159,South America / Peru,1872
160,South America / Suriname,64
161,South America / Trinidad and Tobago,34
162,South America / Venezuela,39


In [217]:
num_country_tipsTimepoint5['country'] = num_country_tipsTimepoint5.apply(remove_continent, axis=1)
num_country_tipsTimepoint5

Unnamed: 0,country,num_tips
0,Algeria,1
1,Angola,23
2,Botswana,726
3,Cabo Verde,133
4,Cameroon,4
...,...,...
159,Peru,1872
160,Suriname,64
161,Trinidad and Tobago,34
162,Venezuela,39


In [218]:
timepoint_5_parent_country_frequency_df = pd.merge(num_country_tipsTimepoint5, timepoint_5_parent_country_frequency_df, how="left", on=['country']).fillna(value=0)
timepoint_5_parent_country_frequency_df

Unnamed: 0,country,num_tips,Both,RoI,NI
0,Algeria,1,0.0,0.0,0.0
1,Angola,23,0.0,0.0,0.0
2,Botswana,726,0.0,0.0,0.0
3,Cabo Verde,133,0.0,0.0,0.0
4,Cameroon,4,0.0,0.0,0.0
...,...,...,...,...,...
159,Peru,1872,2.0,2.0,0.0
160,Suriname,64,0.0,0.0,0.0
161,Trinidad and Tobago,34,0.0,0.0,0.0
162,Venezuela,39,0.0,0.0,0.0


In [219]:
timepoint_5_parent_country_frequency_df[~timepoint_5_parent_country_frequency_df['country'].isin(country_lat_long['country'])]

Unnamed: 0,country,num_tips,Both,RoI,NI
138,Northern Ireland,3949,0.0,0.0,0.0
146,Republic of Ireland,7888,0.0,0.0,0.0


In [220]:
timepoint_5_parent_country_frequency_df = pd.merge(country_lat_long, timepoint_5_parent_country_frequency_df, how="inner", on=['country'])
timepoint_5_parent_country_frequency_df

Unnamed: 0,longitude,latitude,country,distance_from_ireland,num_tips,Both,RoI,NI
0,-170.700732,-14.305712,American Samoa,15412.146132,8,0.0,0.0,0.0
1,-145.255376,-13.526317,French Polynesia,14206.711886,13,0.0,0.0,0.0
2,-88.859115,13.758042,El Salvador,8163.052966,147,0.0,0.0,0.0
3,-90.312193,15.820879,Guatemala,8079.118206,285,0.0,0.0,0.0
4,-103.120439,23.643948,Mexico,8231.394213,8149,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...
157,35.000000,31.400000,Palestine,4213.049700,6,0.0,0.0,0.0
158,-1.464854,52.561928,England,462.610475,344687,996.0,595.0,401.0
159,-4.183963,56.816738,Scotland,468.558618,32025,48.0,14.0,34.0
160,-3.766409,52.330220,Wales,320.735659,34537,13.0,7.0,6.0


In [221]:
timepoint_5_parent_country_frequency_df.to_csv('../outputFiles/timepoint_5_parent_country_frequency_distance.csv', index=False)

In [222]:
timepoint_5_parent_country_frequency_df[timepoint_5_parent_country_frequency_df['Both'] > 0][['country','Both']].to_csv('../outputFiles/timepoint_5_country_frequency.csv', index=False)

In [224]:
for intro in tqdm(timepoint5['into_ireland_nodes']):
    parent_clusters = []
    is_parent_in_timepoint_list(intro['node'].replace('_',' '), tree5, timepoint5['into_ireland_nodes'])
    if len(parent_clusters) > 0:
        print(intro['node'])
        print(parent_clusters)

  0%|          | 0/1975 [00:00<?, ?it/s]

In [225]:
cluster_num = 0
clusters_timepoint_e = []
for intro in tqdm(timepoint5['into_ireland_nodes']):
    cluster_num += 1
    descendant_tips = []
    find_irish_clusters(tree5.find_node_with_label(intro['node'].replace('_',' ')), descendant_tips)
    for tip in list(set(descendant_tips)):
        clusters_timepoint_e.append({'tip': tip.replace(' ','_'), 'cluster': cluster_num, 'timepoint': 'E'})

  0%|          | 0/1975 [00:00<?, ?it/s]

In [226]:
clusters_timepoint_e_df = pd.DataFrame(clusters_timepoint_e)
clusters_timepoint_e_df

Unnamed: 0,tip,cluster,timepoint
0,EPI_ISL_10174017,1,E
1,EPI_ISL_10178453,2,E
2,EPI_ISL_10175130,2,E
3,EPI_ISL_10174898,2,E
4,EPI_ISL_10175823,2,E
...,...,...,...
11854,EPI_ISL_10177968,1973,E
11855,EPI_ISL_8805406,1974,E
11856,EPI_ISL_10171890,1974,E
11857,EPI_ISL_10173607,1974,E


In [227]:
len(clusters_timepoint_e_df['tip'].unique())

11837

In [228]:
clusters_timepoint_e_df.groupby('cluster').count().describe()

Unnamed: 0,tip,timepoint
count,1975.0,1975.0
mean,6.004557,6.004557
std,48.316034,48.316034
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,2.0,2.0
max,1325.0,1325.0


In [229]:
clusters_timepoint_e_df.to_csv('../outputFiles/clusters/clusters_timepoint_E.csv', index=False)

In [230]:
clusters_timepoint_e_df.groupby('cluster').count().sort_values(by='tip',ascending=False)

Unnamed: 0_level_0,tip,timepoint
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
316,1325,1325
1408,913,913
876,615,615
337,542,542
640,507,507
...,...,...
766,1,1
765,1,1
763,1,1
762,1,1


In [231]:
clusters_timepoint_e_df.groupby('cluster').count().reset_index().rename(columns={'tip':'irish_samples'})[['cluster','irish_samples']].to_csv('../outputFiles/clusters/clusters_timepoint_E_size.csv', index=False)

### Timepoint F

In [232]:
tree6 = dendropy.Tree.get(path="../outputFiles/timepoint6_all_pastml/named.tree_timepoint6.rooted.nwk", schema="newick")
tree6

<Tree object at 0x2a97306d0>

In [233]:
strainCountryTimepoint6 = pd.read_csv('../outputFiles/timepoint6.strainCountry.all.txt')
strainCountryTimepoint6

Unnamed: 0,strain,country
0,EPI_ISL_402125,Asia / China
1,EPI_ISL_9796145,Europe / France
2,EPI_ISL_10504840,Asia / India
3,EPI_ISL_9679276,Africa / South Africa
4,EPI_ISL_8128463,Africa / South Africa
...,...,...
644939,EPI_ISL_11670731,Europe / Poland
644940,EPI_ISL_11670738,Europe / Poland
644941,EPI_ISL_11664475,Europe / Poland
644942,EPI_ISL_11670728,Europe / Poland


In [234]:
strainCountryTimepoint6['strain'] = strainCountryTimepoint6['strain'].str.replace('_',' ')

In [235]:
strainCountryTimepoint6.describe()

Unnamed: 0,strain,country
count,644944,644944
unique,644944,112
top,EPI ISL 402125,England
freq,1,221477


In [236]:
strainCountryTimepoint6[strainCountryTimepoint6['country'].isin(['Republic of Ireland','Northern Ireland'])].groupby('country').count()

Unnamed: 0_level_0,strain
country,Unnamed: 1_level_1
Northern Ireland,5586
Republic of Ireland,1235


In [237]:
timepoint6 = process_tree(tree6)

In [238]:
len(timepoint6['into_ireland_nodes'])

1488

In [239]:
len(list({v['node']:v for v in timepoint6['into_ireland_nodes']}.values()))

1348

In [240]:
len(timepoint6['between_ireland_nodes'])

37

In [241]:
len(list({v['node']:v for v in timepoint6['between_ireland_nodes']}.values()))

37

In [242]:
timepoint6['between_ireland_nodes']

[{'node': 'node 658145',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 676376',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 684985',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 685234',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 685348',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 685347',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 692519',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 693248',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 699054',

In [243]:
timepoint_6_between_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], timepoint6['between_ireland_nodes'])))
timepoint_6_between_country_frequency = timepoint_6_between_country_counter.most_common()
timepoint_6_between_country_frequency

[("country='Northern Ireland'", 32), ("country='Republic of Ireland'", 5)]

In [244]:
timepoint6['into_ireland_nodes'] = list({v['node']:v for v in timepoint6['into_ireland_nodes']}.values())
timepoint6['into_ireland_nodes']

[{'node': 'node 614636',
  'parent_country': "country='Europe / Germany'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 614626',
  'parent_country': "country='Europe / Belgium'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 653553',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 653552',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 653551',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 653581',
  'parent_country': "country='Europe / France'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 653661',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 654646',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 654666',
  'parent_country': "country='England'",
  'coun

In [245]:
timepoint6['into_ireland_nodes'] = list(map(lambda x: {'node': x['node'].replace(' ','_'), 'country': x['country'][9:-1], 'parent_country': x['parent_country'][9:-1]}, timepoint6['into_ireland_nodes']))

In [246]:
timepoint6['into_ireland_nodes_without_ambiguous'] = list(filter(lambda x: "|" not in x['parent_country'], timepoint6['into_ireland_nodes']))

In [247]:
len(timepoint6['into_ireland_nodes_without_ambiguous'])

1323

In [248]:
timepoint_6_country_counter = collections.Counter(list(map(lambda x: x['country'], timepoint6['into_ireland_nodes'])))
timepoint_6_country_frequency = timepoint_6_country_counter.most_common()
timepoint_6_country_frequency

[('Northern Ireland', 924), ('Republic of Ireland', 424)]

In [249]:
timepoint_6_roi_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Republic of Ireland', timepoint6['into_ireland_nodes']))))
timepoint_6_roi_parent_country_frequency = timepoint_6_roi_parent_country_counter.most_common()
timepoint_6_roi_parent_country_frequency

[('England', 215),
 ('Europe / Denmark', 41),
 ('Europe / Germany', 29),
 ('Europe / Switzerland', 24),
 ('Scotland', 22),
 ('Asia / India', 16),
 ('Europe / France', 9),
 ('Europe / Sweden', 9),
 ('Europe / Netherlands', 7),
 ('Wales', 6),
 ('North America / USA', 5),
 ('Europe / Poland', 4),
 ('Europe / Slovakia', 4),
 ('Asia / Israel', 3),
 ('Europe / Belgium', 2),
 ('Europe / Austria', 2),
 ('Europe / Spain', 2),
 ('Republic of Ireland|Europe / France', 2),
 ('North America / USA|England', 1),
 ('Scotland|Republic of Ireland', 1),
 ('Asia / Sri Lanka', 1),
 ('Asia / Israel|Europe / France|Europe / Slovakia', 1),
 ('North America / USA|Wales', 1),
 ('North America / USA|Republic of Ireland', 1),
 ('Europe / Netherlands|Republic of Ireland|England', 1),
 ('Asia / Thailand', 1),
 ('England|Europe / France', 1),
 ('Europe / Poland|England', 1),
 ('Europe / Germany|England', 1),
 ('Europe / Germany|Republic of Ireland', 1),
 ('Europe / France|Europe / Switzerland', 1),
 ('Europe / Slove

In [250]:
timepoint_6_ni_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], filter(lambda x: x['country'] == 'Northern Ireland', timepoint6['into_ireland_nodes']))))
timepoint_6_ni_parent_country_frequency = timepoint_6_ni_parent_country_counter.most_common()
timepoint_6_ni_parent_country_frequency

[('England', 595),
 ('Europe / Denmark', 98),
 ('Scotland', 89),
 ('Europe / Germany', 51),
 ('Europe / Sweden', 18),
 ('Asia / India', 12),
 ('Europe / France', 9),
 ('North America / USA', 8),
 ('Wales', 5),
 ('Europe / Poland', 5),
 ('Europe / Norway', 5),
 ('Northern Ireland|Scotland', 4),
 ('Europe / Netherlands', 3),
 ('Europe / Austria', 3),
 ('Europe / Switzerland', 3),
 ('Europe / Spain', 2),
 ('Asia / Israel', 2),
 ('Europe / Estonia', 2),
 ('Northern Ireland|England', 1),
 ('Northern Ireland|Europe / Spain', 1),
 ('Asia / Hong Kong', 1),
 ('Europe / Germany|Northern Ireland', 1),
 ('Asia / Thailand', 1),
 ('Europe / Germany|England', 1),
 ('Europe / Germany|Europe / France', 1),
 ('Oceania / Australia', 1),
 ('Oceania / New Zealand', 1),
 ('Europe / Italy', 1)]

In [251]:
timepoint_6_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], timepoint6['into_ireland_nodes'])))
timepoint_6_parent_country_frequency = timepoint_6_parent_country_counter.most_common()
timepoint_6_parent_country_frequency

[('England', 810),
 ('Europe / Denmark', 139),
 ('Scotland', 111),
 ('Europe / Germany', 80),
 ('Asia / India', 28),
 ('Europe / Switzerland', 27),
 ('Europe / Sweden', 27),
 ('Europe / France', 18),
 ('North America / USA', 13),
 ('Wales', 11),
 ('Europe / Netherlands', 10),
 ('Europe / Poland', 9),
 ('Europe / Austria', 5),
 ('Asia / Israel', 5),
 ('Europe / Norway', 5),
 ('Northern Ireland|Scotland', 4),
 ('Europe / Spain', 4),
 ('Europe / Slovakia', 4),
 ('Europe / Belgium', 2),
 ('Asia / Thailand', 2),
 ('Republic of Ireland|Europe / France', 2),
 ('Europe / Germany|England', 2),
 ('Oceania / Australia', 2),
 ('Europe / Estonia', 2),
 ('North America / USA|England', 1),
 ('Northern Ireland|England', 1),
 ('Northern Ireland|Europe / Spain', 1),
 ('Asia / Hong Kong', 1),
 ('Scotland|Republic of Ireland', 1),
 ('Asia / Sri Lanka', 1),
 ('Asia / Israel|Europe / France|Europe / Slovakia', 1),
 ('North America / USA|Wales', 1),
 ('North America / USA|Republic of Ireland', 1),
 ('Europe 

In [252]:
timepoint_6_parent_country_frequency_df = pd.DataFrame(timepoint_6_parent_country_frequency, columns=['country','Both'])
timepoint_6_parent_country_frequency_df

Unnamed: 0,country,Both
0,England,810
1,Europe / Denmark,139
2,Scotland,111
3,Europe / Germany,80
4,Asia / India,28
5,Europe / Switzerland,27
6,Europe / Sweden,27
7,Europe / France,18
8,North America / USA,13
9,Wales,11


In [253]:
timepoint_6_parent_country_frequency_df = pd.merge(timepoint_6_parent_country_frequency_df, pd.DataFrame(timepoint_6_roi_parent_country_frequency, columns=['country','RoI']), how="left", on="country")
timepoint_6_parent_country_frequency_df = pd.merge(timepoint_6_parent_country_frequency_df, pd.DataFrame(timepoint_6_ni_parent_country_frequency, columns=['country','NI']), how="left", on="country")
timepoint_6_parent_country_frequency_df[['RoI', 'NI']] = timepoint_6_parent_country_frequency_df[['RoI', 'NI']].fillna(value=0).astype(int)

In [254]:
timepoint_6_parent_country_frequency_df['country'] = timepoint_6_parent_country_frequency_df.apply(remove_continent, axis=1)

In [255]:
timepoint_6_parent_country_frequency_df

Unnamed: 0,country,Both,RoI,NI
0,England,810,215,595
1,Denmark,139,41,98
2,Scotland,111,22,89
3,Germany,80,29,51
4,India,28,16,12
5,Switzerland,27,24,3
6,Sweden,27,9,18
7,France,18,9,9
8,USA,13,5,8
9,Wales,11,6,5


In [256]:
timepoint_6_parent_country_frequency_df.to_csv('../outputFiles/timepoint_6_parent_country_frequency.csv', index=False)

In [257]:
num_country_tipsTimepoint6 = strainCountryTimepoint6.groupby('country').count().reset_index()
num_country_tipsTimepoint6.rename(columns={'strain': 'num_tips'}, inplace=True)
num_country_tipsTimepoint6

Unnamed: 0,country,num_tips
0,Africa / Algeria,22
1,Africa / Botswana,60
2,Africa / Djibouti,1
3,Africa / Ethiopia,1
4,Africa / Kenya,5
...,...,...
107,South America / Colombia,1
108,South America / Curacao,72
109,South America / Ecuador,5
110,South America / Peru,17


In [258]:
num_country_tipsTimepoint6['country'] = num_country_tipsTimepoint6.apply(remove_continent, axis=1)
num_country_tipsTimepoint6

Unnamed: 0,country,num_tips
0,Algeria,22
1,Botswana,60
2,Djibouti,1
3,Ethiopia,1
4,Kenya,5
...,...,...
107,Colombia,1
108,Curacao,72
109,Ecuador,5
110,Peru,17


In [259]:
timepoint_6_parent_country_frequency_df = pd.merge(num_country_tipsTimepoint6, timepoint_6_parent_country_frequency_df, how="left", on=['country']).fillna(value=0)
timepoint_6_parent_country_frequency_df

Unnamed: 0,country,num_tips,Both,RoI,NI
0,Algeria,22,0.0,0.0,0.0
1,Botswana,60,0.0,0.0,0.0
2,Djibouti,1,0.0,0.0,0.0
3,Ethiopia,1,0.0,0.0,0.0
4,Kenya,5,0.0,0.0,0.0
...,...,...,...,...,...
107,Colombia,1,0.0,0.0,0.0
108,Curacao,72,0.0,0.0,0.0
109,Ecuador,5,0.0,0.0,0.0
110,Peru,17,0.0,0.0,0.0


In [260]:
timepoint_6_parent_country_frequency_df[~timepoint_6_parent_country_frequency_df['country'].isin(country_lat_long['country'])]

Unnamed: 0,country,num_tips,Both,RoI,NI
94,Northern Ireland,5586,0.0,0.0,0.0
101,Republic of Ireland,1235,0.0,0.0,0.0


In [261]:
timepoint_6_parent_country_frequency_df = pd.merge(country_lat_long, timepoint_6_parent_country_frequency_df, how="inner", on=['country'])
timepoint_6_parent_country_frequency_df

Unnamed: 0,longitude,latitude,country,distance_from_ireland,num_tips,Both,RoI,NI
0,-103.120439,23.643948,Mexico,8231.394213,62,0.0,0.0,0.0
1,-93.566635,62.365872,Canada,4765.573313,4437,0.0,0.0,0.0
2,-64.443152,-37.605645,Argentina,11458.955064,10,0.0,0.0,0.0
3,-81.255450,-34.328238,Chile,12001.453231,72,0.0,0.0,0.0
4,-82.338976,-1.298065,Ecuador,9076.012306,5,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
105,55.532500,-21.114444,Reunion,10279.460896,132,0.0,0.0,0.0
106,-1.464854,52.561928,England,462.610475,221477,810.0,215.0,595.0
107,-4.183963,56.816738,Scotland,468.558618,56753,111.0,22.0,89.0
108,-3.766409,52.330220,Wales,320.735659,12294,11.0,6.0,5.0


In [262]:
timepoint_6_parent_country_frequency_df.to_csv('../outputFiles/timepoint_6_parent_country_frequency_distance.csv', index=False)

In [263]:
timepoint_6_parent_country_frequency_df[timepoint_6_parent_country_frequency_df['Both'] > 0][['country','Both']].to_csv('../outputFiles/timepoint_6_country_frequency.csv', index=False)

In [264]:
for intro in tqdm(timepoint6['into_ireland_nodes']):
    parent_clusters = []
    is_parent_in_timepoint_list(intro['node'].replace('_',' '), tree6, timepoint6['into_ireland_nodes'])
    if len(parent_clusters) > 0:
        print(intro['node'])
        print(parent_clusters)

  0%|          | 0/1348 [00:00<?, ?it/s]

In [265]:
cluster_num = 0
clusters_timepoint_f = []
for intro in tqdm(timepoint6['into_ireland_nodes']):
    cluster_num += 1
    descendant_tips = []
    find_irish_clusters(tree6.find_node_with_label(intro['node'].replace('_',' ')), descendant_tips)
    for tip in list(set(descendant_tips)):
        clusters_timepoint_f.append({'tip': tip.replace(' ','_'), 'cluster': cluster_num, 'timepoint': 'F'})

  0%|          | 0/1348 [00:00<?, ?it/s]

In [266]:
clusters_timepoint_f_df = pd.DataFrame(clusters_timepoint_f)
clusters_timepoint_f_df

Unnamed: 0,tip,cluster,timepoint
0,EPI_ISL_10943349,1,F
1,EPI_ISL_10943369,2,F
2,EPI_ISL_10632641,2,F
3,EPI_ISL_10887430,2,F
4,EPI_ISL_10585184,3,F
...,...,...,...
6841,EPI_ISL_11582812,1346,F
6842,EPI_ISL_11508238,1346,F
6843,EPI_ISL_11460572,1347,F
6844,EPI_ISL_11058569,1348,F


In [267]:
len(clusters_timepoint_f_df['tip'].unique())

6821

In [268]:
clusters_timepoint_f_df.groupby('cluster').count().describe()

Unnamed: 0,tip,timepoint
count,1348.0,1348.0
mean,5.078635,5.078635
std,68.002444,68.002444
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,2.0,2.0
max,2476.0,2476.0


In [269]:
clusters_timepoint_f_df.to_csv('../outputFiles/clusters/clusters_timepoint_F.csv', index=False)

In [270]:
clusters_timepoint_f_df.groupby('cluster').count().sort_values(by='tip',ascending=False)

Unnamed: 0_level_0,tip,timepoint
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
1311,2476,2476
1218,145,145
690,135,135
1018,117,117
1213,95,95
...,...,...
606,1,1
607,1,1
609,1,1
611,1,1


In [271]:
clusters_timepoint_f_df.groupby('cluster').count().reset_index().rename(columns={'tip':'irish_samples'})[['cluster','irish_samples']].to_csv('../outputFiles/clusters/clusters_timepoint_F_size.csv', index=False)

In [58]:
tree2 = dendropy.Tree.get(path="../outputFiles/timepoint2_all_pastml/named.tree_timepoint2.rooted.nwk", schema="newick")

In [59]:
tree2

<Tree object at 0x16f9e3790>

In [60]:
strainCountryTimepoint2 = pd.read_csv('../outputFiles/timepoint2.strainCountry.all.txt')
strainCountryTimepoint2

Unnamed: 0,strain,country
0,EPI_ISL_402125,Asia / China
1,EPI_ISL_9388908,Asia / India
2,EPI_ISL_9388909,Asia / India
3,EPI_ISL_9388907,Asia / India
4,EPI_ISL_791809,North America / USA
...,...,...
31925,EPI_ISL_829314,Europe / Iceland
31926,EPI_ISL_645965,England
31927,EPI_ISL_645964,England
31928,EPI_ISL_1311504,Europe / Netherlands


In [71]:
strainCountryTimepoint2['strain'] = strainCountryTimepoint2['strain'].str.replace('_',' ')

In [72]:
into_ireland_nodes_timepoint_2 = []
between_ireland_nodes_timepoint_2 = []
roi_count_timepoint_2 = 0
ni_count_timepoint_2 = 0
for strain in tqdm(strainCountryTimepoint2[strainCountryTimepoint2['country'].isin(['Republic of Ireland','Northern Ireland'])]['strain'].to_list()):
    node = tree2.find_node_with_taxon_label(strain)
    
    if node:
        if str(node.annotations["country"]) == "country='Republic of Ireland'":
            roi_count_timepoint_2 += 1
        elif str(node.annotations["country"]) == "country='Northern Ireland'":
            ni_count_timepoint_2 += 1
        
        if str(node.parent_node.annotations["country"]) != "country='Republic of Ireland'" and str(node.parent_node.annotations["country"]) != "country='Northern Ireland'":
            into_ireland_nodes_timepoint_2.append({'node': str(node.parent_node.label), 'parent_country': str(node.parent_node.annotations["country"]), 'country': str(node.annotations["country"])})
        elif str(node.parent_node.annotations["country"]) == "country='Republic of Ireland'" and str(node.annotations["country"]) == "country='Northern Ireland'":
            between_ireland_nodes_timepoint_2.append({'node': str(node.parent_node.label), 'parent_country': str(node.parent_node.annotations["country"]), 'country': str(node.annotations["country"])})
        elif str(node.annotations["country"]) == "country='Republic of Ireland'" and str(node.parent_node.annotations["country"]) == "country='Northern Ireland'":
            between_ireland_nodes_timepoint_2.append({'node': str(node.parent_node.label), 'parent_country': str(node.parent_node.annotations["country"]), 'country': str(node.annotations["country"])})

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 511/511 [00:11<00:00, 42.91it/s]


In [73]:
roi_count_timepoint_2

215

In [74]:
ni_count_timepoint_2

296

In [75]:
len(into_ireland_nodes_timepoint_2)

28

In [76]:
len(list({v['node']:v for v in into_ireland_nodes_timepoint_2}.values()))

27

In [77]:
len(between_ireland_nodes_timepoint_2)

1

In [78]:
len(list({v['node']:v for v in between_ireland_nodes_timepoint_2}.values()))

1

In [79]:
between_ireland_nodes_timepoint_2

[{'node': 'node 105046',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"}]

In [80]:
into_ireland_nodes_timepoint_2 = list({v['node']:v for v in into_ireland_nodes_timepoint_2}.values())
into_ireland_nodes_timepoint_2

[{'node': 'node 120096',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 105921',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 105941',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 129301',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 122113',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 120391',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 109693',
  'parent_country': "country='Europe / Norway|Europe / Iceland|Europe / Lithuania|Europe / Latvia'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 108015',
  'parent_country': "country='England'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 108061',
  'parent_country': 

In [81]:
into_ireland_nodes_timepoint_2 = list(map(lambda x: {'node': x['node'].replace(' ','_'), 'country': x['country'][9:-1], 'parent_country': x['parent_country'][9:-1]}, into_ireland_nodes_timepoint_2))

In [82]:
into_ireland_nodes_without_ambiguous_timepoint_2 = list(filter(lambda x: "|" not in x['parent_country'], into_ireland_nodes_timepoint_2))

In [83]:
len(into_ireland_nodes_without_ambiguous_timepoint_2)

26

In [84]:
timepoint_2_country_counter = collections.Counter(list(map(lambda x: x['country'], into_ireland_nodes_timepoint_2)))
timepoint_2_country_frequency = timepoint_2_country_counter.most_common()
timepoint_2_country_frequency

[('Northern Ireland', 16), ('Republic of Ireland', 11)]

In [85]:
11/(11+16)

0.4074074074074074

In [86]:
215/(215+296)

0.4207436399217221

In [87]:
timepoint_2_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], into_ireland_nodes_timepoint_2)))
timepoint_2_parent_country_frequency = timepoint_2_parent_country_counter.most_common()
timepoint_2_parent_country_frequency

[('England', 24),
 ('Europe / Norway|Europe / Iceland|Europe / Lithuania|Europe / Latvia', 1),
 ('Scotland', 1),
 ('Wales', 1)]

In [89]:
24/26

0.9230769230769231

In [64]:
with open('../outputFiles/timepoint_2_parent_country_frequency.csv', 'w') as f:
    f.write('country' + ',' + 'frequency' + '\n')
    for country in timepoint_2_parent_country_frequency:
        if len(country[0].split(' / ')) > 1:
            f.write(country[0].split(' / ')[1] + ',' + str(country[1]) + '\n')
        else:
            f.write(country[0] + ',' + str(country[1]) + '\n')

In [88]:
timepoint_2_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country']+' ||| '+x['country'], into_ireland_nodes_timepoint_2)))
timepoint_2_parent_country_frequency = timepoint_2_parent_country_counter.most_common()
timepoint_2_parent_country_frequency

[('England ||| Northern Ireland', 15),
 ('England ||| Republic of Ireland', 9),
 ('Europe / Norway|Europe / Iceland|Europe / Lithuania|Europe / Latvia ||| Republic of Ireland',
  1),
 ('Scotland ||| Republic of Ireland', 1),
 ('Wales ||| Northern Ireland', 1)]

In [189]:
number_of_tips_in_tree_1 = strainCountryAll.groupby('country').count().reset_index()
number_of_tips_in_tree_1

Unnamed: 0,country,strain
0,Africa / Algeria,3
1,Africa / Benin,12
2,Africa / Botswana,1
3,Africa / Burkina Faso,6
4,Africa / Cameroon,9
...,...,...
151,South America / Peru,361
152,South America / Suriname,3
153,South America / Uruguay,32
154,South America / Venezuela,8


In [192]:
number_of_tips_in_tree_1['country'] = number_of_tips_in_tree_1.apply(lambda x: x['country'].split(' / ')[1] if len(x['country'].split(' / ')) > 1 else x['country'], axis=1)

In [193]:
number_of_tips_in_tree_1

Unnamed: 0,country,strain
0,Algeria,3
1,Benin,12
2,Botswana,1
3,Burkina Faso,6
4,Cameroon,9
...,...,...
151,Peru,361
152,Suriname,3
153,Uruguay,32
154,Venezuela,8


In [198]:
number_of_tips_in_tree_1 = pd.merge(number_of_tips_in_tree_1, country_lat_long, how="left", on='country')
number_of_tips_in_tree_1

Unnamed: 0,country,strain,longitude,latitude,distance_from_ireland
0,Algeria,3,2.655846,28.350970,2915.226435
1,Benin,12,2.305715,9.503013,4961.956763
2,Botswana,1,23.857800,-22.236609,8951.399815
3,Burkina Faso,6,-1.693282,12.108709,4617.886446
4,Cameroon,9,12.948474,6.294168,5569.428751
...,...,...,...,...,...
151,Peru,361,-74.114162,-8.522718,9222.655323
152,Suriname,3,-55.855514,4.098724,6967.769283
153,Uruguay,32,-56.019195,-32.781950,10622.912302
154,Venezuela,8,-66.361124,7.162821,7293.305295


In [199]:
number_of_tips_in_tree_1.describe()

Unnamed: 0,strain,longitude,latitude,distance_from_ireland
count,156.0,154.0,154.0,154.0
mean,662.282051,15.600869,22.51284,5840.09411
std,2670.690378,60.052098,24.381939,3408.745879
min,1.0,-127.763857,-43.657479,320.735659
25%,8.0,-11.603323,7.776016,2722.02833
50%,30.5,18.564862,21.922834,5758.430372
75%,294.5,45.314011,42.689861,8131.241747
max,27699.0,178.695173,65.187111,16220.400928


In [200]:
number_of_tips_in_tree_1[number_of_tips_in_tree_1.isna().any(axis=1)]

Unnamed: 0,country,strain,longitude,latitude,distance_from_ireland
134,Northern Ireland,633,,,
139,Republic of Ireland,714,,,


In [69]:
tree4 = dendropy.Tree.get(path="../outputFiles/timepoint4_all_pastml/named.tree_timepoint4.rooted.nwk", schema="newick")

In [70]:
tree4

<Tree object at 0x167b7f130>

In [71]:
strainCountryTimepoint4 = pd.read_csv('../outputFiles/timepoint4.strainCountry.all.txt')
strainCountryTimepoint4

Unnamed: 0,strain,country
0,EPI_ISL_402125,Asia / China
1,EPI_ISL_6949234,North America / USA
2,EPI_ISL_5387777,North America / USA
3,EPI_ISL_6948687,North America / USA
4,EPI_ISL_6948551,North America / USA
...,...,...
513949,EPI_ISL_3283578,England
513950,EPI_ISL_3283579,England
513951,EPI_ISL_3283580,England
513952,EPI_ISL_3283565,England


In [72]:
strainCountryTimepoint4['strain'] = strainCountryTimepoint4['strain'].str.replace('_',' ')

In [73]:
into_ireland_nodes_timepoint_4 = []
between_ireland_nodes_timepoint_4 = []
roi_count_timepoint_4 = 0
ni_count_timepoint_4 = 0
for strain in tqdm(strainCountryTimepoint4[strainCountryTimepoint4['country'].isin(['Republic of Ireland','Northern Ireland'])]['strain'].to_list()):
    node = tree4.find_node_with_taxon_label(strain)
    
    if node:
        if str(node.annotations["country"]) == "country='Republic of Ireland'":
            roi_count_timepoint_4 += 1
        elif str(node.annotations["country"]) == "country='Northern Ireland'":
            ni_count_timepoint_4 += 1
        
        if str(node.parent_node.annotations["country"]) != "country='Republic of Ireland'" and str(node.parent_node.annotations["country"]) != "country='Northern Ireland'":
            into_ireland_nodes_timepoint_4.append({'node': str(node.parent_node.label), 'parent_country': str(node.parent_node.annotations["country"]), 'country': str(node.annotations["country"])})
        elif str(node.parent_node.annotations["country"]) == "country='Republic of Ireland'" and str(node.annotations["country"]) == "country='Northern Ireland'":
            between_ireland_nodes_timepoint_4.append({'node': str(node.parent_node.label), 'parent_country': str(node.parent_node.annotations["country"]), 'country': str(node.annotations["country"])})
        elif str(node.annotations["country"]) == "country='Republic of Ireland'" and str(node.parent_node.annotations["country"]) == "country='Northern Ireland'":
            between_ireland_nodes_timepoint_4.append({'node': str(node.parent_node.label), 'parent_country': str(node.parent_node.annotations["country"]), 'country': str(node.annotations["country"])})

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8070/8070 [1:17:45<00:00,  1.73it/s]


In [74]:
roi_count_timepoint_4

4923

In [75]:
ni_count_timepoint_4

3147

In [76]:
len(into_ireland_nodes_timepoint_4)

812

In [77]:
len(list({v['node']:v for v in into_ireland_nodes_timepoint_4}.values()))

774

In [78]:
len(between_ireland_nodes_timepoint_4)

46

In [79]:
len(list({v['node']:v for v in between_ireland_nodes_timepoint_4}.values()))

46

In [80]:
between_ireland_nodes_timepoint_4

[{'node': 'node 823559',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 1177753',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 1121739',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 1121740',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 984775',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 1135191',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 984756',
  'parent_country': "country='Republic of Ireland'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 984816',
  'parent_country': "country='Northern Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 8299

In [87]:
timepoint_4_between_country_counter = collections.Counter(list(map(lambda x: x['country'], between_ireland_nodes_timepoint_4)))
timepoint_4_between_country_frequency = timepoint_4_between_country_counter.most_common()
timepoint_4_between_country_frequency

[("country='Northern Ireland'", 24), ("country='Republic of Ireland'", 22)]

In [81]:
into_ireland_nodes_timepoint_4 = list({v['node']:v for v in into_ireland_nodes_timepoint_4}.values())
into_ireland_nodes_timepoint_4

[{'node': 'node 836447',
  'parent_country': "country='Asia / India'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 1479099',
  'parent_country': "country='Europe / Denmark|Europe / Netherlands|Republic of Ireland'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 1513292',
  'parent_country': "country='Asia / India'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 832846',
  'parent_country': "country='Asia / India'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 834312',
  'parent_country': "country='Asia / India'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 1498667',
  'parent_country': "country='Wales'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 833057',
  'parent_country': "country='England'",
  'country': "country='Northern Ireland'"},
 {'node': 'node 823767',
  'parent_country': "country='Asia / India'",
  'country': "country='Republic of Ireland'"},
 {'node': 'node 1459019

In [82]:
into_ireland_nodes_timepoint_4 = list(map(lambda x: {'node': x['node'].replace(' ','_'), 'country': x['country'][9:-1], 'parent_country': x['parent_country'][9:-1]}, into_ireland_nodes_timepoint_4))

In [83]:
into_ireland_nodes_without_ambiguous_timepoint_4 = list(filter(lambda x: "|" not in x['parent_country'], into_ireland_nodes_timepoint_4))

In [84]:
len(into_ireland_nodes_without_ambiguous_timepoint_4)

763

In [85]:
timepoint_4_country_counter = collections.Counter(list(map(lambda x: x['country'], into_ireland_nodes_timepoint_4)))
timepoint_4_country_frequency = timepoint_4_country_counter.most_common()
timepoint_4_country_frequency

[('Northern Ireland', 475), ('Republic of Ireland', 299)]

In [85]:
11/(11+16)

0.4074074074074074

In [86]:
215/(215+296)

0.4207436399217221

In [86]:
timepoint_4_parent_country_counter = collections.Counter(list(map(lambda x: x['parent_country'], into_ireland_nodes_timepoint_4)))
timepoint_4_parent_country_frequency = timepoint_4_parent_country_counter.most_common()
timepoint_4_parent_country_frequency

[('England', 471),
 ('Europe / Spain', 43),
 ('Scotland', 29),
 ('Europe / France', 27),
 ('Asia / India', 21),
 ('Europe / Netherlands', 21),
 ('Europe / Sweden', 21),
 ('Europe / Denmark', 20),
 ('Europe / Germany', 17),
 ('North America / USA', 12),
 ('Wales', 10),
 ('Europe / Russia', 8),
 ('Europe / Portugal', 8),
 ('Europe / Italy', 8),
 ('Africa / Nigeria', 6),
 ('Europe / Switzerland', 6),
 ('Europe / Greece', 4),
 ('Republic of Ireland|England', 4),
 ('Europe / Belgium', 3),
 ('Africa / Uganda', 2),
 ('Asia / South Korea', 2),
 ('Europe / Turkey', 2),
 ('North America / Mexico', 2),
 ('Africa / Ghana', 2),
 ('Asia / Israel', 2),
 ('Europe / Denmark|Europe / Netherlands|Republic of Ireland', 1),
 ('Europe / Belgium|Europe / France', 1),
 ('Africa / Namibia', 1),
 ('Europe / Denmark|Europe / Italy', 1),
 ('Asia / Japan', 1),
 ('Asia / Indonesia', 1),
 ('Europe / Lithuania', 1),
 ('Europe / Croatia', 1),
 ('Europe / France|Northern Ireland', 1),
 ('Europe / Portugal|Republic of I