In [3]:
import math
import os  # used when opening the PickleShare store; previously only available via the wildcard import below

import numpy as np
import pandas as pd
from pickleshare import *
from tabulate import tabulate

In [4]:
# Open the PickleShare store located in the 'SharedVars' folder of the current working directory.
shared_vars_dir = os.path.join(os.getcwd(), 'SharedVars')
db = PickleShareDB(shared_vars_dir)


<i>Question 3</i>

For every year, model the data as a multinomial probability distribution for the following features: (1) gender, (2) age, (3) purpose of visit. <br>
Do it overall as well as country-wise and continent-wise.<br>
Find the years when the entropy of the distributions (three separate) for the overall count is the maximum and the minimum.<br>
Repeat it for every country and continent.<br>

In [5]:
# Years for which a data folder is read (note that 2015 is not in the list).
years = [
    '2013', '2014', '2016', '2017', '2018',
    '2019', '2020', '2021', '2022',
]

# Region header labels; rows carrying one of these names mark the start of a region in each sheet.
regions = [
    'North America',
    'Central & South America',
    'Western Europe',
    'Eastern Europe',
    'Africa',
    'West Asia',
    'South Asia',
    'South East Asia',
    'East Asia',
    'Australasia',
]


In [6]:
def entropy(prob_list):
    """Return the Shannon entropy, in bits, of the probabilities in ``prob_list``.

    Only entries ``p`` with ``0 < p <= 1`` contribute ``-p * log2(p)``; every
    other entry is skipped. This is what lets a parameter tuple such as
    ``(7648432, 0.59, 0.41)`` be passed directly: the leading trial count is
    larger than 1 and therefore ignored.

    A falsy argument (``None``, an empty list/tuple) yields ``0``.
    """
    if not prob_list:
        return 0
    return 0 - sum(p * math.log2(p) for p in prob_list if 0 < p <= 1)

Working on Gender-wise distribution of FTAs

In [7]:
data_g = {}   # 'dfYY' -> gender-wise sheet for that year, as read from Excel
cubes_g = {}  # 'cubeYY' -> list of per-region 2D lists with shares converted to fractions

for y in years:
    data_g[f'df{y[-2:]}'] = pd.read_excel(rf"data/TourismData-{y}/NATIONALITY-WISE GENDER-WISE DISTRIBUTION OF FTAs IN INDIA.xlsx") ## to store pandas dataframe

# NOTE(review): the printed 2022 tuples show the male/female shares flipped relative to every
# other year (e.g. Overall 0.42/0.58 vs ~0.6/0.4) -- confirm that the 2022 sheet uses the same
# column order, since columns are addressed by position below.
for y in years:
    df = data_g[f'df{y[-2:]}']
    region_indices = df[df['Country'].isin(regions)].index.tolist()

    # Every region header row opens a chunk that runs up to the next header (or the end of the sheet).
    # NOTE(review): index labels are used as iloc positions, which assumes the default RangeIndex.
    chunk_bounds = region_indices + [len(df)]
    split_data = [
        df.iloc[start_idx:end_idx].values.tolist()  ## region separated 2D lists
        for start_idx, end_idx in zip(chunk_bounds, chunk_bounds[1:])
    ]
    # Dataframe split into 2D lists based on region
    # Format of 2D lists created : [['North America', nan, nan, nan], ['Canada', 268485.0, 53.7, 46.3], ['USA', 1118983.0, 55.9, 44.1]]
    # 1st row contains the name of the region with empty spaces for sum of features, followed by countries in that region with their features.

    # Calculate region-wise total and count-weighted male / female (/ third gender) percentages
    for chunk in split_data:
        header = chunk[0]
        total = 0
        males = 0
        females = 0
        others = 0
        for row in chunk[1:]:
            total += row[1]
            males += row[2]*row[1]
            females += row[3]*row[1]
            if len(row) == 5:
                others += row[4]*row[1]
        header[1] = int(total)
        header[2] = round(males/total,2)
        header[3] = round(females/total,2)
        if len(header) == 5:
            header[4] = round(others/total,2)

    # Turn percentages into fractions (MLE estimates of the category probabilities).
    # The optional fifth column is detected by row length, exactly as in the aggregation above,
    # instead of by comparing the year string.
    for chunk in split_data:
        for row in chunk:
            row[2] = round(row[2]/100,2) ## probability of male tourist
            row[3] = round(row[3]/100,2) ## probability of female tourist
            if len(row) == 5:
                row[4] = round(row[4]/100,2) ## probability of third-gender tourist

    cubes_g[f'cube{y[-2:]}'] = split_data

In [8]:
# For every year, collect the region rows, the country rows (without the 'Others' catch-all rows)
# and an overall row whose shares are the count-weighted averages of the region shares.
ans_g = {}
for y in years:
    cube = cubes_g[f'cube{y[-2:]}']

    region_rows = [chunk[0] for chunk in cube]
    country_rows = [row for chunk in cube for row in chunk[1:] if row[0] != 'Others']
    ans = {'countries': country_rows, 'regions': region_rows}

    total_overall = sum(row[1] for row in region_rows)
    males_overall = sum(row[2]*row[1] for row in region_rows)
    females_overall = sum(row[3]*row[1] for row in region_rows)

    overall_row = [
        'Overall',
        total_overall,
        round(males_overall/total_overall,2),
        round(females_overall/total_overall,2),
    ]
    # Only the 2022 data is treated as carrying a third-gender column.
    if y == '2022':
        others_overall = sum(row[4]*row[1] for row in region_rows)
        overall_row.append(round(others_overall/total_overall,2))
    ans['overall'] = overall_row

    ans_g[y] = ans



In [9]:
def find_data(dict_name, year, type, name): # year and type to be provided in string format. type is 'countries' or 'regions'.
    """Look up ``name`` (case-insensitively) among ``dict_name[year][type]`` rows.

    Each row is a list whose first element is the label and whose second
    element is a count. For the first matching row the function returns a
    tuple of everything after the label, with the count converted to ``int``.
    The stored row itself is left untouched. Returns ``None`` when no row
    matches.
    """
    for row in dict_name[year][type]:
        if row[0].lower() != name.lower():
            continue
        return (int(row[1]),) + tuple(row[2:])
    return None

In [10]:
# Alphabetically sorted union of all country names that occur in any year of the gender data.
countries = sorted({c_data[0] for y in years for c_data in ans_g[f'{y}']['countries']})

In [11]:
table_gc = {} # g-> gender; c-> countrywise
table_gr = {} # g-> gender; r-> region-wise
table_go = {} # g-> gender; o-> overall
table_gc['Country'] = countries
table_gr['Continent'] = regions
table_go[' '] = ['Overall']

# One column per year; each cell holds the parameter tuple returned by find_data
# (None when that country/region has no row for the year).
for y in years:
    table_gc[y] = [find_data(ans_g, y, 'countries', c) for c in countries]
    table_gr[y] = [find_data(ans_g, y, 'regions', r) for r in regions]
    table_go[y] = [tuple(ans_g[y]['overall'][1:])]

df_go = pd.DataFrame(table_go)
df_gr = pd.DataFrame(table_gr)
df_gc = pd.DataFrame(table_gc)

# Add the years of maximum / minimum entropy to each table.
# A missing cell must not take part in the comparison: entropy(None) evaluates to 0, which would
# otherwise make a year without any data win the "Min Entropy year" column.
gender_year_cols = list(years)
for frame in (df_go, df_gr, df_gc):
    max_years = []
    min_years = []
    for cells in frame[gender_year_cols].itertuples(index=False, name=None):
        scores = {yr: entropy(params) for yr, params in zip(gender_year_cols, cells) if isinstance(params, tuple)}
        # max()/min() return the first year on ties, like idxmax()/idxmin() did before.
        max_years.append(max(scores, key=scores.get) if scores else None)
        min_years.append(min(scores, key=scores.get) if scores else None)
    frame['Max Entropy year'] = max_years
    frame['Min Entropy year'] = min_years


In [12]:
## Tuple layout = parameters of a multinomial distribution:
## (no. of trials, probability of male, probability of female, (if present) probability of third gender)

print(
    tabulate(df_go, headers='keys', tablefmt='grid'),
    "Table 1: parameters to define multinomial distribution for overall data for yearly genderwise distribution",
    sep='\n',
)


+----+---------+-----------------------+---------------------+---------------------+------------------------+------------------------+------------------------+-----------------------+---------------------+----------------------------+--------------------+--------------------+
|    |         | 2013                  | 2014                | 2016                | 2017                   | 2018                   | 2019                   | 2020                  | 2021                | 2022                       |   Max Entropy year |   Min Entropy year |
|  0 | Overall | (7648432, 0.59, 0.41) | (8002576, 0.6, 0.4) | (8789423, 0.6, 0.4) | (14512347, 0.62, 0.38) | (10547939, 0.59, 0.41) | (10919515, 0.59, 0.41) | (2741748, 0.58, 0.42) | (1525972, 0.6, 0.4) | (6430142, 0.42, 0.58, 0.0) |               2020 |               2017 |
+----+---------+-----------------------+---------------------+---------------------+------------------------+------------------------+------------------------+----------

In [13]:
# Region-wise gender parameters, followed by the table caption.
print(
    tabulate(df_gr, headers='keys', tablefmt='grid'),
    "Table 2: parameters to define multinomial distribution for gender data distribution for each continent for each year",
    sep='\n',
)

+----+-------------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+----------------------+----------------------+----------------------------+--------------------+--------------------+
|    | Continent               | 2013                  | 2014                  | 2016                  | 2017                  | 2018                  | 2019                  | 2020                 | 2021                 | 2022                       |   Max Entropy year |   Min Entropy year |
|  0 | North America           | (1387468, 0.55, 0.45) | (1494930, 0.55, 0.45) | (1614178, 0.55, 0.45) | (1712358, 0.54, 0.46) | (1807718, 0.54, 0.46) | (1863893, 0.54, 0.46) | (516960, 0.55, 0.45) | (510299, 0.54, 0.46) | (1692660, 0.47, 0.53, 0.0) |               2022 |               2013 |
+----+-------------------------+-----------------------+-----------------------+-----------------------+--------------

In [14]:
# Country-wise gender parameters, followed by the table caption.
print(
    tabulate(df_gc, headers='keys', tablefmt='grid'),
    "Table 3: parameters to define multinomial distribution for gender data distribution for each countries for each year",
    sep='\n',
)

+----+----------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+----------------------+----------------------+----------------------------+--------------------+--------------------+
|    | Country        | 2013                  | 2014                  | 2016                  | 2017                  | 2018                  | 2019                  | 2020                 | 2021                 | 2022                       |   Max Entropy year |   Min Entropy year |
|  0 | Afghanistan    | (115569, 0.74, 0.26)  | (114406, 0.74, 0.26)  | (123330, 0.71, 0.29)  | (149176, 0.69, 0.31)  | (153905, 0.69, 0.31)  | (124120, 0.68, 0.32)  | (47561, 0.66, 0.34)  | (36451, 0.71, 0.29)  | (1617, 0.18, 0.82, 0.0)    |               2020 |               2022 |
+----+----------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+--

Working on nationality-wise FTAs according to age group

In [15]:
data_a = {}   # 'dfYY' -> age-group sheet for that year, as read from Excel
cubes_a = {}  # 'cubeYY' -> list of per-region 2D lists with shares converted to fractions

for y in years:
    # Forward slashes keep the path usable on every platform (a backslash-separated path only
    # resolves on Windows) and match how the gender-wise sheets are located.
    data_a[f'df{y[-2:]}'] = pd.read_excel(f"data/TourismData-{y}/DISTRIBUTION OF NATIONALITY-WISE FTAs IN INDIA ACCORDING TO AGE GROUP.xlsx")

for y in years:
    df = data_a[f'df{y[-2:]}']
    region_indices = df[df['Country'].isin(regions)].index.tolist()
    split_data = []

    # Cut the sheet into one 2D list per region: header row first, then its country rows.
    for i in range(len(region_indices)):
        start_idx = region_indices[i]
        end_idx = region_indices[i + 1] if i + 1 < len(region_indices) else len(df)
        section = df.iloc[start_idx:end_idx].values.tolist()
        split_data.append(section)

    # Fill each region header with the total count and the count-weighted percentage
    # of each of the 7 age groups (columns 2..8 of a row).
    for i in range(len(split_data)):
        chunk = split_data[i]
        total = 0
        grp = 7*[0]
        for j in range(1,len(chunk)):
            total+=chunk[j][1]
            for k in range(7):
                grp[k]+=chunk[j][2+k]*chunk[j][1]
        chunk[0][1] = int(total)

        for k in range(7):
            chunk[0][2+k] = grp[k]/total

    # Turn every percentage (all columns after the count) into a fraction rounded to 2 decimals.
    for i in range(len(split_data)):
        for j in range(len(split_data[i])):
            for k in range(2,len(split_data[i][j])):
                split_data[i][j][k] = round(split_data[i][j][k]/100,2)

    cubes_a[f'cube{y[-2:]}'] = split_data

In [16]:
# For every year, collect the region rows, the country rows (without the 'Others' catch-all rows)
# and an overall row whose 7 age-group shares are count-weighted averages of the region shares.
ans_a = {}
for y in years:
    cube = cubes_a[f'cube{y[-2:]}']

    region_rows = [chunk[0] for chunk in cube]
    country_rows = [row for chunk in cube for row in chunk[1:] if row[0] != 'Others']
    ans = {'countries': country_rows, 'regions': region_rows}

    total_overall = sum(row[1] for row in region_rows)
    grp = [sum(row[k+2]*row[1] for row in region_rows) for k in range(7)]

    ans['overall'] = ['Overall', total_overall] + [round(weighted/total_overall,2) for weighted in grp]

    ans_a[y] = ans

In [17]:
table_ac = {} # a-> age; c-> countrywise
table_ar = {} # a-> age; r-> region-wise
table_ao = {} # a-> age; o-> overall
table_ac['Country'] = countries
table_ar['Continents'] = regions
table_ao[' '] = ['Overall']

# One column per year; each cell holds the parameter tuple returned by find_data
# (None when that country/region has no row for the year).
for y in years:
    table_ac[y] = [find_data(ans_a, y, 'countries', c) for c in countries]
    table_ar[y] = [find_data(ans_a, y, 'regions', r) for r in regions]
    table_ao[y] = [tuple(ans_a[y]['overall'][1:])]

df_ao = pd.DataFrame(table_ao)
df_ar = pd.DataFrame(table_ar)
df_ac = pd.DataFrame(table_ac)

# Add the years of maximum / minimum entropy to each table.
# A missing cell must not take part in the comparison: entropy(None) evaluates to 0, which would
# otherwise make a year without any data win the "Min Entropy year" column.
age_year_cols = list(years)
for frame in (df_ao, df_ar, df_ac):
    max_years = []
    min_years = []
    for cells in frame[age_year_cols].itertuples(index=False, name=None):
        scores = {yr: entropy(params) for yr, params in zip(age_year_cols, cells) if isinstance(params, tuple)}
        # max()/min() return the first year on ties, like idxmax()/idxmin() did before.
        max_years.append(max(scores, key=scores.get) if scores else None)
        min_years.append(min(scores, key=scores.get) if scores else None)
    frame['Max Entropy year'] = max_years
    frame['Min Entropy year'] = min_years


In [18]:
## Tuple layout = parameters of a multinomial distribution:
## (no. of trials, p(grp1), p(grp2), p(grp3), p(grp4), p(grp5), p(grp6), p(grp7))
##   p(grp1) -> age in range 0-14
##   p(grp2) -> age in range 15-24
##   p(grp3) -> age in range 25-34
##   p(grp4) -> age in range 35-44
##   p(grp5) -> age in range 45-54
##   p(grp6) -> age in range 55-64
##   p(grp7) -> age 65 and above

print(
    tabulate(df_ao, headers='keys', tablefmt='grid'),
    "Table 4: parameters to define multinomial distribution for overall data for yearly age-wise distribution",
    sep='\n',
)



+----+---------+----------------------------------------------------+---------------------------------------------------+----------------------------------------------------+-----------------------------------------------------+-----------------------------------------------------+-----------------------------------------------------+---------------------------------------------------+-----------------------------------------------------+----------------------------------------------------+--------------------+--------------------+
|    |         | 2013                                               | 2014                                              | 2016                                               | 2017                                                | 2018                                                | 2019                                                | 2020                                              | 2021                                                | 2022               

In [19]:
# Region-wise age parameters, the table caption, then trailing blank lines.
print(
    tabulate(df_ar, headers='keys', tablefmt='grid'),
    "Table 5: parameters to define multinomial distribution for age data distribution for each continent for each year",
    '\n\n',
    sep='\n',
)

+----+-------------------------+----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+-----------------------------------------------------+-----------------------------------------------------+-----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+-----------------------------------------------------+--------------------+--------------------+
|    | Continents              | 2013                                               | 2014                                               | 2016                                               | 2017                                                | 2018                                                | 2019                                                | 2020                                               | 2021                                 

In [20]:
# Country-wise age parameters, followed by the table caption.
print(
    tabulate(df_ac, headers='keys', tablefmt='grid'),
    "Table 6: parameters to define multinomial distribution for age data distribution for each countries for each year",
    sep='\n',
)

+----+----------------+----------------------------------------------------+-----------------------------------------------------+-----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+--------------------+--------------------+
|    | Country        | 2013                                               | 2014                                                | 2016                                                | 2017                                               | 2018                                               | 2019                                               | 2020                                               | 2021                                               | 2022 

In [21]:
# print(ans_a['2013']['countries'])
# print(ans_a['2013']['regions'])

Working on yearly FTA data according to purpose of visit

The data for year 2014 is not provided.

For year 2022 the "students" column and "unknown" columns are added to "other" column.<br>
For year 2021 the "students" column is merged with "other".<br>
For year 2013 the "students" column is merged with "other".<br>

The provided 2017 file for "DISTRIBUTION OF NATIONALITY-WISE FTAs IN INDIA ACCORDING TO PURPOSE" was identical to the 2019 file, so the correct 2017 data was taken from the tourism.gov.in publication titled "India Tourism Statistics 2018" (pages 51-53).


In the data for "STATE UT-WISE DOMESTIC AND FOREIGN TOURIST VISITS" UT-Leh Ladakh has been added for years 2019,2020,2021 but not in the previous years, so some cells are empty

In [22]:
data_r = {}   # 'dfYY' -> purpose-of-visit sheet for that year, as read from Excel
cubes_r = {}  # 'cubeYY' -> list of per-region 2D lists with shares converted to fractions

for y in years:
    if(y == '2014'):  # no purpose-wise sheet is available for 2014
        continue
    # Forward slashes keep the path usable on every platform (a backslash-separated path only
    # resolves on Windows) and match how the gender-wise sheets are located.
    data_r[f'df{y[-2:]}'] = pd.read_excel(f"data/TourismData-{y}/DISTRIBUTION OF NATIONALITY-WISE FTAs IN INDIA ACCORDING TO PURPOSE.xlsx")

for y in years:
    if(y == '2014'):
        continue
    df = data_r[f'df{y[-2:]}']
    region_indices = df[df['Country of Nationality'].isin(regions)].index.tolist()
    split_data = []

    # Cut the sheet into one 2D list per region: header row first, then its country rows.
    for i in range(len(region_indices)):
        start_idx = region_indices[i]
        end_idx = region_indices[i + 1] if i + 1 < len(region_indices) else len(df)
        section = df.iloc[start_idx:end_idx].values.tolist()
        split_data.append(section)

    # Fill each region header with the total count and the count-weighted percentage of every
    # purpose column; the number of purpose columns is taken from the header row's width.
    for i in range(len(split_data)):
        chunk = split_data[i]
        size = len(chunk[0])
        total = 0
        grp = (size-2)*[0]
        for j in range(1,len(chunk)):
            total+=chunk[j][1]
            for k in range(size-2):
                grp[k]+=chunk[j][2+k]*chunk[j][1]
        chunk[0][1] = int(total)

        for k in range(size-2):
            chunk[0][2+k] = grp[k]/total

    # Turn every percentage (all columns after the count) into a fraction rounded to 2 decimals.
    for i in range(len(split_data)):
        for j in range(len(split_data[i])):
            for k in range(2,len(split_data[i][j])):
                split_data[i][j][k] = round(split_data[i][j][k]/100,2)

    cubes_r[f'cube{y[-2:]}'] = split_data

In [23]:
# For every year except 2014, collect the region rows, the country rows (without the 'Others'
# catch-all rows) and an overall row of count-weighted purpose shares.
ans_r = {}
for y in years:
    if y == '2014':
        continue
    cube = cubes_r[f'cube{y[-2:]}']

    region_rows = [chunk[0] for chunk in cube]
    country_rows = [row for chunk in cube for row in chunk[1:] if row[0] != 'Others']
    ans = {'countries': country_rows, 'regions': region_rows}

    # Number of purpose columns = width of the first region header minus name and count.
    size = len(cube[0][0])
    total_overall = sum(row[1] for row in region_rows)
    grp = [sum(row[k+2]*row[1] for row in region_rows) for k in range(size-2)]

    ans['overall'] = ['Overall', total_overall] + [round(weighted/total_overall,2) for weighted in grp]

    ans_r[y] = ans

In [24]:
# Persist ans_r in the PickleShare store under a fixed key so it can be read back later
# (intended for question 10).
db['q3/purpose-wise FTA data'] = ans_r

In [25]:
table_rc = {} # r-> reason; c-> country
table_rr = {} # r-> reason; r-> region
table_ro = {} # r-> reason; o-> overall
table_rc['Country'] = countries
table_rr['Continents'] = regions
table_ro[' '] = ['Overall']

# One column per year (2014 is skipped, as in the cells that build ans_r); each cell holds the
# parameter tuple returned by find_data, or None when that country/region has no row for the year.
for y in years:
    if(y == '2014'):
        continue
    table_rc[y] = [find_data(ans_r, y, 'countries', c) for c in countries]
    table_rr[y] = [find_data(ans_r, y, 'regions', r) for r in regions]
    table_ro[y] = [tuple(ans_r[y]['overall'][1:])]

df_ro = pd.DataFrame(table_ro)
df_rr = pd.DataFrame(table_rr)
df_rc = pd.DataFrame(table_rc)

# Add the years of maximum / minimum entropy to each table.
# A missing cell (e.g. a country with no 2013 purpose row) must not take part in the comparison:
# entropy(None) evaluates to 0, which would otherwise make a year without any data win the
# "Min Entropy year" column.
purpose_year_cols = [yr for yr in years if yr != '2014']
for frame in (df_ro, df_rr, df_rc):
    max_years = []
    min_years = []
    for cells in frame[purpose_year_cols].itertuples(index=False, name=None):
        scores = {yr: entropy(params) for yr, params in zip(purpose_year_cols, cells) if isinstance(params, tuple)}
        # max()/min() return the first year on ties, like idxmax()/idxmin() did before.
        max_years.append(max(scores, key=scores.get) if scores else None)
        min_years.append(min(scores, key=scores.get) if scores else None)
    frame['Max Entropy year'] = max_years
    frame['Min Entropy year'] = min_years

In [26]:
'''
Data doesn't exist for year 2014
Data for 2013 was also not complete
PIO -> People of Indian Origin (Also called Indian diaspora)

Data format for each year:
parameters -> (#trials, p(Business purpose), p(Recreation purpose), p(Medical purpose), p(NRI and PIO visits), p(Other purposes))
'''

# Overall purpose-of-visit parameters, followed by the table caption.
print(
    tabulate(df_ro, headers='keys', tablefmt='grid'),
    "Table 7: parameters to define multinomial distribution for overall data for yearly purpose-wise distribution",
    sep='\n',
)


+----+---------+----------------------------------------+----------------------------------------+------------------------------------------+------------------------------------------+----------------------------------------+----------------------------------------+---------------------------------------+----------------------------------------+--------------------+--------------------+
|    |         | 2013                                   | 2016                                   | 2017                                     | 2018                                     | 2019                                   | 2020                                   | 2021                                  | 2022                                   |   Max Entropy year |   Min Entropy year |
|  0 | Overall | (3317974, 0.17, 0.52, 0.0, 0.24, 0.06) | (8789423, 0.16, 0.6, 0.05, 0.15, 0.04) | (10012347, 0.14, 0.59, 0.05, 0.19, 0.03) | (10547939, 0.17, 0.63, 0.06, 0.14, 0.02) | (6088369, 0.13, 0.5, 0.03, 0.17, 0.

In [27]:
# Leading blank lines, the region-wise purpose parameters, then the table caption.
print(
    '\n\n',
    tabulate(df_rr, headers='keys', tablefmt='grid'),
    "Table 8: parameters to define multinomial distribution for purpose data distribution for each continent for each year",
    sep='\n',
)





+----+-------------------------+---------------------------------------+-----------------------------------------+-----------------------------------------+-----------------------------------------+----------------------------------------+---------------------------------------+----------------------------------------+----------------------------------------+--------------------+--------------------+
|    | Continents              | 2013                                  | 2016                                    | 2017                                    | 2018                                    | 2019                                   | 2020                                  | 2021                                   | 2022                                   |   Max Entropy year |   Min Entropy year |
|  0 | North America           | (1387468, 0.11, 0.4, 0.0, 0.43, 0.06) | (1614178, 0.1, 0.52, 0.0, 0.34, 0.04)   | (1712358, 0.09, 0.43, 0.0, 0.44, 0.03)  | (1807718, 0.16, 0.54, 0.01, 0.29

In [28]:
# Leading blank lines, the country-wise purpose parameters, then the table caption.
print(
    '\n\n',
    tabulate(df_rc, headers='keys', tablefmt='grid'),
    "Table 9: parameters to define multinomial distribution for purpose data distribution for each countries for each year",
    sep='\n',
)




+----+----------------+----------------------------------------+-----------------------------------------+----------------------------------------+-----------------------------------------+----------------------------------------+----------------------------------------+---------------------------------------+-----------------------------------------+--------------------+--------------------+
|    | Country        | 2013                                   | 2016                                    | 2017                                   | 2018                                    | 2019                                   | 2020                                   | 2021                                  | 2022                                    |   Max Entropy year |   Min Entropy year |
|  0 | Afghanistan    |                                        | (123330, 0.03, 0.39, 0.43, 0.1, 0.06)   | (149176, 0.04, 0.44, 0.37, 0.09, 0.06) | (153905, 0.08, 0.49, 0.3, 0.07, 0.06)   | (47561, 0.09, 0

In [29]:
# Scratch check: rebuild the 2013 gender summary by hand and print its three parts.
cube13 = cubes_g['cube13']

ans = {
    'countries': [row for chunk in cube13 for row in chunk[1:] if row[0] != 'Others'],
    'regions': [chunk[0] for chunk in cube13],
}

# Overall shares are the count-weighted averages of the region shares.
total_overall = sum(row[1] for row in ans['regions'])
males_overall = sum(row[2]*row[1] for row in ans['regions'])
females_overall = sum(row[3]*row[1] for row in ans['regions'])
ans['overall'] = [
    'Overall',
    total_overall,
    round(males_overall/total_overall,2),
    round(females_overall/total_overall,2),
]

for part in ('regions', 'countries', 'overall'):
    print(ans[part])

[['North America', 1387468, 0.55, 0.45], ['Central & South America', 69926, 0.52, 0.48], ['Western Europe', 1860580, 0.57, 0.43], ['Eastern Europe', 422278, 0.45, 0.55], ['Africa', 280754, 0.56, 0.44], ['West Asia', 413678, 0.65, 0.35], ['South Asia', 1694857, 0.68, 0.32], ['South East Asia', 685805, 0.52, 0.48], ['East Asia', 546792, 0.66, 0.34], ['Australasia', 286294, 0.58, 0.42]]
[['Canada', 268485.0, 0.54, 0.46], ['USA', 1118983.0, 0.56, 0.44], ['Argentina', 9731.0, 0.46, 0.54], ['Brazil', 19563.0, 0.54, 0.46], ['Mexico', 13978.0, 0.51, 0.49], ['Austria', 34360.0, 0.57, 0.43], ['Belgium', 37441.0, 0.6, 0.4], ['Denmark', 26775.0, 0.57, 0.43], ['Finland', 18765.0, 0.55, 0.45], ['France', 246101.0, 0.54, 0.46], ['Germany', 239106.0, 0.6, 0.4], ['Greece', 8300.0, 0.66, 0.34], ['Ireland', 28978.0, 0.55, 0.46], ['Italy', 91589.0, 0.63, 0.37], ['Netherlands', 67747.0, 0.61, 0.39], ['Norway', 19690.0, 0.55, 0.45], ['Portugal', 36156.0, 0.66, 0.34], ['Spain', 66463.0, 0.54, 0.46], ['Sweden

In [30]:
# Scratch check: walk the 2017 gender sheet through the split / aggregate / scale steps,
# printing intermediate results along the way.
df = data_g['df17']
print(df)
region_indices = df[df['Country'].isin(regions)].index.tolist()

# Each region header row opens a segment that ends at the next header (or at the end of the sheet);
# every segment is converted to a list of lists (2D list).
segment_edges = region_indices + [len(df)]
split_data = [
    df.iloc[first:stop].values.tolist()
    for first, stop in zip(segment_edges, segment_edges[1:])
]

print(split_data[8])

# Fill the header row of every segment with the total and the count-weighted male / female percentages.
for chunk in split_data:
    members = chunk[1:]
    total = sum(row[1] for row in members)
    males = sum(row[2]*row[1] for row in members)
    females = sum(row[3]*row[1] for row in members)
    chunk[0][1] = total
    chunk[0][2] = round(males/total,2)
    chunk[0][3] = round(females/total,2)

for chunk in split_data:
    for row in chunk:
        row[2] = round(row[2]/100,2) ## male percentage -> fraction (MLE estimate of probability of male tourist)
        row[3] = round(row[3]/100,2) ## female percentage -> fraction (MLE estimate of probability of female tourist)

print(split_data)

                    Country      Total  Male  Female
0             North America        NaN   NaN     NaN
1                    Canada   335439.0  53.0    47.0
2                       USA  1376919.0  54.8    45.2
3   Central & South America        NaN   NaN     NaN
4                 Argentina    14875.0  43.2    56.8
..                      ...        ...   ...     ...
76                   Others     4107.0  54.8    45.2
77              Australasia        NaN   NaN     NaN
78                Australia   324243.0  56.8    43.2
79              New Zealand    56597.0  54.3    45.7
80                   Others     5219.0  52.5    47.5

[81 rows x 4 columns]
[['East Asia', nan, nan, nan], ['China (Main)', 247235.0, 63.7, 36.3], ['Japan', 222527.0, 73.1, 26.9], ['Rep. of Korea', 142383.0, 60.7, 39.3], ['China (Taiwan)', 47043.0, 50.9, 49.1], ['Others', 4107.0, 54.8, 45.2]]
[[['North America', 1712358.0, 0.54, 0.46], ['Canada', 335439.0, 0.53, 0.47], ['USA', 1376919.0, 0.55, 0.45]], [['Central &

In [31]:
# Show the first region segment of split_data as left by the preceding cell.
print(split_data[0])

[['North America', 1712358.0, 0.54, 0.46], ['Canada', 335439.0, 0.53, 0.47], ['USA', 1376919.0, 0.55, 0.45]]
