In [19]:
import pandas as pd
import numpy as np
from tabulate import tabulate
import math

<i>Question 3</i>

For every year, model the data as a multinomial probability distribution for the following features: (1) departure
mode, (2) departure port, and (3) state-wise visit of domestic versus foreign tourists. <br>
Find the years in which the entropy of each of the three (separate) distributions, computed over the overall count, is at its maximum and at its minimum.<br>

<h3>Part 1</h3>

In [20]:
# Years analysed in this notebook, kept as strings (note: 2015 is not in the list).
years = [str(year) for year in (2013, 2014, 2016, 2017, 2018, 2019, 2020, 2021, 2022)]

In [21]:
def format_number(num):
    """Return ``num`` as a string with ``,`` as the thousands separator.

    Pre-formatting the counts as text keeps ``tabulate`` from rendering
    large numbers in scientific notation.
    """
    return format(num, ",")

In [22]:
def entropy(prob_list):
    """Return the Shannon entropy, in bits, of the probabilities in ``prob_list``.

    Parameters
    ----------
    prob_list : iterable of numbers, or a scalar / None
        Only entries ``p`` with ``0 < p <= 1`` contribute ``-p * log2(p)``.
        Everything else is skipped: zeros, NaN, and values greater than 1
        (for example a trial count stored alongside the probabilities).

    Returns
    -------
    float or int
        The accumulated entropy; ``0`` when nothing contributes, including
        when ``prob_list`` is ``None`` or a bare number such as the NaN
        placeholder of a missing table cell.
    """
    # A bare number (including float subclasses such as numpy.float64) or
    # None is not a distribution; report zero entropy instead of failing
    # while trying to iterate over it.
    if prob_list is None or isinstance(prob_list, (int, float)):
        return 0

    retval = 0
    # Iterate directly instead of truth-testing the container first, so that
    # NumPy arrays (whose truth value is ambiguous) are accepted as well.
    for prob in prob_list:
        if 0 < prob <= 1:
            retval -= prob * math.log2(prob)
    return retval

In [23]:
# Read each year's mode-wise departure sheet, keyed by year.
# Forward slashes keep the relative path valid on Windows and on POSIX
# systems alike (backslashes are only path separators on Windows).
data1 = {
    y: pd.read_excel(rf"data/TourismData-{y}/MODE WISE DISTRIBUTION OF INDIAN NATIONALS’ DEPARTURES FROM INDIA.xlsx")
    for y in years
}

# Reshape every sheet into a single row labelled by its year, with one column
# per 'MODE' value, and stack the rows with one concat call instead of
# growing a DataFrame inside the loop.
mode_rows = []
for year, df in data1.items():
    row = df.set_index('MODE').T
    row.index = [year]  # label the row with its year
    mode_rows.append(row)
combined_df1 = pd.concat(mode_rows)
combined_df1.columns.name = None
combined_df1['Total'] = combined_df1.sum(axis=1)  # travellers summed over all modes

# Multinomial parameters per year: number of trials plus one probability per mode.
answer1 = pd.DataFrame({
    'Trials': np.vectorize(format_number)(combined_df1['Total'].to_numpy()),
    'p(air)': combined_df1['AIR'] / combined_df1['Total'],
    'p(land)': combined_df1['LAND'] / combined_df1['Total'],
    'p(sea)': combined_df1['SEA'] / combined_df1['Total']
})

In [24]:
np_arr = answer1.to_numpy()
# Column 0 of each row is the formatted trial count; the remaining entries are
# the probabilities. Compute the entropy of every row once and reuse the list.
mode_entropies = [entropy(tuple(row[1:])) for row in np_arr]
max_entropy_year = years[np.argmax(mode_entropies)]
min_entropy_year = years[np.argmin(mode_entropies)]

In [25]:
# Show the per-year parameter table as a text grid, followed by its caption.
table1_text = tabulate(answer1, headers='keys', tablefmt='grid')
print(table1_text)
print("\nTable 1: Multinomial distribution parameters for mode of transport for each year\n")

# Report the years with the highest and the lowest entropy.
print(f"Year with maximum entropy in the multinomial distribution : {max_entropy_year}")
print(f"Year with minimum entropy in the multinomial distribution : {min_entropy_year}")

+------+------------+----------+------------+------------+
|      | Trials     |   p(air) |    p(land) |     p(sea) |
| 2013 | 16,626,316 | 0.9869   | 0.0116     | 0.00149997 |
+------+------------+----------+------------+------------+
| 2014 | 18,332,319 | 0.986442 | 0.0117799  | 0.00177833 |
+------+------------+----------+------------+------------+
| 2016 | 21,871,995 | 0.9859   | 0.0119069  | 0.00219322 |
+------+------------+----------+------------+------------+
| 2017 | 23,942,957 | 0.985028 | 0.0126631  | 0.00230857 |
+------+------------+----------+------------+------------+
| 2018 | 26,296,484 | 0.984883 | 0.0127365  | 0.00238017 |
+------+------------+----------+------------+------------+
| 2019 | 26,915,034 | 0.981684 | 0.0155409  | 0.00277555 |
+------+------------+----------+------------+------------+
| 2020 | 7,291,880  | 0.973098 | 0.0188153  | 0.00808625 |
+------+------------+----------+------------+------------+
| 2021 | 8,551,309  | 0.984581 | 0.00767964 | 0.00773952

<h3>Part 2</h3>

In [26]:
# Read each year's port-wise departure sheet, keyed by year.
# Forward slashes keep the relative path valid on Windows and on POSIX
# systems alike (backslashes are only path separators on Windows).
data2 = {
    y: pd.read_excel(rf"data/TourismData-{y}/PORT-WISE DEPARTURES OF INDIAN NATIONALS FROM INDIA.xlsx")
    for y in years
}

# Reshape every sheet into a single row labelled by its year, with one column
# per 'Checkpost' value, and stack the rows with one concat call instead of
# growing a DataFrame inside the loop.
port_rows = []
for year, df in data2.items():
    row = df.set_index('Checkpost').T
    row.index = [year]  # label the row with its year
    port_rows.append(row)
combined_df2 = pd.concat(port_rows)
combined_df2.columns.name = None
combined_df2['Total'] = combined_df2.sum(axis=1)  # travellers summed over all ports

# Output column label -> source column in the combined sheet (display order).
port_columns = {
    'p(ahemdabad)': 'Ahmedabad (Airport)',
    'p(bengaluru)': 'Bengaluru (Airport)',
    'p(calicut)': 'Calicut (Airport)',
    'p(chennai)': 'Chennai (Airport)',
    'p(cochin)': 'Cochin (Airport)',
    'p(delhi)': 'Delhi (Airport)',
    'p(hyderabad)': 'Hyderabad (Airport)',
    'p(kolkata)': 'Kolkata (Airport)',
    'p(mumbai)': 'Mumbai (Airport)',
    'p(trivandrum)': 'Trivandrum (Airport)',
    'p(others)': 'Others',
}

# Multinomial parameters per year: number of trials plus one probability per port.
answer2 = pd.DataFrame({
    'Trials': np.vectorize(format_number)(combined_df2['Total'].to_numpy()),
    **{
        label: combined_df2[column] / combined_df2['Total']
        for label, column in port_columns.items()
    },
})


In [27]:
np_arr2 = answer2.to_numpy()
# Column 0 of each row is the formatted trial count; the remaining entries are
# the probabilities. Compute the entropy of every row once and reuse the list.
port_entropies = [entropy(tuple(row[1:])) for row in np_arr2]
max_entropy_year2 = years[np.argmax(port_entropies)]
min_entropy_year2 = years[np.argmin(port_entropies)]

In [28]:
# Show the per-year parameter table as a text grid, followed by its caption.
table2_text = tabulate(answer2, headers='keys', tablefmt='grid')
print(table2_text)
print("\nTable 2: Multinomial distribution parameters for departure port for each year\n")

# Report the years with the highest and the lowest entropy.
print(f"Year with maximum entropy in the multinomial distribution : {max_entropy_year2}")
print(f"Year with minimum entropy in the multinomial distribution : {min_entropy_year2}")

+------+------------+----------------+----------------+--------------+--------------+-------------+------------+----------------+--------------+-------------+-----------------+-------------+
|      | Trials     |   p(ahemdabad) |   p(bengaluru) |   p(calicut) |   p(chennai) |   p(cochin) |   p(delhi) |   p(hyderabad) |   p(kolkata) |   p(mumbai) |   p(trivandrum) |   p(others) |
| 2013 | 16,626,316 |      0.0220194 |      0.0512796 |    0.0668998 |    0.104082  |   0.0836525 |   0.211568 |      0.0556588 |    0.0338564 |    0.224594 |       0.0513983 |   0.0949912 |
+------+------------+----------------+----------------+--------------+--------------+-------------+------------+----------------+--------------+-------------+-----------------+-------------+
| 2014 | 18,332,319 |      0.0216519 |      0.0520151 |    0.0643    |    0.098371  |   0.0878635 |   0.207007 |      0.0582937 |    0.032272  |    0.226187 |       0.0504146 |   0.101624  |
+------+------------+----------------+-------

<h3>Part 3</h3>

In [29]:
# Read each year's state/UT-wise tourist-visit sheet, indexed by state/UT name.
# Forward slashes keep the relative path valid on Windows and on POSIX
# systems alike (backslashes are only path separators on Windows).
data3 = {}

for y in years:
    df = pd.read_excel(rf"data/TourismData-{y}/STATE UT-WISE DOMESTIC AND FOREIGN TOURIST VISITS.xlsx")
    df = df.set_index('State/UT')  # reassign rather than mutate in place
    df.index.name = None  # drop the index header so it is not shown in the table
    data3[y] = df



In [30]:
def create_tuple(row):
    """Build the multinomial parameters for one row.

    ``row`` must provide ``'Domestic'`` and ``'Foreign'`` counts. The result
    is ``(total, domestic_ratio, foreign_ratio)`` where ``total`` is the sum
    of the two counts and each ratio is the count's share of ``total``,
    rounded to three decimals.
    """
    domestic, foreign = row['Domestic'], row['Foreign']
    total = domestic + foreign
    return (total, round(domestic / total, 3), round(foreign / total, 3))

In [31]:
df_arr = []     # one single-column frame of display tuples per year
exact_arr = []  # one Series of unrounded (p(domestic), p(foreign)) tuples per year
for y in years:
    df = data3[y]
    # Display tuple (trials, p(domestic), p(foreign)); the probabilities in it
    # are rounded to 3 decimals by create_tuple, so they are shown but not
    # used for the entropy comparison below.
    df[y] = df.apply(create_tuple, axis=1)
    df_arr.append(df[[y]])

    # Unrounded probabilities: rounding before taking the entropy produces
    # artificial ties between years and can select the wrong extreme year.
    total = df['Domestic'] + df['Foreign']
    exact_arr.append(
        pd.Series(
            list(zip(df['Domestic'] / total, df['Foreign'] / total)),
            index=df.index,
            name=y,
            dtype=object,
        )
    )

combined_df3 = pd.concat(df_arr, axis=1)
exact_df = pd.concat(exact_arr, axis=1)

# For every state/UT pick the year whose distribution has the largest /
# smallest entropy. A state/UT with no entry for some year holds NaN there,
# which entropy() scores as 0.
combined_df3['Max Entropy year'] = exact_df.apply(lambda row: row.apply(entropy).idxmax(), axis=1)
combined_df3['Min Entropy year'] = exact_df.apply(lambda row: row.apply(entropy).idxmin(), axis=1)

In [32]:
print(tabulate(combined_df3, headers='keys', tablefmt='grid'))
# This is the third table of the notebook, so the caption is numbered 3.
print("\nTable 3: Multinomial distribution parameters for state-wise visit of domestic versus foreign tourists for each year")

# Each year cell holds the parameters as a tuple:
#   1st element - number of trials (domestic + foreign visits)
#   2nd element - probability of a tourist being domestic
#   3rd element - probability of a tourist being foreign
# The last two columns give the max-entropy and min-entropy year for each state/UT.


+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+--------------------------+---------------------------+--------------------+--------------------+
|                           | 2013                      | 2014                      | 2016                      | 2017                      | 2018                      | 2019                      | 2020                      | 2021                     | 2022                      |   Max Entropy year |   Min Entropy year |
| Andaman & Nicobar Islands | (258445, 0.943, 0.057)    | (302381, 0.943, 0.057)    | (400018, 0.961, 0.039)    | (487232, 0.969, 0.031)    | (513521, 0.97, 0.03)      | (521604, 0.969, 0.031)    | (196619, 0.972, 0.028)    | (127925, 0.987, 0.013)   | (2396.0, 0.981, 0.019)    |               2013 |               2021 |
+---------------------------+--