In [3]:
import os

import numpy as np
import pandas as pd
from tabulate import tabulate
from pickleshare import *

In [4]:
# pickleshare is used as a small on-disk store to temporarily keep some variables
# so they can be transferred to other notebooks.
# The store is opened at <current working directory>/SharedVars.
db = PickleShareDB(os.path.join(os.getcwd(),'SharedVars'))
# db.clear()  # presumably empties the shared store; left disabled on purpose

For every year, find the change in outgoing tourists, in absolute and percentage terms, relative to the previous year.<br>
Do this for every month and the year as a whole.<br>
Which years show the highest positive and negative changes?<br>
For each month, find the change from the previous month, across the entire dataset.<br>
Which months show the highest positive and negative changes?<br>

In [5]:
# Years for which a data folder exists, as strings; 2015 is deliberately absent.
years = [str(year) for year in range(2013, 2023) if year != 2015]

# Calendar months in order, spelled exactly as they are used for labelling.
months = (
    "January February March April May June "
    "July August September October November December"
).split()

In [6]:
def format_number(num):
    """Return ``num`` as text with a comma as thousands separator.

    The tables are rendered with tabulate; passing pre-formatted text such as
    ``"1,234,567"`` keeps large values from being shown in scientific notation.
    """
    return format(num, ",")

In [7]:
# Workbook name shared by every yearly folder (the curly apostrophe is part of the name).
DEPARTURES_FILE = "MONTH WISE NUMBER & PERCENTAGE SHARE OF INDIAN NATIONALS’ DEPARTURES FROM INDIA.xlsx"

data = []  # one single-column DataFrame per year
for y in years:
    # os.path.join produces the same path as before on Windows and a valid one on
    # POSIX systems, where a backslash is not a path separator.
    df = pd.read_excel(os.path.join("data", f"TourismData-{y}", DEPARTURES_FILE))
    df = df.set_index("Month")           # month names become the row labels
    df.index.name = None
    df.columns = [y]                     # the single remaining column is labelled with its year
    df.loc['Annual Total'] = df.sum()    # extra last row: total over all months of that year
    data.append(df)

# Side-by-side table: rows = months + 'Annual Total', columns = years.
merged_df = pd.concat(data, axis=1)
print(merged_df)


                  2013      2014      2016      2017      2018      2019  \
January        1424291   1518285   1866995   1962619   2238035   2309062   
February       1285813   1328360   1605765   1714399   1839947   1992487   
March          1287728   1463879   1731968   1846395   2099266   2203115   
April          1395879   1561150   1912473   2047568   2321632   2202018   
May            1591906   1744621   2128686   2312939   2521860   2384815   
June           1320676   1573931   1693204   1781817   2054526   2198582   
July           1415677   1327984   1701014   1938221   2150580   2180437   
August         1593409   1688554   2001816   2118235   2272537   2351701   
September      1640118   1697955   1942743   2232437   2434217   2354445   
October        1218333   1490000   1725665   1999069   2074788   2145065   
November       1438279   1388722   1721215   1856300   2087972   2147330   
December       1566117   1566514   1840451   2132958   2201124   2355977   
Annual Total

In [8]:
# NumPy array of the merged table: one row per index entry of merged_df
# (the months followed by 'Annual Total'), one column per year.
df_matrix = merged_df.to_numpy()

In [9]:
# Absolute change between neighbouring year columns, for every row of df_matrix:
# column k holds (column k+1) - (column k) of df_matrix.
# NOTE(review): 2015 is not among the loaded years, so the column that ends up
# labelled 2016 is a change relative to 2014 — confirm this is intended.
change = np.diff(df_matrix, axis=1)

In [10]:
# Change expressed as a percentage of the earlier year's value, rounded to 2 decimals.
# zip() pairs column k of `change` with column k of `df_matrix` (the baseline year);
# the surplus last column of `df_matrix` is simply never reached.
change_percent = np.array([
    [round(delta * 100 / baseline, 2) for delta, baseline in zip(delta_row, baseline_row)]
    for delta_row, baseline_row in zip(change, df_matrix)
])

In [11]:
def _extreme_years(values):
    """Return, per row of ``values``, the year label of the largest and of the smallest entry."""
    labels = np.array(years_)
    return labels[np.argmax(values, axis=1)], labels[np.argmin(values, axis=1)]


rows = months + ["Overall"]   # twelve months plus the annual-total row
# Every year except the first has a "change from the previous column" value.
# Derived from `years` so the two lists cannot drift apart.
years_ = years[1:]

# Display copies: every number is turned into comma-separated text.
change_df = pd.DataFrame(np.vectorize(format_number)(change), columns=years_, index=rows)
change_pdf = pd.DataFrame(np.vectorize(format_number)(change_percent), columns=years_, index=rows)

# Years with the largest increase / largest decrease, in absolute terms ...
max_positive_change, max_negative_change = _extreme_years(change)
max_min_change = np.column_stack((max_positive_change, max_negative_change))

# ... and in percentage terms.
max_positive_change_p, max_negative_change_p = _extreme_years(change_percent)
max_min_change_p = np.column_stack((max_positive_change_p, max_negative_change_p))

columns = ["year with maximum positive change", "year with maximum negative change"]

min_max_df = pd.DataFrame(max_min_change, columns=columns, index=rows)
min_max_pdf = pd.DataFrame(max_min_change_p, columns=columns, index=rows)

In [12]:
# Table 1: absolute change per month (rows) and year (columns), as a text grid.
print(tabulate(change_df, headers='keys', tablefmt='grid'))
print('Table 1: increase and decrease in no. of Departures from last year for each month and year as a whole.\n')


+-----------+-----------+-----------+-----------+-----------+----------+-------------+------------+------------+
|           | 2014      | 2016      | 2017      | 2018      | 2019     | 2020        | 2021       | 2022       |
| January   | 93,994    | 348,710   | 95,624    | 275,416   | 71,027   | 44,085      | -1,663,781 | 614,766    |
+-----------+-----------+-----------+-----------+-----------+----------+-------------+------------+------------+
| February  | 42,547    | 277,405   | 108,634   | 125,548   | 152,540  | -9,198      | -1,330,218 | 505,168    |
+-----------+-----------+-----------+-----------+-----------+----------+-------------+------------+------------+
| March     | 176,151   | 268,089   | 114,427   | 252,871   | 103,849  | -1,351,783  | -142,322   | 789,472    |
+-----------+-----------+-----------+-----------+-----------+----------+-------------+------------+------------+
| April     | 165,271   | 351,323   | 135,095   | 274,064   | -119,614 | -2,195,485  | 628,130  

In [13]:
# Table 2: per row, the year with the largest rise and the year with the largest drop
# (absolute numbers). Grid and caption are written by a single print call.
print(
    tabulate(min_max_df, headers='keys', tablefmt='grid'),
    'Table 2: Years which saw maximum increase and decrease in Departure compared to previous year for each month and year as a whole.\n',
    sep='\n',
)

+-----------+-------------------------------------+-------------------------------------+
|           |   year with maximum positive change |   year with maximum negative change |
| January   |                                2022 |                                2021 |
+-----------+-------------------------------------+-------------------------------------+
| February  |                                2022 |                                2021 |
+-----------+-------------------------------------+-------------------------------------+
| March     |                                2022 |                                2020 |
+-----------+-------------------------------------+-------------------------------------+
| April     |                                2022 |                                2020 |
+-----------+-------------------------------------+-------------------------------------+
| May       |                                2022 |                                2020 |
+---------

In [14]:
# Table 3: percentage change per month (rows) and year (columns), as a text grid.
print(tabulate(change_pdf, headers='keys', tablefmt='grid'))
print('Table 3: percent increase and decrease in no. of departures from previous year for each month and year as a whole.\n')

+-----------+--------+--------+--------+--------+--------+--------+----------+----------+
|           |   2014 |   2016 |   2017 |   2018 |   2019 |   2020 | 2021     | 2022     |
| January   |   6.6  |  22.97 |   5.12 |  14.03 |   3.17 |   1.91 | -70.7    | 89.18    |
+-----------+--------+--------+--------+--------+--------+--------+----------+----------+
| February  |   3.31 |  20.88 |   6.77 |   7.32 |   8.29 |  -0.46 | -67.07   | 77.35    |
+-----------+--------+--------+--------+--------+--------+--------+----------+----------+
| March     |  13.68 |  18.31 |   6.61 |  13.7  |   4.95 | -61.36 | -16.72   | 111.35   |
+-----------+--------+--------+--------+--------+--------+--------+----------+----------+
| April     |  11.84 |  22.5  |   7.06 |  13.38 |  -5.15 | -99.7  | 9,614.73 | 146.52   |
+-----------+--------+--------+--------+--------+--------+--------+----------+----------+
| May       |   9.59 |  22.01 |   8.66 |   9.03 |  -5.43 | -99.04 | 575.95   | 1,148.88 |
+---------

In [15]:
# Table 4: per row, the year with the largest rise and the year with the largest drop
# (percentage terms). Grid and caption are written by a single print call.
print(
    tabulate(min_max_pdf, headers='keys', tablefmt='grid'),
    'Table 4: Years which saw maximum percent increase and decrease in departures compared to previous year for each month and year as a whole.\n',
    sep='\n',
)

+-----------+-------------------------------------+-------------------------------------+
|           |   year with maximum positive change |   year with maximum negative change |
| January   |                                2022 |                                2021 |
+-----------+-------------------------------------+-------------------------------------+
| February  |                                2022 |                                2021 |
+-----------+-------------------------------------+-------------------------------------+
| March     |                                2022 |                                2020 |
+-----------+-------------------------------------+-------------------------------------+
| April     |                                2021 |                                2020 |
+-----------+-------------------------------------+-------------------------------------+
| May       |                                2022 |                                2020 |
+---------

In [16]:
# Monthly rows only (the trailing annual-total row is dropped), with years on the
# first axis so that flattening walks the data chronologically:
# Jan..Dec of the first year, then Jan..Dec of the next year, and so on.
df_matrix2 = df_matrix[:-1, :].T
continous_data = df_matrix2.flatten()
assert continous_data.size == 12 * len(years), "expected 12 monthly values per year"

# Month-over-month change; the very first month has no predecessor, so its change is 0.
continous_diff = np.diff(continous_data, prepend=continous_data[0])

# A January that follows a missing year (2016, because there is no 2015 data) has no
# real previous month either. Those positions are derived from `years` instead of
# being hard-coded, and their change is set to 0.
gap_januaries = np.array(
    [12 * k for k in range(1, len(years)) if int(years[k]) - int(years[k - 1]) != 1],
    dtype=int,
)
continous_diff[gap_januaries] = 0

max_pos = np.argmax(continous_diff)
min_pos = np.argmin(continous_diff)

# position -> (year, month): 12 consecutive values per year
print(f"Maximum positive change occurred in {months[max_pos % 12]} of {years[max_pos // 12]}.")
print(f"Maximum negative change occurred in {months[min_pos % 12]} of {years[min_pos // 12]}")

Maximum positive change occurred in August of 2021.
Maximum negative change occurred in March of 2020


In [17]:
## Store the flattened monthly departure series (continous_data) in the shared
## pickleshare database under the key below, for later use.

db['q4/monthly departure data'] = continous_data

In [18]:
# Work on a float COPY laid out as rows = months, columns = years.
# reshape/transpose alone return views, so writing into them would silently change
# `continous_diff`; an integer array would also truncate the average written below.
df_diff = continous_diff.astype(float).reshape(-1, 12).T

# January has no usable "previous month" in the first year and in any year that
# follows a gap in `years` (2016, because 2015 is missing). These columns are
# identified by position rather than by testing for a zero value.
missing_january = [0] + [k for k in range(1, len(years)) if int(years[k]) - int(years[k - 1]) != 1]

# Average January change over the years where the previous December is available ...
january_avg_diff = np.mean(np.delete(df_diff[0], missing_january))
# ... used to fill the January slots that have no real value.
df_diff[0, missing_january] = january_avg_diff

# Average increase or decrease of each month over all years.
mean_diff = df_diff.mean(axis=1)
median_diff = mean_diff  # historical name kept for compatibility; the values are means
print(f"On average month of {months[np.argmax(mean_diff)]} saw maximum positive change in a year")
print(f"On average month of {months[np.argmin(mean_diff)]} saw maximum negative change in a year")

On average month of August saw maximum positive change in a year
On average month of February saw maximum negative change in a year
