In [45]:
import pandas as pd
import numpy as np
from scipy.interpolate import make_interp_spline
import statsmodels.api as sm
from scipy.interpolate import interp1d
from statsmodels.nonparametric.smoothers_lowess import lowess
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

<i>Question 9</i>

Impute (or interpolate) data for the following years: 2018, 2020, 2021. When you
are imputing, assume that data for all the other years are available, except this.<br>
Use at least 3 different imputation techniques.<br>
Impute for the following features:<br> (1) Purpose of visit for each continent,<br> (2) Total number of
incoming tourists per continent,<br> (3) Number of domestic toursists per state,<br> and measure the
accuracy for the 3 different techniques

Note:
<p>
The data for year 2014 is not provided.<br>
For year 2022 the "students" column and "unknown" columns are added to "other" column.<br>
For year 2021 the "students" column is merged with "other".<br>
For year 2013 the "students" column is merged with "other".<br>
</p><p>
The data for "DISTRIBUTION OF NATIONALITY-WISE FTAs IN INDIA ACCORDING TO PURPOSE" for the year 2017 was same as 2019, so original correct data was fetched from tourism.gov.in file titled "India Tourism Statistics 2018"(pages 51-53).
</p>

In the data for "STATE UT-WISE DOMESTIC AND FOREIGN TOURIST VISITS" UT-Leh Ladakh has been added for years 2019, 2020, 2021 and 2022 but not in the previous years. So it is merged with data of jammu and kashmir.

In [46]:
years = [16, 17, 18, 19, 20, 21, 22]
dff = np.empty(25, dtype=object)
region_summary = []
purpose_columns = ['business and professional', 'leisure holiday and recreation', 'medical', 'indian diaspora', 'others']

for i in years:
    file_path = f'data/TourismData-20{i}/DISTRIBUTION OF NATIONALITY-WISE FTAs IN INDIA ACCORDING TO PURPOSE.xlsx'
    
    # Load data
    dff[i] = pd.read_excel(file_path, sheet_name=0)
    df = dff[i]
    
    # Convert column names to lowercase
    df.columns = df.columns.str.lower()
    
    # Forward-fill region names based on 'arrivals (in numbers)'
    df['region'] = df['country of nationality'].where(df['arrivals (in numbers)'].isna()).ffill().str.lower()
    
    # Filter rows that contain actual arrival data
    df_filtered = df.dropna(subset=['arrivals (in numbers)']).copy()
    
    # Convert purpose percentages to absolute numbers
    for col in purpose_columns:
        df_filtered[col] = df_filtered[col] * df_filtered['arrivals (in numbers)'] / 100
    
    # Group by 'region' and calculate sums
    region_summarydf = df_filtered.groupby('region')[['arrivals (in numbers)'] + purpose_columns].sum().reset_index()
    region_summary.append(region_summarydf)

# Define purpose columns with "arrivals" included
purpose_columns = ['arrivals (in numbers)'] + purpose_columns

# Step 1: First interpolation for 2018 and 2020
initial_known_years = [2016, 2017, 2019, 2022]  # Known years for the first round
initial_missing_years = [2018, 2020]  # Interpolate for these years initially
initial_interpolated_results = {"linear": {}, "polynomial": {}, "lowess": {}}

for region in region_summary[0]['region'].unique():
    for purpose in purpose_columns:
        # Collect known data points for initial interpolation
        y_known = [
            region_summary[years.index(y % 100)].loc[region_summary[years.index(y % 100)]['region'] == region, purpose].values[0]
            for y in initial_known_years
        ]
        
        # Linear Interpolation
        y_series = pd.Series(y_known, index=initial_known_years)
        linear_interpolated = y_series.reindex(initial_known_years + initial_missing_years).interpolate(method='linear').tolist()
        initial_interpolated_results["linear"][(region, purpose)] = [linear_interpolated[years.index(y % 100)] for y in initial_missing_years]
        # print(initial_interpolated_results['linear'][(region, purpose)])

        # Polynomial Interpolation (Cubic)
        x_known = np.array(initial_known_years)
        y_known_array = np.array(y_known)
        poly_interp = make_interp_spline(x_known, y_known_array, k=3)
        poly_interpolated = poly_interp(initial_known_years + initial_missing_years).tolist()
        initial_interpolated_results["polynomial"][(region, purpose)] = [poly_interpolated[years.index(y % 100)] for y in initial_missing_years]
        
        # LOWESS Interpolation
        lowess_fit = sm.nonparametric.lowess(y_known_array, x_known, frac=0.3)
        lowess_interpolated = np.interp(initial_known_years + initial_missing_years, lowess_fit[:, 0], lowess_fit[:, 1]).tolist()
        initial_interpolated_results["lowess"][(region, purpose)] = [lowess_interpolated[years.index(y % 100)] for y in initial_missing_years]

# Step 2: Second interpolation, using results from 2018 and 2020, to predict 2021
full_known_years = [2016, 2017, 2018, 2019, 2020, 2022]  # Now includes interpolated 2018 and 2020
final_missing_years = [2021]  # Interpolate only for 2021 now
final_interpolated_results = {"linear": {}, "polynomial": {}, "lowess": {}}

for region in region_summary[0]['region'].unique():
    for purpose in purpose_columns:
        # Use interpolated values for 2018 and 2020 along with actual known values
        y_known = [
            region_summary[years.index(y % 100)].loc[region_summary[years.index(y % 100)]['region'] == region, purpose].values[0]
            if y in initial_known_years else initial_interpolated_results["linear"][(region, purpose)][initial_missing_years.index(y)]
            for y in full_known_years
        ]
        
        # Linear Interpolation for 2021
        y_series = pd.Series(y_known, index=full_known_years)
        linear_interpolated = y_series.reindex(full_known_years + final_missing_years).interpolate(method='linear').tolist()
        final_interpolated_results["linear"][(region, purpose)] = [linear_interpolated[years.index(y % 100)] for y in final_missing_years]

        # Polynomial Interpolation (Cubic) for 2021
        x_known = np.array(full_known_years)
        y_known_array = np.array(y_known)
        poly_interp = make_interp_spline(x_known, y_known_array, k=3)
        poly_interpolated = poly_interp(full_known_years + final_missing_years).tolist()
        final_interpolated_results["polynomial"][(region, purpose)] = [poly_interpolated[years.index(y % 100)] for y in final_missing_years]
        
        # LOWESS Interpolation for 2021
        lowess_fit = sm.nonparametric.lowess(y_known_array, x_known, frac=0.3)
        lowess_interpolated = np.interp(full_known_years + final_missing_years, lowess_fit[:, 0], lowess_fit[:, 1]).tolist()
        final_interpolated_results["lowess"][(region, purpose)] = [lowess_interpolated[years.index(y % 100)] for y in final_missing_years]

# # Display final interpolated values for 2021
# for region in region_summary[0]['region'].unique():
#     for purpose in purpose_columns:
#         print(f"Final Linear Interpolated value for 2021 in region {region}, purpose {purpose}: {final_interpolated_results['linear'][(region, purpose)]}")


In [47]:
# Initialize dictionaries to hold DataFrames for each interpolation technique and year
interpolated_dfs = {
    "linear": {},
    "polynomial": {},
    "lowess": {}
}

# Function to construct DataFrames for each technique and year
def construct_interpolation_table(interpolated_results, year):
    table_data = []
    for region in region_summary[0]['region'].unique():
        row = {'region': region}
        for purpose in purpose_columns:
            row[purpose] = interpolated_results[(region, purpose)][0]  # Access interpolated value for the year
        table_data.append(row)
    return pd.DataFrame(table_data)

# Generate tables for each technique for 2018, 2020, and 2021
for technique in interpolated_dfs.keys():
    for year in [2018, 2020, 2021]:
        # Retrieve interpolated results for the given year
        interpolated_results_for_year = {
            (region, purpose): final_interpolated_results[technique][(region, purpose)]
            if year == 2021 else initial_interpolated_results[technique][(region, purpose)]
            for region in region_summary[0]['region'].unique()
            for purpose in purpose_columns
        }
        # Construct DataFrame for each year and technique
        interpolated_dfs[technique][year] = construct_interpolation_table(interpolated_results_for_year, year)

# Display each DataFrame (table) for 2018, 2020, and 2021 by technique
for technique, year_dfs in interpolated_dfs.items():
    print(f"\n--- {technique.capitalize()} Interpolation Results ---\n")
    for year, df in year_dfs.items():
        print(f"Year: {year}")
        print(df)
        print("\n" + "-"*50 + "\n")



--- Linear Interpolation Results ---

Year: 2018
                    region  arrivals (in numbers)  business and professional  \
0                   africa               175865.0                 26777.0790   
1              australasia               105047.0                  5801.8100   
2  central & south america                98926.0                 20019.0578   
3                east asia               130383.0                 61042.0660   
4           eastern europe               456481.0                 56718.9039   
5            north america              1863892.0                186912.0006   
6               south asia               750061.0                 51251.8300   
7          south east asia               231622.0                 23155.4610   
8                west asia                97651.0                 12763.8350   
9           western europe              2178441.0                375756.9347   

   leisure holiday and recreation      medical  indian diaspora      

In [49]:
def calculate_error_metrics(original_data, interpolated_data):
    # Ensure both original_data and interpolated_data are the same length
    assert len(original_data) == len(interpolated_data), "Data length mismatch"

    # MAPE calculation
    mape = mean_absolute_percentage_error(original_data, interpolated_data) * 100

    # RMSE calculation
    rmse = np.sqrt(mean_squared_error(original_data, interpolated_data))
    
    return mape, rmse

In [51]:
print(purpose_columns)

['arrivals (in numbers)', 'business and professional', 'leisure holiday and recreation', 'medical', 'indian diaspora', 'others']


In [54]:
# dictionary to store the error metrics for each region and purpose
error_metrics = {
    "linear": {},
    "polynomial": {},
    "lowess": {}
}

# Loop through each region and purpose to calculate errors for each interpolation technique and year
for region in region_summary[0]['region'].unique():
    for purpose in purpose_columns:
        if(purpose != 'arrivals (in numbers)'):
            continue
        for technique in ["linear", "polynomial", "lowess"]:
            # Get original data (actual values) for the region and purpose
            actual_data = [
                region_summary[years.index(year % 100)].loc[region_summary[years.index(year % 100)]['region'] == region, purpose].values[0]
                for year in [2018]  # Known years
            ]
            interpolated_data = initial_interpolated_results[technique].get((region,purpose))[:1]
            mape, rmse = calculate_error_metrics(actual_data, interpolated_data)
            error_metrics[technique][(region, purpose)] = {
                "mape": mape,
                "rmse": rmse
            }

# Display the error metrics for each technique and region

print("Interpolation error metrics for 2018 for total arrivals")
for technique, region_metrics in error_metrics.items():
    print(f"\n--- {technique.capitalize()} Interpolation Error Metrics ---")
    for (region, purpose), metrics in region_metrics.items():
        print(f"Region: {region}, Purpose: {purpose}")
        print(f"MAPE: {metrics['mape']:.2f}%, RMSE: {metrics['rmse']:.2f}")
        print("-" * 50)

Interpolation error metrics for 2018 for total arrivals

--- Linear Interpolation Error Metrics ---
Region: africa, Purpose: arrivals (in numbers)
MAPE: 49.92%, RMSE: 175333.00
--------------------------------------------------
Region: australasia, Purpose: arrivals (in numbers)
MAPE: 74.54%, RMSE: 307581.00
--------------------------------------------------
Region: central & south america, Purpose: arrivals (in numbers)
MAPE: 2.14%, RMSE: 2159.00
--------------------------------------------------
Region: east asia, Purpose: arrivals (in numbers)
MAPE: 82.01%, RMSE: 594185.00
--------------------------------------------------
Region: eastern europe, Purpose: arrivals (in numbers)
MAPE: 2.05%, RMSE: 9568.00
--------------------------------------------------
Region: north america, Purpose: arrivals (in numbers)
MAPE: 3.11%, RMSE: 56174.00
--------------------------------------------------
Region: south asia, Purpose: arrivals (in numbers)
MAPE: 75.84%, RMSE: 2354361.00
------------------

<h3> Linear Interpolation </h3>
Method: Linear interpolation estimates values between two known data points by connecting them with a straight line.

Summary: Simple and fast, it works well for linear trends but may introduce errors with non-linear data.

<h3>Polynomial Interpolation</h3>
Method: Polynomial interpolation uses a polynomial to fit a curve through all data points, passing through each one exactly.

Summary: Suitable for complex relationships, but can overfit and cause oscillations with high-degree polynomials, especially with evenly spaced points.



<h3>LOWESS (Locally Weighted Scatterplot Smoothing)</h3>
Method: LOWESS fits a smooth curve through the data using weighted least squares, focusing more on nearby points.

Summary: Excellent for noisy data, capturing local trends without assuming a global pattern. It’s robust to outliers and flexible in modeling different relationships.

Conclusion
1. Linear interpolation is best for simple, linear data.
2. Polynomial interpolation fits complex curves but can overfit.
3. LOWESS is ideal for noisy data, providing smooth, adaptable results.
Each technique has its strengths, with linear interpolation being fast, polynomial offering complexity, and LOWESS being robust for irregular datasets.
