In [1]:
# Install the third-party packages named below. The %pip magic (rather than
# !pip) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned -- consider pinning them so a fresh
# "Restart & Run All" reproduces the same results.
%pip install pandas scikit-learn mlxtend matplotlib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from scipy.sparse import csr_matrix

# ---------------------------------------------------------------------------
# Association-rule mining over "high tourism impact" country-years.
#
# Pipeline: load CSV -> keep the listed country codes -> compute tourism
# receipts as a percentage of GDP -> flag country-years above a threshold ->
# pivot to a year x country boolean table -> apriori -> association rules.
# ---------------------------------------------------------------------------

# Load the dataset
file_path = 'world_tourism_economy_data.csv'  # Replace with the actual path
tourism_data = pd.read_csv(file_path)

# Country codes to keep (matched against the `country_code` column).
# NOTE(review): 'BUL' and 'KOS' do not look like ISO 3166-1 alpha-3 codes
# (Bulgaria is already covered by 'BGR') -- confirm against the codes the
# dataset actually uses; unmatched codes are silently ignored by `isin`.
european_country_codes = [
    'ALB', 'AND', 'ARM', 'AUT', 'BEL', 'BGR', 'BIH', 'BLR', 'BUL', 'CHE',
    'CYP', 'CZE', 'DEU', 'DNK', 'EST', 'FIN', 'FRA', 'GEO', 'GRC', 'HRV',
    'HUN', 'IRL', 'ISL', 'ISR', 'ITA', 'KOS', 'LTU', 'LUX', 'LVA', 'MDA',
    'MNE', 'NLD', 'NOR', 'POL', 'PRT', 'ROU', 'RUS', 'SVK', 'SVN', 'ESP',
    'SWE', 'TUR', 'UKR', 'GBR'
]

tourism_threshold = 5  # Adjust threshold as needed (percent of GDP)

# Filter, derive and clean in one chain. `.assign` returns a new frame, so no
# column is ever written into a slice of the original (no SettingWithCopyWarning).
#
# The dropna must act on the numeric percentage *before* the comparison:
# `NaN > threshold` evaluates to False, so the boolean flag never contains NaN
# and dropping on it would be a no-op that silently keeps country-years with
# missing receipts or GDP as "not high impact".
tourism_data = (
    tourism_data[tourism_data['country_code'].isin(european_country_codes)]
    .assign(
        Tourism_GDP_Percentage=lambda df: (df['tourism_receipts'] / df['gdp']) * 100
    )
    .dropna(subset=['Tourism_GDP_Percentage'])
    .assign(
        High_Tourism_Impact=lambda df: df['Tourism_GDP_Percentage'] > tourism_threshold
    )
)

# One boolean per (country, year): True if any row for that pair is above the threshold.
ds_grouped = tourism_data.groupby(['country_code', 'year'], as_index=False).agg({'High_Tourism_Impact': 'any'})

# Years become transactions (rows) and countries become items (columns).
# `unstack(fill_value=False)` fills absent (year, country) pairs with False
# directly, which keeps the table boolean and avoids the deprecated
# `applymap` / object-dtype `fillna` round trip.
ds_pivot = (
    ds_grouped
    .set_index(['year', 'country_code'])['High_Tourism_Impact']
    .unstack('country_code', fill_value=False)
    .astype(bool)  # apriori expects a strictly boolean table
)

# Run apriori on the pivoted data.
# NOTE(review): support is a fraction of the yearly transactions, so a value
# below 1 / len(ds_pivot) admits every itemset that occurs in at least one
# year. The recorded run produced 133,631 itemsets at this setting -- consider
# raising min_support if that blow-up is not intended.
min_support = 0.01

freq_itemsets = apriori(ds_pivot, min_support=min_support, use_colnames=True)

# Show the frequent itemsets (pandas truncates the printed frame)
print(freq_itemsets)

# Number of frequent itemsets found (kept for any later cell that reads it)
num_itemsets = len(freq_itemsets)

# Number of transactions (years) that were mined. Despite its name, the
# `num_itemsets` argument of association_rules is documented as the number of
# transactions in the original input data, not the number of frequent itemsets.
num_transactions = len(ds_pivot)

# Generate association rules, strongest confidence first
rules = association_rules(
    freq_itemsets,
    metric="confidence",
    min_threshold=0.7,
    num_itemsets=num_transactions,
)
rules = rules.sort_values(by='confidence', ascending=False)

# Display the top 10 rules
print("Top 10 rules by confidence:")
print(rules.head(10))

# List unique countries with 'High_Tourism_Impact' in at least one row
countries_with_high_impact = tourism_data.loc[tourism_data['High_Tourism_Impact'], 'country_code'].unique()
print(f"Countries with High Tourism Impact: {countries_with_high_impact}")


  ds_pivot = ds_pivot.applymap(lambda x: bool(x))


        support                                           itemsets
0          0.88                                              (ALB)
1          0.04                                              (AND)
2          0.56                                              (ARM)
3          0.48                                              (AUT)
4          0.84                                              (BGR)
...         ...                                                ...
133626     0.04  (ARM, AND, PRT, ALB, BGR, GRC, CYP, HRV, HUN, ...
133627     0.04  (AND, PRT, ALB, BGR, GRC, CYP, HRV, HUN, MNE, ...
133628     0.04  (ARM, PRT, ALB, BGR, GRC, CYP, HRV, HUN, MNE, ...
133629     0.04  (ARM, AND, PRT, GRC, BGR, CYP, HRV, HUN, MNE, ...
133630     0.04  (ARM, AND, PRT, ALB, BGR, GRC, CYP, HRV, HUN, ...

[133631 rows x 2 columns]
