In [1]:
%pip install pandas scikit-learn mlxtend matplotlib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from scipy.sparse import csr_matrix

# Load the dataset
file_path = 'world_tourism_economy_data.csv'  # Replace with the actual path
tourism_data = pd.read_csv(file_path)

# european_country_codes = [
#     'ALB', 'AND', 'ARM', 'AUT', 
#     'BEL', 'BGR', 'BIH',
#      'BLR', 'BUL', 'CHE', 
#     'CYP', 'CZE', 'DEU', 'DNK', 'EST', 'FIN', 'FRA', 'GEO', 'GRC', 'HRV', 
#     'HUN', 'IRL', 'ISL', 'ISR', 'ITA', 'KOS', 'LTU', 'LUX', 'LVA', 'MDA', 
#     'MNE', 
#     'NLD', 
#     'NOR', 
#     'POL', 'PRT', 'ROU', 'RUS', 'SVK', 'SVN', 'ESP', 
#     'SWE', 'TUR', 'UKR', 'GBR'
# ]
european_union_country_codes = [
    'AUT', 'BEL', 'BGR', 'CYP', 'CZE', 'DEU', 'DNK', 'EST', 'FIN', 'FRA', 
    'GRC', 'HRV', 'HUN', 'IRL', 'ITA', 'LTU', 'LUX', 'LVA', 'POL', 'PRT', 
    'ROU', 'SVK', 'SVN', 'ESP', 'SWE'
]
tourism_data = tourism_data[tourism_data['country_code'].isin(european_union_country_codes)]


# Calculate Tourism GDP Percentage and High Tourism Impact
tourism_data['Tourism_GDP_Percentage'] = (tourism_data['tourism_receipts'] / tourism_data['gdp']) * 100
tourism_threshold = 5  # Adjust threshold as needed
tourism_data['High_Tourism_Impact'] = tourism_data['Tourism_GDP_Percentage'] > tourism_threshold

# Clean the data
tourism_data = tourism_data.dropna(subset=['High_Tourism_Impact'])





In [3]:
# List unique countries with 'High_Tourism_Impact'
countries_with_high_impact = tourism_data[tourism_data['High_Tourism_Impact']]['country_code'].unique()
print(f"Countries with High Tourism Impact: {countries_with_high_impact}")

Countries with High Tourism Impact: ['AUT' 'BGR' 'CYP' 'EST' 'GRC' 'HRV' 'HUN' 'LTU' 'PRT' 'SVN' 'LUX']


In [6]:
# Group the data by 'country_code' and 'year', and pivot
ds_grouped = tourism_data.groupby(['country_code', 'year'], as_index=False).agg({'High_Tourism_Impact': 'any'})
ds_pivot = ds_grouped.pivot(index='year', columns='country_code', values='High_Tourism_Impact').fillna(False)

# Ensure the pivoted table is Boolean
ds_pivot = ds_pivot.applymap(lambda x: bool(x))

# Run apriori on the pivoted data
min_support = 0.5

freq_itemsets = apriori(ds_pivot, min_support=min_support, use_colnames=True)

# If no error occurs, print confirmation
print(freq_itemsets)


     support                                  itemsets
0       0.48                                     (AUT)
1       0.84                                     (BGR)
2       0.84                                     (CYP)
3       0.68                                     (EST)
4       0.72                                     (GRC)
..       ...                                       ...
546     0.36       (PRT, EST, HRV, SVN, GRC, BGR, LUX)
547     0.32       (PRT, EST, HRV, SVN, HUN, GRC, CYP)
548     0.36       (PRT, EST, HRV, SVN, GRC, LUX, CYP)
549     0.32  (PRT, EST, HRV, SVN, HUN, GRC, BGR, CYP)
550     0.36  (PRT, EST, HRV, SVN, GRC, BGR, LUX, CYP)

[551 rows x 2 columns]


  ds_pivot = ds_pivot.applymap(lambda x: bool(x))


In [None]:

# Number of itemsets
num_itemsets = len(freq_itemsets)

# Generate association rules
rules = association_rules(freq_itemsets, metric="confidence", min_threshold=1, num_itemsets=num_itemsets)
rules = rules.sort_values(by='confidence', ascending=False)

# Display the top 10 rules
print("Top 10 rules by confidence:")
print(rules.head(10))


Top 10 rules by confidence:
                    antecedents      consequents  antecedent support  \
4927  (PRT, EST, GRC, LUX, CYP)  (HRV, BGR, SVN)                0.36   
4925  (PRT, EST, GRC, BGR, LUX)  (HRV, CYP, SVN)                0.36   
4921  (PRT, SVN, EST, GRC, LUX)  (HRV, BGR, CYP)                0.36   
4916  (PRT, EST, HRV, GRC, LUX)  (BGR, CYP, SVN)                0.36   
30                        (SVN)            (CYP)                0.72   
28                        (PRT)            (CYP)                0.64   
25                        (HUN)            (CYP)                0.48   
24                        (CYP)            (HRV)                0.84   
22                        (GRC)            (CYP)                0.72   
20                        (EST)            (CYP)                0.68   

      consequent support  support  confidence      lift  representativity  \
4927                0.72     0.36         1.0  1.388889               1.0   
4925                0.72 

In [23]:
def get_rules_where_country_is_antecedent(rules, code):
    # Filter the rules where Portugal (PRT) is in the antecedents
    c_rules = rules[rules['antecedents'].apply(lambda x: code in x)]
    
    # Extract the consequents of these rules
    c_consequents = c_rules['consequents']
    
    # Flatten the list of consequents and get unique countries
    consequent_countries = set()
    for consequent in c_consequents:
        consequent_countries.update(consequent)  # Adds all countries in the consequent
    
    # Return the list of unique countries
    return list(consequent_countries)

# Call the function with the `rules` DataFrame
countries_with_antecedent = get_rules_where_country_is_antecedent(rules, 'PRT')

# Print the result
print("Countries where the writen code is an antecedent:", countries_with_antecedent)


Countries where the writen code is an antecedent: ['SVN', 'HRV', 'EST', 'GRC', 'BGR', 'LUX', 'CYP']
