In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# 1. Festival Dataset
festival_df = pd.read_csv('E:\\FDVA\\data\\raw\\synthetic_global_festivals_500.csv')

# 2. Tourist Arrivals by Country (heatflow)
tourist_df = pd.read_csv('E:\\FDVA\\data\\raw\\structured_UNWTO_tourism_data.csv')  # Ensure monthly/yearly columns

# 3. Cost of Living / Affordability Dataset
cost_df = pd.read_csv('E:\\FDVA\\data\\raw\\wikipedia_cost_of_living_indices3.csv')

# 4. Geo-Tagged Tourism Reviews (for sentiment)
reviews_df = pd.read_csv('E:\\FDVA\\data\\raw\\tripadvisor_hotel_reviews.csv')


In [4]:
# Function to check basic info
def clean_overview(df, name):
    print(f"=== {name} ===")
    print("Shape:", df.shape)
    print("Missing values:\n", df.isnull().sum())
    print("Duplicates:", df.duplicated().sum())
    print("\n")

clean_overview(festival_df, "Festival Dataset")
clean_overview(tourist_df, "Tourist Arrivals Dataset")
clean_overview(cost_df, "Cost of Living Dataset")
clean_overview(reviews_df, "Travel Reviews Dataset")

=== Festival Dataset ===
Shape: (500, 7)
Missing values:
 Festival_Name    0
Country          0
Month            0
Impact_Score     0
Category         0
Lat              0
Lon              0
dtype: int64
Duplicates: 0


=== Tourist Arrivals Dataset ===
Shape: (8253, 33)
Missing values:
 Country           0
Report Type       0
Category          0
Subcategory       0
Metric            0
1995           5773
1996           5730
1997           5643
1998           5573
1999           5513
2000           5412
2001           5314
2002           5286
2003           5252
2004           5211
2005           5046
2006           4912
2007           4779
2008           4733
2009           4694
2010           4630
2011           4629
2012           4608
2013           4573
2014           4528
2015           4461
2016           4444
2017           4480
2018           4509
2019           4564
2020           4773
2021           4958
2022           5542
dtype: int64
Duplicates: 0


=== Cost of Living Data

In [10]:
years = [str(y) for y in range(1995, 2023)]
tourist_df[years] = tourist_df[years].apply(pd.to_numeric, errors='coerce')
tourist_df[years] = tourist_df[years].interpolate()


In [12]:
festival_df = festival_df.drop_duplicates()
festival_df['Impact_Score'] = festival_df['Impact_Score'].clip(0,1)

tourist_df = tourist_df.drop_duplicates()
# Example: Fill missing monthly data with interpolation
tourist_df = tourist_df.interpolate()

cost_df = cost_df.drop_duplicates()
# For simplicity, we can define it as: Cost of Living Index / Local Purchasing Power Index
cost_df['Affordability_Index'] = cost_df['Cost of Living Index'] / cost_df['Local Purchasing Power Index']

reviews_df['Review_Text'] = reviews_df['Review']
reviews_df['Sentiment_Score'] = np.random.uniform(0.6,1.0, size=len(reviews_df))

reviews_df = reviews_df.drop_duplicates()
reviews_df['Review_Text'] = reviews_df['Review_Text'].fillna('')


  tourist_df = tourist_df.interpolate()


In [13]:
def month_to_season(month):
    if month in [12,1,2]: return 'Winter'
    elif month in [3,4,5]: return 'Spring'
    elif month in [6,7,8]: return 'Summer'
    else: return 'Autumn'

festival_df['Season'] = festival_df['Month'].astype(int).apply(month_to_season)


In [16]:
# ---- Corrected Tourism Intensity Calculation ----
year_cols = [str(y) for y in range(1995, 2023)]
tourist_df[year_cols] = tourist_df[year_cols].apply(pd.to_numeric, errors='coerce')
tourist_df[year_cols] = tourist_df[year_cols].interpolate()

tourist_df['Avg_Arrivals'] = tourist_df[year_cols].mean(axis=1)
tourist_df['Tourism_Intensity'] = (
    (tourist_df['Avg_Arrivals'] - tourist_df['Avg_Arrivals'].min()) /
    (tourist_df['Avg_Arrivals'].max() - tourist_df['Avg_Arrivals'].min())
)


In [17]:
# Example: placeholder sentiment score (later NLP)
reviews_df['Sentiment_Score'] = np.random.uniform(0.6,1.0, size=len(reviews_df))


In [21]:
festival_df.to_csv('E:\\FDVA\\data\\processed\\festivals_processed.csv', index=False)
tourist_df.to_csv('E:\\FDVA\\data\\processed\\tourist_processed.csv', index=False)
cost_df.to_csv('E:\\FDVA\\data\\processed\\cost_processed.csv', index=False)
reviews_df.to_csv('E:\\FDVA\\data\\processed\\reviews_processed.csv', index=False)

print("All processed CSVs saved!")


All processed CSVs saved!
