## Imports

In [2]:
import pandas as pd
import numpy as np

### Import departures data

In [3]:
# Load the raw departures table; the source file is semicolon-delimited.
departures = pd.read_csv(
    'data/Tourism_Departures.csv',
    sep=';',
)

### Import expenditures data

In [4]:
# Load the raw expenditures table; the source file is semicolon-delimited.
expenditures = pd.read_csv(
    'data/Tourism_Expenditures.csv',
    sep=';',
)

## Process data

### Remove NaN values

In [5]:
# Discard every row that has no recorded 'Value', in both tables.
# Reassigning (rather than mutating in place) keeps the cell chain-friendly.
departures = departures.dropna(subset=['Value'])
expenditures = expenditures.dropna(subset=['Value'])

### Remove rows corresponding to a total of Overnight visitors (tourists) and Same-day visitors (excursionists)

#### Create a function to check whether a row is the sum of the other rows for the same country and year

In [6]:
import pandas as pd
import numpy as np

def mark_total_rows(df):
    """Flag the row that is the total of the two other rows of its group.

    Rows are grouped by ``Country`` and ``Year``. When a group holds exactly
    three rows whose largest ``Value`` equals (within ``np.isclose``
    tolerance) the sum of the two smaller ones, the row carrying the largest
    value gets ``"Yes"`` in a new ``Sum`` column. Every other row keeps
    ``NaN`` in that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Country``, ``Year`` and ``Value``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra ``Sum`` column. The input frame is
        left untouched.
    """
    df = df.copy()
    # New column to fill in. It is created with object dtype so it can hold
    # both NaN and the "Yes" marker; writing a string into a float64 column
    # triggers pandas' incompatible-dtype FutureWarning.
    df["Sum"] = np.full(len(df), np.nan, dtype=object)

    for _, group in df.groupby(["Country", "Year"]):
        if len(group) != 3:
            continue
        values = group["Value"].dropna().values
        if len(values) != 3:
            continue
        a, b, c = sorted(values)
        if not np.isclose(c, a + b):
            continue
        # Find the row holding the total and mark it "Yes". Groups where
        # several rows share the largest value are ambiguous and are skipped.
        total_index = group[group["Value"] == c].index
        if len(total_index) == 1:
            # Label-based write: marks every row carrying this index label.
            df.loc[total_index[0], "Sum"] = "Yes"
    return df

In [7]:
# Flag the total rows in both tables with the same helper.
departures, expenditures = map(mark_total_rows, (departures, expenditures))

  df.loc[total_index[0], "Sum"] = "Yes"
  df.loc[total_index[0], "Sum"] = "Yes"


#### Remove the rows containing Yes

In [8]:
# Keep only the rows that were not flagged as totals (NaN in 'Sum' passes the test).
departures = departures[departures['Sum'] != 'Yes']
expenditures = expenditures[expenditures['Sum'] != 'Yes']

### Remove the column Sum

In [9]:
# The helper column has served its purpose; discard it from both tables.
departures = departures.drop(columns=['Sum'])
expenditures = expenditures.drop(columns=['Sum'])

### Rename the column Value

In [10]:
# Give the generic 'Value' column a table-specific name in each frame.
departures = departures.rename({'Value': 'Number_of_Tourists'}, axis='columns')
expenditures = expenditures.rename({'Value': 'Expenditure'}, axis='columns')

### Group all the categories of tourists in departures (we don't need to split between Overnight and same day visitors)

In [11]:
# Collapse all remaining rows of a (Country, Year) pair into one summed headcount.
tourists_by_country_year = departures.groupby(['Country', 'Year'], as_index=False)
departures = tourists_by_country_year['Number_of_Tourists'].sum()

### Remove Passenger transport expenditure from expenditures, as we are not interested in how much tourists spend on transportation

In [12]:
# Keep every expenditure row whose category is not passenger transport.
expenditures = expenditures[expenditures['Category'] != 'Passenger transport']

### Remove the Category column in expenditures as we don't need it anymore

In [13]:
# 'Category' is no longer needed once the filtering is done.
expenditures = expenditures.drop(columns='Category')

## Merge data

In [14]:
# Inner join: keep only the (Country, Year) pairs present in both tables.
df = departures.merge(expenditures, how='inner', on=['Country', 'Year'])

## Add a column Expenditure per visitor

### Convert the Expenditure data from millions of $ to $

In [15]:
# Scale millions of $ up to plain $.
# NOTE: not idempotent - re-running this cell multiplies the column again.
df['Expenditure'] = df['Expenditure'].mul(1_000_000)

### Convert the Visitors data from thousands of people to number of people

In [16]:
# Scale thousands of people up to a plain head count.
# NOTE: not idempotent - re-running this cell multiplies the column again.
df['Number_of_Tourists'] = df['Number_of_Tourists'].mul(1_000)

In [17]:
# Spending per visitor: total expenditure divided by the number of tourists.
df['Expenditure_per_Visitor'] = df['Expenditure'].div(df['Number_of_Tourists'])

In [18]:
# Preview the first five rows of the merged table.
df.head(n=5)

Unnamed: 0,Country,Year,Number_of_Tourists,Expenditure,Expenditure_per_Visitor
0,ALGERIA,1995,1090000.0,186000000.0,170.642202
1,ALGERIA,1996,874000.0,188000000.0,215.102975
2,ALGERIA,1997,827000.0,144000000.0,174.123337
3,ALGERIA,1998,879000.0,269000000.0,306.029579
4,ALGERIA,1999,903000.0,251000000.0,277.962348


## Keep only three columns: Country, Year and Expenditure_per_Visitor

In [19]:
# Narrow the table down to the three columns used downstream.
df = df.loc[:, ['Country', 'Year', 'Expenditure_per_Visitor']]

## Export the final dataframe for data visualization

In [21]:
# Write the result to disk. The row index is written as the first, unnamed
# column (pandas' default, made explicit here).
# NOTE(review): pass index=False instead if consumers do not need that column.
df.to_csv('Visitors_Spendings.csv', index=True)