# Data Preprocessing

In [1]:
import numpy as np
import pandas as pd

import sys
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
import pycountry_convert as pc
import pycountry
from difflib import SequenceMatcher

import warnings
warnings.filterwarnings('ignore')

## Load Data file

In [2]:
df = pd.read_csv(r'data/Train.csv')

In [3]:
df.shape

(4809, 23)

### Null Values

In [4]:
df.isnull().sum()/df.shape[0]

ID                       0.000000
country                  0.000000
age_group                0.000000
travel_with              0.231649
total_female             0.000624
total_male               0.001040
purpose                  0.000000
main_activity            0.000000
info_source              0.000000
tour_arrangement         0.000000
package_transport_int    0.000000
package_accomodation     0.000000
package_food             0.000000
package_transport_tz     0.000000
package_sightseeing      0.000000
package_guided_tour      0.000000
package_insurance        0.000000
night_mainland           0.000000
night_zanzibar           0.000000
payment_mode             0.000000
first_trip_tz            0.000000
most_impressing          0.065086
total_cost               0.000000
dtype: float64

## Imputing Missing Values

The following features need imputing for its missing values -
1. travel_with     | Categorical | 1114 | 23%
2. most_impressing | Categorical | 313  | 6.5%
3. total_male      | Numerical   | 5    | <0%
4. total_female    | Numerical   | 3    | <0%

Categorical variables :
    Imputing all the categorical fields using MissForest Imputer from missingpy library since more than 20% of the data is missing for the feature 'travel_with'. For using MissForest Imputer we need to first encode categorical data for the algorithm to function. Hence we will manually encode the categorical variables for ease of inversion to categorical data type.
    
Numerical variables :
    The variables total male and total female have a few null values but for a few rows both variables show 0 number of tourists attending the tour. These ghost numbers will be replaced with the mean for computing purposes.

In [5]:
# Dictionary for encoding the categorical features

# Most Impressing Feature
mi = {
    'Friendly People' : 0, 
    'Wonderful Country, Landscape, Nature': 1,
    'Excellent Experience': 2,
    'No comments': 3,
    ' Wildlife': 4,
    'Good service': 5, 
    'Satisfies and Hope Come Back': 6
}

# Travel With feature
trw = {
    'Friends/Relatives': 0, 
    'Alone': 1, 
    'Spouse': 2, 
    'Children': 3,
    'Spouse and Children': 4
}

In [6]:
def find_value(x, dictionary):
    """Find the value from dictionary for a key"""
    for k, v in dictionary.items():
        if x == k:
            return int(v)

def find_key(x, dictionary):
    """Find the key from dictionary for a value"""
    for k, v in dictionary.items():
        if x == v:
            return k

In [7]:
# Most Impressing
df['most_impressing'] = df['most_impressing'].apply(lambda x : find_value(x, mi))

# Travel With
df['travel_with'] = df['travel_with'].apply(lambda x : find_value(x, trw))

## Ghost Tourists
cond1 = df['total_male']==0 # Condition: Male Tourist is 0
cond2 = df['total_female']==0 # Condition: Female Tourist is 0
male_avg = round(df['total_male'].mean()) # Average Make tourists in the data
female_avg = round(df['total_female'].mean()) # Average Female tourists in the data

# Total Male
df.loc[cond1 & cond2, 'total_male'] = male_avg

# Total Female
df.loc[cond1 & cond2, 'total_female'] = female_avg

In [8]:
# Imputer
imputer = MissForest(random_state=7)
X = df.drop(['ID'], axis=1)
cat_cols = X.select_dtypes(include='object').columns

# Encode categorical data
le = LabelEncoder()
for i in cat_cols:
    X[i] = le.fit_transform(X[i])

# Impute Missing Values in 'travel_with' feature
X_imputed = imputer.fit_transform(X)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


In [9]:
X_clean = pd.DataFrame(X_imputed, columns=X.columns)

In [10]:
# Most Impressing
df['most_impressing'] = X_clean['most_impressing'].apply(lambda x : find_key(round(x), mi))

# Travel With
df['travel_with'] = X_clean['travel_with'].apply(lambda x : find_key(round(x), trw))

# Male Tourists
df['total_male'].fillna(male_avg, inplace=True)

# Female Tourists
df['total_female'].fillna(female_avg, inplace=True)

In [11]:
df['travel_with'].value_counts()

Alone                  2356
Spouse                 1018
Friends/Relatives       903
Spouse and Children     368
Children                164
Name: travel_with, dtype: int64

## Country Name Anomalies

The following country name inputs were entered incorrectly. We will have to correct it for the data analysis

1. SWIZERLAND : SWITZERLAND
2. MALT : MALTA
3. UKRAIN : UKRAINE
4. BURGARIA : BULGARIA
5. TRINIDAD TOBACCO : TRINIDAD AND TOBAGO
6. COMORO : COMOROS
7. PHILIPINES : PHILIPPINES
8. DJIBOUT : DJIBOUTI
9. MORROCO : MOROCCO
10. SCOTLAND : UNITED KINGDOM

Since we need to match the country names provided in the pycountry library for Geospatial analysis we convert them using a function.

In [12]:
df['country'] = df['country'].str.title()

In [13]:
[{country.alpha_2: country.name} for country in sorted(pycountry.countries, key=lambda x: x.name)]

[{'AF': 'Afghanistan'},
 {'AL': 'Albania'},
 {'DZ': 'Algeria'},
 {'AS': 'American Samoa'},
 {'AD': 'Andorra'},
 {'AO': 'Angola'},
 {'AI': 'Anguilla'},
 {'AQ': 'Antarctica'},
 {'AG': 'Antigua and Barbuda'},
 {'AR': 'Argentina'},
 {'AM': 'Armenia'},
 {'AW': 'Aruba'},
 {'AU': 'Australia'},
 {'AT': 'Austria'},
 {'AZ': 'Azerbaijan'},
 {'BS': 'Bahamas'},
 {'BH': 'Bahrain'},
 {'BD': 'Bangladesh'},
 {'BB': 'Barbados'},
 {'BY': 'Belarus'},
 {'BE': 'Belgium'},
 {'BZ': 'Belize'},
 {'BJ': 'Benin'},
 {'BM': 'Bermuda'},
 {'BT': 'Bhutan'},
 {'BO': 'Bolivia, Plurinational State of'},
 {'BQ': 'Bonaire, Sint Eustatius and Saba'},
 {'BA': 'Bosnia and Herzegovina'},
 {'BW': 'Botswana'},
 {'BV': 'Bouvet Island'},
 {'BR': 'Brazil'},
 {'IO': 'British Indian Ocean Territory'},
 {'BN': 'Brunei Darussalam'},
 {'BG': 'Bulgaria'},
 {'BF': 'Burkina Faso'},
 {'BI': 'Burundi'},
 {'CV': 'Cabo Verde'},
 {'KH': 'Cambodia'},
 {'CM': 'Cameroon'},
 {'CA': 'Canada'},
 {'KY': 'Cayman Islands'},
 {'CF': 'Central African Repu

In [14]:
# Corrected country names
c_name = {
    'Swizerland': 'Switzerland',
    'Malt': 'Malta',
    'Ukrain': 'Ukraine',
    'Burgaria': 'Bulgaria',
    'Trinidad Tobacco': 'Trinidad and Tobago',
    'Comoro': 'Comoros',
    'Philipines': 'Philippines',
    'Djibout': 'Djibouti',
    'Morroco': 'Morocco',
    'Scotland': 'United Kingdom',
    'United States Of America': 'United States',
    'Costarica': 'Costa Rica',
    'Uae': 'United Arab Emirates',
    'Drc': 'Congo',
    'Korea': 'Korea, Republic of',
    'Russia': 'Russian Federation',
    'Iran': 'Iran',
    'Czech Republic': 'Czechia'
}

# Applying correction on the function
df['country'] = df['country'].apply(lambda x: c_name.get(x, x))

In [15]:
df['country'].unique()

array(['Switzerland', 'United Kingdom', 'China', 'South Africa',
       'United States', 'Nigeria', 'India', 'Brazil', 'Canada', 'Malta',
       'Mozambique', 'Rwanda', 'Austria', 'Myanmar', 'Germany', 'Kenya',
       'Algeria', 'Ireland', 'Denmark', 'Spain', 'France', 'Italy',
       'Egypt', 'Qatar', 'Malawi', 'Japan', 'Sweden', 'Netherlands',
       'United Arab Emirates', 'Uganda', 'Australia', 'Yemen',
       'New Zealand', 'Belgium', 'Norway', 'Zimbabwe', 'Zambia', 'Congo',
       'Bulgaria', 'Pakistan', 'Greece', 'Mauritius', 'Oman', 'Portugal',
       'Korea, Republic of', 'Swaziland', 'Tunisia', 'Kuwait', 'Dominica',
       'Israel', 'Finland', 'Czechia', 'Ukraine', 'Ethiopia', 'Burundi',
       'Russian Federation', 'Ghana', 'Niger', 'Malaysia', 'Colombia',
       'Luxembourg', 'Nepal', 'Poland', 'Singapore', 'Lithuania',
       'Hungary', 'Indonesia', 'Turkey', 'Trinidad and Tobago', 'Iraq',
       'Slovenia', 'Comoros', 'Sri Lanka', 'Iran', 'Montenegro', 'Angola',
       'L

### Output

In [17]:
df.to_csv(r'data/clean_train.csv', index=False)