In [1]:
# For auto complete: turn on the jedi-based completer for tab completion in this kernel
%config Completer.use_jedi = True

In [2]:
# Used libraries imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# **1. DATA EXPLORATION**
## Exploring the training data yields some observations that guide the pre-processing

In [3]:
# Reading data
# Relative path to the zipped training-users CSV (read directly by pd.read_csv below).
airbnb_data_path = "../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip"
# `df` is the training data; the short name is used throughout the notebook
# and the frame is modified step by step in the pre-processing cells.
df = pd.read_csv(airbnb_data_path)

In [4]:
# Explore data description (summary statistics of the frame)
df.describe()

In [5]:
# Explore the first rows of the data
df.head()

In [6]:
# Explore all data (the frame is shown as the cell output)
df

# **2. DATA VISUALIZATION**
## Visualize the following attributes:
### 2.1. Gender
### 2.2. Age
### 2.3. Destination country

## **2.1. Gender Feature Visualization**

In [7]:
# Bar chart of how many users fall into each gender label; missing values
# get their own bar because dropna=False.
gender_counts = df['gender'].value_counts(dropna=False)
gender_ax = gender_counts.plot(kind='bar')
gender_ax.set_xlabel('Gender')
gender_ax.set_ylabel('Users count')
sns.despine()

## **2.2. Age Feature Visualization**

### **2.2.1. Data description**

In [8]:
# Summary statistics of the raw age column
df['age'].describe()

### The maximum age is 2014, which is not a plausible age (possibly a year typed into the age field)

In [9]:
# Count the users whose recorded age is implausibly large (> 150).
# The boolean mask is reduced with the vectorised Series.sum() instead of the
# Python built-in sum(), which would iterate over the rows one at a time.
print((df['age'] > 150).sum())

### **2.2.2. Data distribution histogram**

In [10]:
# Using data visualization, explore data distribution
# Missing ages are dropped first: NaN carries no information for a histogram
# and can produce warnings or errors when the bin range is derived.
ages = df['age'].dropna()
bins = 100
plt.hist(ages, bins, color='blue', histtype='bar', rwidth=0.6)
# Label the axes so the figure can be read on its own.
plt.xlabel('Age')
plt.ylabel('Users count')
plt.show()

### The distribution appears to have outlier value(s)

### **2.2.3. Data distribution boxplot**

In [11]:
# Using data visualization, explore data distribution
# plt.boxplot computes its quartiles with NaN-unaware statistics, so a single
# missing age turns every statistic into NaN and the box is drawn empty.
# Only the observed ages are plotted.
ages = df['age'].dropna()
fig = plt.figure(figsize=(10, 8))
plt.boxplot(ages)
plt.ylabel('Age')
plt.show()

## **2.3. Destination Country Feature Visualization**

In [12]:
# Share of all users (in percent) that booked each destination country.
destination_counts = df['country_destination'].value_counts()
perc = destination_counts / len(df) * 100
destination_ax = perc.plot(kind='bar')
destination_ax.set_xlabel('Destination Country')
destination_ax.set_ylabel('Percentage of travelers')
sns.despine()

# **3. Data Pre-Processing**

## **3.1. Ahmed --->>> gender, sign-up method & language**

# **gender**

In [13]:
# Look at the raw gender column before it is coded
df['gender']

In [14]:
# Code the nominal values
# MALE/FEMALE become 1/0; OTHER and '-unknown-' are treated as missing.
gender_codes = {"MALE": 1, "FEMALE": 0, "OTHER": np.nan, "-unknown-": np.nan}
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on `df.gender` is a chained operation on a temporary Series and does not
# update `df` when pandas Copy-on-Write is active.
# pd.to_numeric guarantees a numeric dtype; a label outside the mapping becomes
# NaN rather than staying in the column as a string.
df['gender'] = pd.to_numeric(df['gender'].replace(gender_codes), errors='coerce')
# Number of users whose gender is now missing
df['gender'].isnull().sum()

# **signup_method**

In [15]:
# Explore signup_method
# A cell only displays its last expression, so the earlier summaries are
# printed explicitly (and describe is actually called).
print(df['signup_method'].describe())
print(df['signup_method'].isnull().sum())
df['signup_method'].unique()

In [16]:
# Code the nominal values
signup_method_codes = {"basic": 0, "facebook": 1, "google": 2}
# Assign the coded column back instead of calling replace(..., inplace=True) on
# `df.signup_method`, which is a chained operation that leaves `df` untouched
# under pandas Copy-on-Write. Labels outside the mapping become NaN rather
# than remaining in the column as strings.
df['signup_method'] = pd.to_numeric(
    df['signup_method'].replace(signup_method_codes), errors='coerce'
)

# **language**

In [17]:
# Explore language
# Only the last expression of a cell is displayed, so the earlier results are
# printed explicitly instead of being silently discarded.
print(df['language'].isnull().sum())
print(df['language'].describe())
# So we have 25 languages
df['language'].unique()

In [18]:
# Code nominal values
# pd.factorize numbers every distinct language in order of first appearance
# (0, 1, 2, ...), which is the same coding the manual loop produced, but it
# does not depend on a hard-coded count of 25 languages. Missing values, if
# any, would be coded -1.
# `language_labels[code]` recovers the original label of a code.
language_codes, language_labels = pd.factorize(df['language'])

# Assign back to the column; an inplace replace on `df.language` is a chained
# operation that does not update `df` under pandas Copy-on-Write.
df['language'] = language_codes
df['language'].unique()

# **Age**

In [19]:
# Age statistics before imputation and outlier removal
df['age'].describe()

In [20]:
# Derive the outlier fences from the ages that were actually observed.
# Imputing first would let impossible ages (e.g. 2014) inflate the mean, and
# would pile every imputed row onto one value, which distorts the quartiles.
observed_age = df['age'].dropna()
Q1 = observed_age.quantile(0.25)
Q3 = observed_age.quantile(0.75)
IQR = Q3 - Q1
# Tukey fences: anything further than 1.5 * IQR outside the quartiles is an outlier.
l = Q1 - 1.5 * IQR
u = Q3 + 1.5 * IQR

# Keep rows whose age lies inside the fences (bounds included) or is missing;
# the missing ones are imputed below. `.copy()` makes the filtered frame
# independent, so later column assignments do not act on a slice.
age_in_range = df['age'].between(l, u)
df = df[age_in_range | df['age'].isna()].copy()

# Impute the missing ages with the mean of the retained, plausible ages.
# The column is assigned back because an inplace fillna on `df['age']` is a
# chained operation that does not update `df` under pandas Copy-on-Write.
df['age'] = df['age'].fillna(df['age'].mean())
df.describe()

# **date_first_booking**

In [21]:
# Summary of the first-booking dates
df['date_first_booking'].describe()

In [22]:
# Turn any literal 'NaN' strings into real missing values.
# The column is assigned back: replace(..., inplace=True) on
# `df.date_first_booking` is a chained operation that does not update `df`
# under pandas Copy-on-Write.
df['date_first_booking'] = df['date_first_booking'].replace('NaN', np.nan)
# Number of users without a first booking date
df['date_first_booking'].isnull().sum()

# **Signup App**

In [23]:
# Only the last expression of a cell is displayed, so the summary is printed
# explicitly before the distinct values are shown.
print(df['signup_app'].describe())
df['signup_app'].unique()

In [24]:
# Integer code for each sign-up application.
signup_app_codes = {"Web": 0, "Moweb": 1, "iOS": 2, "Android": 3}
# Assign the coded column back instead of using a chained inplace replace on
# `df.signup_app`, which does not update `df` under pandas Copy-on-Write.
# Labels outside the mapping become NaN rather than remaining as strings.
df['signup_app'] = pd.to_numeric(
    df['signup_app'].replace(signup_app_codes), errors='coerce'
)

# **affiliate_provider**

In [25]:
# Summary of the affiliate provider labels
df['affiliate_provider'].describe()

In [26]:
# Number the providers 0, 1, 2, ... in order of first appearance (the same
# coding as the manual loop) without assuming there are exactly 18 of them;
# the hard-coded count fails as soon as earlier row filtering removes a provider.
# Missing values, if any, would be coded -1.
# `affiliate_provider_labels[code]` recovers the original label of a code.
affiliate_provider_codes, affiliate_provider_labels = pd.factorize(df['affiliate_provider'])

# Assign back to the column; a chained inplace replace on
# `df.affiliate_provider` does not update `df` under pandas Copy-on-Write.
df['affiliate_provider'] = affiliate_provider_codes

# **first_device_type**

In [27]:
# Summary of the first device type labels
df['first_device_type'].describe()

In [28]:
# Number the device types 0, 1, 2, ... in order of first appearance (the same
# coding as the manual loop) without assuming there are exactly 9 of them.
# Missing values, if any, would be coded -1.
# `first_device_type_labels[code]` recovers the original label of a code.
first_device_type_codes, first_device_type_labels = pd.factorize(df['first_device_type'])

# Assign back to the column; a chained inplace replace on
# `df.first_device_type` does not update `df` under pandas Copy-on-Write.
df['first_device_type'] = first_device_type_codes

# **date_account_created**

In [29]:
# Both results are printed: a bare expression that is not the last line of a
# cell is evaluated and then discarded without being shown.
print(df['date_account_created'].describe())
print(df['date_account_created'].isnull().sum())

# **first_affiliate_tracked**

In [30]:
# The summary and the missing-value count are printed explicitly; only the
# last expression of a cell is displayed automatically.
print(df['first_affiliate_tracked'].describe())
print(df['first_affiliate_tracked'].isnull().sum())
df['first_affiliate_tracked'].unique()


In [31]:
# Missing values get their own category, as in the manual coding: they are
# given an explicit placeholder label so that factorize numbers them like any
# other value instead of using its -1 sentinel.
tracked_filled = df['first_affiliate_tracked'].fillna('-missing-')

# Codes are assigned in order of first appearance (0, 1, 2, ...), without
# assuming there are exactly 8 distinct values.
# `first_affiliate_tracked_labels[code]` recovers the original label of a code.
first_affiliate_tracked_codes, first_affiliate_tracked_labels = pd.factorize(tracked_filled)

# Assign back to the column; a chained inplace replace on
# `df.first_affiliate_tracked` does not update `df` under pandas Copy-on-Write.
df['first_affiliate_tracked'] = first_affiliate_tracked_codes

# Print the distinct codes explicitly; only the last expression is displayed.
print(df['first_affiliate_tracked'].unique())
df['first_affiliate_tracked'].describe()

# **first_browser**

In [32]:
# The sample of the first 50 rows is printed explicitly; only the last
# expression of a cell is displayed automatically.
print(df['first_browser'].head(50))
df['first_browser'].unique()

In [33]:
# Users per browser label (missing values included) before the unknown
# label is replaced.
browser_counts = df['first_browser'].value_counts(dropna=False)
browser_ax = browser_counts.plot(kind='bar')
browser_ax.set_xlabel('first_browser')
browser_ax.set_ylabel('Users count')
sns.despine()

In [34]:
# Replace the '-unknown-' browser label with '-chrome-'. The original note
# gives Chrome as the fill value; for a nominal feature the relevant notion is
# the mode (most frequent value), not the median.
# NOTE(review): '-chrome-' is a new label of its own, so the unknown rows stay
# in a separate category rather than being merged into an existing browser
# label - confirm whether that is intended. Changing the label alters the
# number of distinct browsers, so review any downstream encoding before doing so.
# The column is assigned back because replace(..., inplace=True) on
# `df.first_browser` is a chained operation that does not update `df` under
# pandas Copy-on-Write.
df['first_browser'] = df['first_browser'].replace('-unknown-', '-chrome-')

In [35]:
# Same bar chart as before, redrawn after the unknown label was replaced.
ax_browser_after = df['first_browser'].value_counts(dropna=False).plot(kind='bar')
ax_browser_after.set(xlabel='first_browser', ylabel='Users count')
sns.despine()

In [36]:
# Number the browsers 0, 1, 2, ... in order of first appearance (the same
# coding as the manual loop) without assuming there are exactly 52 of them.
# Missing values, if any, would be coded -1.
# `first_browser_labels[code]` recovers the original label of a code.
first_browser_codes, first_browser_labels = pd.factorize(df['first_browser'])

# Assign back to the column; a chained inplace replace on `df.first_browser`
# does not update `df` under pandas Copy-on-Write.
df['first_browser'] = first_browser_codes
df['first_browser'].unique()

# **country_destination**

In [37]:
# The sample of the first 50 rows is printed explicitly; only the last
# expression of a cell is displayed automatically.
print(df['country_destination'].head(50))
df['country_destination'].unique()

In [38]:
# Number the destinations 0, 1, 2, ... in order of first appearance (the same
# coding as the manual loop) without assuming there are exactly 12 of them.
# Missing values, if any, would be coded -1.
# `country_destination_labels[code]` maps a class code back to its country
# label, e.g. for decoding model predictions.
country_destination_codes, country_destination_labels = pd.factorize(df['country_destination'])

# Assign back to the column; a chained inplace replace on
# `df.country_destination` does not update `df` under pandas Copy-on-Write.
df['country_destination'] = country_destination_codes
df['country_destination'].unique()

# **timestamp_first_active**

In [39]:
# In this dataset timestamp_first_active is an integer of the form
# YYYYMMDDHHMMSS. Passing such integers straight to pd.to_datetime treats them
# as nanoseconds since the Unix epoch, so every row collapses to 1970-01-01
# (a Thursday). The values are therefore converted to strings and parsed with
# an explicit format.
first_active = pd.to_datetime(
    df['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S'
)
# Keep only the day of the week (Monday=0 ... Sunday=6) as the feature.
df['timestamp_first_active'] = first_active.dt.weekday
df['timestamp_first_active'].describe()

In [40]:
# Summary of the rows whose first-active weekday is 3 (Thursday)
is_thursday = df['timestamp_first_active'] == 3
df.loc[is_thursday, 'timestamp_first_active'].describe()


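If every row reports weekday 3 (Thursday), the activity is not really concentrated on a single day: it means the integer `YYYYMMDDHHMMSS` values were parsed as nanoseconds since the Unix epoch, so every timestamp collapsed to 1 January 1970, which was a Thursday.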

In [41]:
# Percentage of users per first-active weekday, drawn on an explicit axes.
fig, ax = plt.subplots()
weekday_counts = df['timestamp_first_active'].value_counts()
timestamp_first_active_percentage = weekday_counts / len(df) * 100
timestamp_first_active_percentage.plot(kind='bar', color='#3498DB', ax=ax)
ax.set_xlabel('timestamp_first_active')
ax.set_ylabel('Percentage')
sns.despine()

# **ID**

In [42]:
# The id column is removed: at this stage we are not interested in
# producing per-user predictions, so it is not needed.
df.drop(columns=['id'], inplace=True)

df.head()

# **signup_flow**

In [43]:
# Summary statistics of the signup_flow column
df['signup_flow'].describe()

In [44]:
# Percentage of users per signup_flow value.
fig, ax = plt.subplots()
signup_flow_percentage = df.signup_flow.value_counts() / df.shape[0] * 100
signup_flow_percentage.plot(kind='bar',color='#3498DB')
# The x axis shows signup_flow values; the previous label ('affilate_channel')
# was copied from the affiliate_channel plot.
plt.xlabel('signup_flow')
plt.ylabel('Percentage')
sns.despine()

# **affiliate_channel**

In [45]:
# Percentage of users per affiliate channel, drawn on an explicit axes.
fig, ax = plt.subplots()
channel_counts = df['affiliate_channel'].value_counts()
affiliate_channel_percentage = channel_counts / len(df) * 100
affiliate_channel_percentage.plot(kind='bar', color='#3498DB', ax=ax)
ax.set_xlabel('affiliate_channel')
ax.set_ylabel('Percentage')
sns.despine()

In [46]:
# Integer code for each affiliate channel.
affiliate_channel_codes = {
    "direct": 0,
    "sem-brand": 1,
    "sem-non-brand": 2,
    "other": 3,
    "seo": 4,
    "api": 5,
    "content": 6,
    "remarketing": 7,
}
# Assign the coded column back instead of using chained inplace replaces on
# `df.affiliate_channel`, which do not update `df` under pandas Copy-on-Write.
# Labels outside the mapping become NaN rather than remaining as strings.
df['affiliate_channel'] = pd.to_numeric(
    df['affiliate_channel'].replace(affiliate_channel_codes), errors='coerce'
)
df.head()

# Extracting date parts from date_account_created (only the month and day are kept as features; the year is not kept)


In [47]:
# Parse the account-creation date and keep its month and day as features.
account_created = pd.to_datetime(df['date_account_created'])
df['ac_month'] = account_created.dt.month
df['ac_day'] = account_created.dt.day
# The raw date column is no longer needed. The year is not kept as a feature,
# so it is not materialised as a column at all.
# `columns=` is used because the positional `axis` argument of DataFrame.drop
# was removed in pandas 2.0.
df.drop(columns=['date_account_created'], inplace=True)
df

In [48]:
# Disabled experiment: turn the account-creation year into an ordered categorical.
# NOTE(review): df has no acc_year column at this point in the notebook, so
# this code would fail if it were re-enabled as is.
#years = [2010,2011,2012,2013,2014]
#yearsOrder=pd.api.types.CategoricalDtype(ordered=True, categories=years)
#df.acc_year = df.acc_year.astype(yearsOrder)
#df

# Split the dataframe into X and y

In [49]:
# Features: every column except the target and date_first_booking.
# `columns=` is used because the positional `axis` argument of DataFrame.drop
# was removed in pandas 2.0.
X = df.drop(columns=['country_destination', 'date_first_booking'])
# Target kept as a one-column DataFrame.
y = df[['country_destination']]

In [50]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [51]:
# Inspect the training features
X_train


In [52]:
# Inspect the target column
y

In [56]:
from xgboost.sklearn import XGBClassifier

# Hyper-parameters of the gradient-boosted classifier, gathered in one place.
xgb_params = dict(
    max_depth=6,
    learning_rate=0.3,
    n_estimators=25,
    objective='multi:softprob',
    subsample=0.5,
    colsample_bytree=0.5,
    seed=0,
    use_label_encoder=False,
)
xgb = XGBClassifier(**xgb_params)
# Fit on the training split; the fitted estimator is the cell output.
xgb.fit(X_train, y_train)

In [57]:
# Class-probability predictions of the fitted model for the held-out rows
y_pred = xgb.predict_proba(X_test)

In [62]:
from sklearn.metrics import accuracy_score, f1_score

# The predicted class is the column index with the highest probability.
y_pred_labels = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_test, y_pred_labels)
weighted_f1 = f1_score(y_test, y_pred_labels, average='weighted')
# (accuracy, weighted F1) on the held-out split
accuracy, weighted_f1