## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import chardet
import seaborn as sns
import math
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Read the three reference tables (age/gender buckets, countries, users),
# in that order, straight into DataFrames.
age_gender, countries, users = map(
    pd.read_csv,
    ('data/age_gender_bkts.csv', 'data/countries.csv', 'data/train_users_2.csv'),
)

## The sessions CSV is very large and requires a lot of memory to read, so we read it in chunks and append each chunk to a list that pandas later concatenates into a single dataframe.

In [3]:
# Pull the sessions file in 10,000-row chunks, keep every chunk in a list,
# then stitch the chunks together into one DataFrame.
session_list = list(pd.read_csv('data/sessions.csv', chunksize=10000))

sessions = pd.concat(session_list)

In [4]:
# Preview the first rows of the concatenated sessions table.
sessions.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


In [5]:
# Show a single row of the age/gender bucket table to see its columns.
age_gender.head(1)

Unnamed: 0,age_bucket,country_destination,gender,population_in_thousands,year
0,100+,AU,male,1.0,2015.0


In [6]:
# Show a single row of the countries table to see its columns.
countries.head(1)

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0


In [7]:
# Show a single row of the users table to see its columns.
users.head(1)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF


### User and Session data frames can be joined by a user id
* Could gather useful information, such as what users spend the most time on in the Airbnb site, or what they do most frequently on the site.

### User and Country can be joined by country_destination
* Good for looking at size and main language of country destination

### User and Age_Gender_Bkts can be joined on gender/country_dest/modified age
* This may be useful for getting the total population of the country they are headed to, or the population of the age/gender group they are in. Possibly also the female-to-male ratio of their age group, or the size of their age group relative to the destination's entire population.

In [8]:
# Build a synthetic row for the catch-all 'other' destination: numeric fields
# take the mean over the rows of `countries`, and the language is copied from
# the first row.
other_dict = {
    'country_destination': 'other',
    'lat_destination': countries['lat_destination'].mean(),
    'lng_destination': countries['lng_destination'].mean(),
    'distance_km': countries['distance_km'].mean(),
    'destination_km2': countries['destination_km2'].mean(),
    # The trailing space in this key is kept on purpose: it is the exact label
    # used to index `countries` on the right-hand side.
    'destination_language ': countries['destination_language '].iloc[0],
    'language_levenshtein_distance': countries['language_levenshtein_distance'].mean()
}

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add the row and works on older versions too.
# NOTE(review): as in the original cell, the combined frame is only displayed;
# `countries` itself is left unchanged. Assign the result if the 'other' row is
# meant to take part in later merges.
pd.concat([countries, pd.DataFrame([other_dict])], ignore_index=True)
# maybe get a dynamic dictionary of languages based on the distance from where they live ?

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
1,CA,62.393303,-96.818146,2828.1333,9984670.0,eng,0.0
2,DE,51.165707,10.452764,7879.568,357022.0,deu,72.61
3,ES,39.896027,-2.487694,7730.724,505370.0,spa,92.25
4,FR,46.232193,2.209667,7682.945,643801.0,fra,92.06
5,GB,54.63322,-3.432277,6883.659,243610.0,eng,0.0
6,IT,41.87399,12.564167,8636.631,301340.0,ita,89.4
7,NL,52.133057,5.29525,7524.3203,41543.0,nld,63.22
8,PT,39.553444,-7.839319,7355.2534,92090.0,por,95.45
9,US,36.966427,-95.84403,0.0,9826675.0,eng,0.0


In [9]:
# Outer join on the destination code, so rows from both tables are retained
# even when the other side has no matching destination.
user_and_country = users.merge(countries, on='country_destination', how='outer')

In [10]:
def getAgeList(row):
    """Expand a row's age-bucket label into the list of integer ages it covers.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['age_bucket']`` as a string such as ``'95-99'``
        or ``'100+'``.

    Returns
    -------
    list of int
        Every age from the lower to the upper bound (inclusive) for a ranged
        bucket, or a one-element list holding the lower bound for an
        open-ended bucket such as ``'100+'``.
    """
    bounds = row['age_bucket'].replace('+', '').split('-')

    if len(bounds) > 1:
        return list(range(int(bounds[0]), int(bounds[1]) + 1))

    # Open-ended bucket: convert to int so the element type matches the ranged
    # case (the label text itself used to be returned, e.g. ['100']).
    return [int(bounds[0])]

# Attach the expanded list of ages to every bucket row.
age_gender['age_list'] = age_gender.apply(getAgeList, axis=1)

In [11]:
# Check the new age_list column on the first five bucket rows.
age_gender.head(5)

Unnamed: 0,age_bucket,country_destination,gender,population_in_thousands,year,age_list
0,100+,AU,male,1.0,2015.0,[100]
1,95-99,AU,male,9.0,2015.0,"[95, 96, 97, 98, 99]"
2,90-94,AU,male,47.0,2015.0,"[90, 91, 92, 93, 94]"
3,85-89,AU,male,118.0,2015.0,"[85, 86, 87, 88, 89]"
4,80-84,AU,male,199.0,2015.0,"[80, 81, 82, 83, 84]"


In [12]:
# For each user, count how often every action_detail value occurs.
x = (
    sessions
    .groupby('user_id')['action_detail']
    .value_counts()
)

In [13]:
# Show the first (user_id, action_detail) count.
x.head(1)

user_id     action_detail
00023iyk9l  p3               6
Name: action_detail, dtype: int64

In [14]:
# Expand each age bucket into one row per individual age so that users can be
# matched on an exact age.
l = age_gender['age_list'].str.len()  # number of ages listed for each bucket
cols = age_gender.columns.difference(['age_list'])

df = pd.DataFrame(
    {col: np.repeat(age_gender[col].values, l) for col in cols}
).assign(age=np.concatenate(age_gender['age_list'].values).astype(int))

# NOTE(review): the join keys are age and country only. Both frames also carry
# a `gender` column, so a user row pairs with every gender row of the matching
# age/country — confirm that this is intended.
user_country_buckets = (
    pd.merge(df, user_and_country, on=['age', 'country_destination'])
    .drop(['age_bucket', 'year'], axis=1)
)

In [15]:
# Number of non-null ids in the age/country-merged frame.
user_country_buckets['id'].count()
# AFTER MERGE = 119596 (value shown in the recorded output; this note previously said 119595)

119596

In [16]:
# Number of non-null ids in the raw users table.
users['id'].count()

213451

In [17]:
# Number of non-null destination codes in the countries table.
countries['country_destination'].count()

10

In [18]:
# Number of non-null ids after joining users with countries.
user_and_country['id'].count()

213451

In [19]:
def convertUnknown(row):
    """Return the row's ``first_browser``, mapping the ``'-unknown-'`` placeholder to ``'Other'``."""
    browser = row['first_browser']
    return 'Other' if browser == '-unknown-' else browser

# Fold the '-unknown-' browser placeholder into 'Other' and store the column
# as a categorical. A vectorised replace does in one pass what the row-wise
# apply over every row did with one Python call per row.
user_and_country['first_browser'] = (
    user_and_country['first_browser']
    .replace('-unknown-', 'Other')
    .astype('category')
)

# 213451 Before getting rid of no-bookings
# (The bare id count that stood here was never displayed, because it was not
# the last expression of the cell, so it has been removed.)

# Remove user who did not make a booking
user_and_country = (
    user_and_country
    .loc[user_and_country['country_destination'] != 'NDF']
    # errors='ignore' lets the cell be re-run after the column is already gone.
    .drop(columns='language_levenshtein_distance', errors='ignore')
)

In [None]:
# Fill missing ages with the mean age, rounded up to a whole number.
age_column = user_and_country['age']
avg_age = math.ceil(age_column.mean())

user_and_country['age'] = age_column.fillna(avg_age)
# Original note: "Gender, First Affiliate Tracked," — presumably the next
# columns to clean; confirm.

In [None]:
# Swarm plot of age against gender for the first 500 rows only.
sns.swarmplot(data=user_and_country.head(500), x='gender', y='age')
plt.show()

# START LOOKING FOR DATA TO FILL,CHANGE, OR CLEAN

In [None]:
# Flag, per column, whether the merged frame contains any missing value.
user_country_buckets.isna().any()