## Load Dependencies

In [None]:
import pandas as pd  
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt  
import seaborn as seabornInstance
import math
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import make_regression
from sklearn.neural_network import MLPRegressor
from sklearn import metrics
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from scipy import stats
from scipy.stats.mstats import gmean
from catboost import CatBoostRegressor
%matplotlib inline

In [None]:
# Boolean row mask used later to split the labelled data into Train (True,
# roughly 80% of rows) and Val (False, the remaining rows).
#
# 111974 is the number of training rows left after removing outliers. It is
# hard-coded because this cell runs before the data is loaded, so it has to be
# updated by hand whenever the input file or the outlier threshold changes;
# a mask of the wrong length makes the split cell fail.
#
# The mask is drawn from a dedicated, seeded generator so that the split (and
# therefore the validation score) is identical after every kernel restart,
# without altering NumPy's global random state.
SPLIT_SEED = 42
N_TRAIN_ROWS_AFTER_OUTLIERS = 111974
TRAIN_FRACTION = 0.8

mask = np.random.RandomState(SPLIT_SEED).rand(N_TRAIN_ROWS_AFTER_OUTLIERS) < TRAIN_FRACTION

## Load Datasets

In [None]:
# Read the three competition CSV files from the working directory: the
# training data (with labels), the test data (without labels) and the
# submission file that the final predictions are written into.
train_csv = 'tcd ml 2019-20 income prediction training (with labels).csv'
test_csv = 'tcd ml 2019-20 income prediction test (without labels).csv'
submission_csv = 'tcd ml 2019-20 income prediction submission file.csv'

df_train, df_test, df_sub = (
    pd.read_csv(csv_path) for csv_path in (train_csv, test_csv, submission_csv)
)

## Remove Outliers

In [None]:
# Keep only the training rows whose income is below 2,000,000; the threshold
# was chosen from earlier examination of the data.
# NOTE(review): the previous comment quoted 200000 while the code has always
# filtered at 2000000 - confirm which value was intended.
#
# .copy() gives df_train its own data instead of leaving it as a filtered
# slice of the loaded frame. Later cells modify df_train in place, which on a
# slice can raise SettingWithCopyWarning and has ambiguous semantics.
df_train = df_train[df_train['Income in EUR'] < 2000000].copy()

## Append Test set onto Training

In [None]:
# Get the number of rows in each set. num_train_rows is what later allows the
# combined frame to be cut back into its training and test parts.
num_train_rows = df_train.shape[0]
num_test_rows = df_test.shape[0]

# Rename the Income column for uniformity (presumably so it matches the column
# name used by the test file - confirm against the CSV headers).
# The result is assigned back rather than renamed with inplace=True, so the
# cell does not mutate a frame that may still be a slice of the loaded data.
df_train = df_train.rename(columns={'Income in EUR': 'Income'})

# Stack the test rows underneath the training rows and renumber the index
# 0..n-1, so the first num_train_rows rows are exactly the training data.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_train_and_test = pd.concat([df_train, df_test], ignore_index=True, sort=False)

## Replace NaNs

In [None]:
# Placeholder label for each categorical column with missing values.
# 'Hair Color' is filled with a capitalised 'Unknown'; the "Merge Similar
# Features" cell folds that spelling into 'unknown'.
categorical_fill = {
    'Gender': 'unknown',
    'Profession': 'unknown',
    'University Degree': 'unknown',
    'Hair Color': 'Unknown',
}

# Replace the missing years and ages with their mean values.
# Series.mean() already skips NaNs, so no dropna() is needed first.
year_mean = df_train_and_test['Year of Record'].mean()
age_mean = df_train_and_test['Age'].mean()
numeric_fill = {'Year of Record': year_mean, 'Age': age_mean}

# Assign each filled column back to the frame. Calling
# df[col].fillna(..., inplace=True) is chained assignment: it emits a
# FutureWarning in pandas 2.2 and silently leaves the frame unchanged once
# Copy-on-Write is enabled.
for column, fill_value in {**categorical_fill, **numeric_fill}.items():
    df_train_and_test[column] = df_train_and_test[column].fillna(fill_value)

## Merge Similar Features

In [None]:
# Collapse the different spellings of "no information" into one label per
# column: the string '0' becomes 'unknown' for Gender and Hair Color and 'No'
# for University Degree, and the capitalised 'Unknown' hair colour is
# lower-cased.
# Each result is assigned back to the frame; Series.replace(..., inplace=True)
# on a column selection is chained assignment, which warns in pandas 2.2 and
# has no effect once Copy-on-Write is enabled.
df_train_and_test['Gender'] = df_train_and_test['Gender'].replace('0', 'unknown')
df_train_and_test['University Degree'] = df_train_and_test['University Degree'].replace('0', 'No')
df_train_and_test['Hair Color'] = df_train_and_test['Hair Color'].replace(['Unknown', '0'], 'unknown')

## Encode Professions using mean Incomes

In [None]:
# Mean-target encoding of 'Profession': every profession label is replaced by
# the mean Income of the training rows that have that profession.
#
# NOTE(review): the means are computed from all of df_train, i.e. including
# the rows that later become the validation split, so the validation score is
# likely to be optimistic.

# Training rows with a missing profession are grouped under 'unknown', the same
# label the combined frame uses for missing professions. A filled copy is used
# so that df_train is not modified through a chained in-place call.
train_professions = df_train['Profession'].fillna('unknown')

# One groupby pass yields the mean income of every profession, instead of
# re-filtering the training rows and rewriting the whole column once per
# unique label. The target is selected by name rather than by assuming it is
# the last column of df_train.
profession_means = df_train['Income'].groupby(train_professions).mean()

# Professions that never occur in the training rows have no entry in
# profession_means and map to NaN; replace those with the total average income.
avg_income = df_train['Income'].mean()
df_train_and_test['Profession'] = (
    df_train_and_test['Profession'].map(profession_means).fillna(avg_income)
)

## Encode Countries

In [None]:
# Mean-target encoding of 'Country': every country label is replaced by the
# mean Income of the training rows recorded for that country.
#
# NOTE(review): the means are computed from all of df_train, i.e. including
# the rows that later become the validation split, so the validation score is
# likely to be optimistic.

# Training rows with a missing country are grouped under 'unknown'. A filled
# copy is used so that df_train is not modified through a chained in-place
# call.
train_countries = df_train['Country'].fillna('unknown')

# One groupby pass yields the mean income of every country, instead of
# re-filtering the training rows and rewriting the whole column once per
# unique label. The target is selected by name rather than by assuming it is
# the last column of df_train.
country_means = df_train['Income'].groupby(train_countries).mean()

# For any stragglers, use the total average income. That covers countries that
# never occur in the training rows and rows whose country is still NaN in the
# combined frame (no cell shown here fills missing 'Country' values there).
avg_income = df_train['Income'].mean()
df_train_and_test['Country'] = (
    df_train_and_test['Country'].map(country_means).fillna(avg_income)
)

## One-hot encode

In [None]:
# Build indicator (dummy) columns for the two categorical features that are
# kept: one column per distinct Gender value and per University Degree value.
genders, degrees = (
    pd.get_dummies(df_train_and_test[source_column], prefix=column_prefix)
    for source_column, column_prefix in (
        ('Gender', 'gender_'),
        ('University Degree', 'degree_'),
    )
)

# Attach the indicator columns to the right of the existing columns
df_train_and_test = pd.concat([df_train_and_test, genders, degrees], axis=1, sort=False)

# Set the target aside before it is removed from the feature frame
y = df_train_and_test['Income']

# Remove the target and the raw categorical columns. Gender and University
# Degree are now represented by their indicator columns; Hair Color is dropped
# without being encoded.
cols_to_drop = ['Income', 'Gender', 'Hair Color', 'University Degree']
df_train_and_test = df_train_and_test.drop(columns=cols_to_drop)

## Split Data

In [None]:
# Undo the earlier concatenation: the first num_train_rows rows are the
# labelled training data, everything after them is the test data.
X_train_full = df_train_and_test.iloc[:num_train_rows]
X_test = df_train_and_test.iloc[num_train_rows:]

Y_train_full = y.iloc[:num_train_rows]
Y_test = y.iloc[num_train_rows:]

# Divide the labelled rows with the boolean mask created at the top of the
# notebook: True rows go to training, False rows to validation.
val_mask = ~mask
X_train, X_val = X_train_full[mask], X_train_full[val_mask]
Y_train, Y_val = Y_train_full[mask], Y_train_full[val_mask]

## Train

In [None]:
# Fit a CatBoost regressor with 1750 boosting iterations on the training
# split only; the validation split is held back for scoring.
n_iterations = 1750
regr = CatBoostRegressor(iterations=n_iterations)
regr.fit(X_train, y=Y_train)

## Test Performance

In [None]:
# Score the fitted model on the validation rows: root of the mean squared
# error between the true and the predicted incomes. The bare `score` on the
# last line displays the value as the cell output.
predictions = regr.predict(X_val)
val_mse = metrics.mean_squared_error(y_true=Y_val, y_pred=predictions)
score = np.sqrt(val_mse)
score

## Train on Full Data

In [None]:
# Refit a fresh model with the same settings on every labelled row (training
# and validation together), then predict the income of the test rows.
# Note that this rebinds both `regr` and `predictions`.
final_params = {'iterations': 1750}
regr = CatBoostRegressor(**final_params)
regr.fit(X_train_full, y=Y_train_full)

predictions = regr.predict(X_test)

## Save Predictions to file

In [None]:
# Store the test-set predictions in the 'Income' column of the submission
# frame and write it out as CSV without the index column.
# The file name (including its spelling) is kept exactly as before so that
# anything expecting that file still finds it.
output_csv = 'predicitons_CatBoost_1750_iter_Mean_encoding.csv'
df_sub['Income'] = predictions
df_sub.to_csv(output_csv, index=False)