In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import math
import scipy
import random
from catboost import CatBoostRegressor
%matplotlib inline

In [2]:
# Load datasets
df_train = pd.read_csv('data/tcd-ml-1920-group-income-train.csv')
df_test = pd.read_csv('data/tcd-ml-1920-group-income-test.csv')
df_sub = pd.read_csv('data/tcd-ml-1920-group-income-submission.csv')

# Define shorter names
new_columns = {
    'Year of Record': 'Year',
    'Housing Situation': 'House',
    'Crime Level in the City of Employement': 'Crime',
    'Work Experience in Current Job [years]': 'Experience',
    'Satisfation with employer': 'Satisfaction',
    'University Degree':'Degree',
    'Wears Glasses': 'Glasses',
    'Hair Color': 'Hair',
    'Body Height [cm]':'Height',
    'Yearly Income in addition to Salary (e.g. Rental Income)': 'Other Income',
    'Total Yearly Income [EUR]': 'Income'
          }

# Rename columns
df_train.rename(columns = new_columns, inplace = True)
df_test.rename(columns = new_columns, inplace = True)

# Remove trailing 'EUR' from entries
df_test['Other Income'] = df_test['Other Income'].str[:-4]
df_train['Other Income'] = df_train['Other Income'].str[:-4]

# Convert from string to int
df_test['Other Income'] = pd.to_numeric( df_test['Other Income'], errors='coerce').astype(np.int32)
df_train['Other Income'] = pd.to_numeric( df_train['Other Income'], errors='coerce').astype(np.int32)

# # Only a handful of '#NUM!' so drop these rows
# df_test = (df_test[df_test['Experience'] != '#NUM!'])
# df_train = (df_train[df_train['Experience'] != '#NUM!'])

# # Only 4013 missing years so drop
# df_test = (df_test[df_test['Year'].notnull()])
# df_train = (df_train[df_train['Year'].notnull()])

# # Only 2805 missing Professions so drop
# df_test = df_test[df_test['Profession'].notnull()]
# df_train = df_train[df_train['Profession'].notnull()]

# # There's 1 missing Country in test set so drop row
# df_test = df_test[df_test['Country'].notnull()]

# Convert strings to floats
df_test['Experience'] = pd.to_numeric(df_test['Experience'], errors='coerce')
df_train['Experience'] = pd.to_numeric(df_train['Experience'], errors='coerce')

#  Fix missing values
exp_test_median = df_test['Experience'].median()
exp_train_median = df_train['Experience'].median()
df_test['Experience'].fillna(exp_test_median, inplace=True)
df_train['Experience'].fillna(exp_train_median, inplace=True)

year_test_median = df_test['Year'].median()
year_train_median = df_train['Year'].median()
df_test['Year'].fillna(year_test_median, inplace=True)
df_train['Year'].fillna(year_train_median, inplace=True)

df_test['Profession'].fillna('Unknown', inplace=True)
df_train['Profession'].fillna('Unknown', inplace=True)

df_test['Country'].fillna('Unknown', inplace=True)
df_train['Country'].fillna('Unknown', inplace=True)

# Convert to ints for space
df_test['Year'] = df_test['Year'].astype(np.int16)
df_train['Year'] = df_train['Year'].astype(np.int16)

df_test['Experience'] = df_test['Experience'].astype(np.int8)
df_train['Experience'] = df_train['Experience'].astype(np.int8)

df_test['Crime'] = df_test['Crime'].astype(np.uint8)
df_train['Crime'] = df_train['Crime'].astype(np.uint8)

df_test['Age'] = df_test['Age'].astype(np.uint8)
df_train['Age'] = df_train['Age'].astype(np.uint8)

df_test['Height'] = df_test['Height'].astype(np.int16)
df_train['Height'] = df_train['Height'].astype(np.int16)

# Drop Instance Column
df_test.drop('Instance', inplace=True, axis=1)
df_train.drop('Instance', inplace=True, axis=1)

# Drop Income column from test set
df_test.drop('Income', inplace=True, axis=1)

# Replace NaNs merge 'f' and 'female'
df_test['Satisfaction'].fillna('Unknown', inplace=True)
df_train['Satisfaction'].fillna('Unknown', inplace=True)

df_test['Gender'].fillna('Unknown', inplace=True)
df_train['Gender'].fillna('Unknown', inplace=True)

df_test['Degree'].fillna('Unknown', inplace=True)
df_train['Degree'].fillna('Unknown', inplace=True)

df_test['Hair'].fillna('Unknown', inplace=True)
df_train['Hair'].fillna('Unknown', inplace=True)

df_test['Gender'].replace('unknown','Unknown',inplace=True)
df_train['Gender'].replace('unknown','Unknown', inplace=True)
df_test['Gender'].replace('f','female', inplace=True)
df_train['Gender'].replace('f','female', inplace=True)

df_test['Country'].replace("""b'Zimbabwe""",'Zimbabwe',inplace=True)

# Apply some preprocessing
# df_test['Experience'] = (df_test["Experience"] - 23).abs()
# df_train['Experience'] = (df_train["Experience"] - 23).abs()

# df_test['Height'] = (df_test['Height'] - 177).abs()
# df_train['Height'] = (df_train['Height'] - 177).abs()

# df_test['Size of City'] = df_test['Size of City'] < 1e7
# df_train['Size of City'] = df_train['Size of City'] < 1e7

# Maybe try sqrt() instead of log
# df_train['Income'] = np.sqrt(df_train['Income'])
# df_train['Income'] = np.log(df_train['Income'])


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df_train_and_test = df_train.append(df_test, ignore_index=True, sort=False)

In [4]:
# The following runs target (income) encoding
cols_to_encode = ['House', 'Satisfaction', 'Gender', 'Country', 'Profession', 'Degree', 'Hair']


for col in cols_to_encode:
    unique_values = df_train_and_test[col].unique()
    for unique_value in unique_values:
        matching_rows = df_train['Income'].loc[ df_train[col] == unique_value ]
        value_avg = matching_rows.mean()
        if value_avg > 0:
            df_test[col].replace(unique_value, value_avg, inplace=True)
            df_train[col].replace(unique_value, value_avg, inplace=True)
        else:
            rand_num = random.randint(10000,150000)
            df_test[col].replace(unique_value, rand_num, inplace=True)
            df_train[col].replace(unique_value, rand_num, inplace=True)

  result = method(y)


In [6]:
Y = df_train['Income']
df_train.drop('Income', inplace=True, axis=1)
X = df_train

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2, random_state = 0)

KeyError: 'Income'

In [7]:
X.dtypes

Year              int16
House           float64
Crime             uint8
Experience         int8
Satisfaction    float64
Gender          float64
Age               uint8
Country         float64
Size of City      int64
Profession      float64
Degree          float64
Glasses           int64
Hair            float64
Height            int16
Other Income      int32
dtype: object

In [None]:
# categorical_features_indices = np.where(X.dtypes != np.float)[0]

In [15]:
reg = CatBoostRegressor(iterations = 70000, loss_function='RMSE', task_type="GPU").fit(X_train, Y_train, metric_period=500)

0:	learn: 109384.7583266	total: 8.01ms	remaining: 9m 20s
500:	learn: 24090.0777346	total: 3.32s	remaining: 7m 40s
1000:	learn: 22362.4102692	total: 6.49s	remaining: 7m 27s
1500:	learn: 21645.1538530	total: 9.73s	remaining: 7m 24s
2000:	learn: 21165.7289333	total: 13s	remaining: 7m 20s
2500:	learn: 20809.8209800	total: 16.2s	remaining: 7m 16s
3000:	learn: 20537.5915109	total: 19.3s	remaining: 7m 11s
3500:	learn: 20301.3714898	total: 22.5s	remaining: 7m 8s
4000:	learn: 20097.3019751	total: 25.8s	remaining: 7m 5s
4500:	learn: 19912.6264170	total: 29s	remaining: 7m 2s
5000:	learn: 19744.4229217	total: 32.2s	remaining: 6m 59s
5500:	learn: 19583.6396756	total: 35.5s	remaining: 6m 55s
6000:	learn: 19442.9928551	total: 38.7s	remaining: 6m 52s
6500:	learn: 19310.5494523	total: 41.9s	remaining: 6m 49s
7000:	learn: 19181.5846451	total: 45.1s	remaining: 6m 46s
7500:	learn: 19061.3236589	total: 48.3s	remaining: 6m 42s
8000:	learn: 18947.0918696	total: 51.6s	remaining: 6m 40s
8500:	learn: 18837.4892

69500:	learn: 13922.3509480	total: 7m 32s	remaining: 3.25s
69999:	learn: 13900.5645431	total: 7m 35s	remaining: 0us


In [16]:
predictions = reg.predict(X_val)
score = metrics.mean_absolute_error( Y_val, predictions )
score

8527.182313007204

In [18]:
reg = CatBoostRegressor(iterations = 70000, task_type="GPU").fit(X, Y, metric_period=500)

0:	learn: 109372.6402370	total: 7.5ms	remaining: 8m 45s
500:	learn: 24219.1124277	total: 3.63s	remaining: 8m 24s
1000:	learn: 22500.0083021	total: 7.18s	remaining: 8m 15s
1500:	learn: 21810.2283184	total: 10.7s	remaining: 8m 9s
2000:	learn: 21346.1499362	total: 14.2s	remaining: 8m 4s
2500:	learn: 21016.1555579	total: 17.8s	remaining: 7m 59s
3000:	learn: 20746.3960472	total: 21.3s	remaining: 7m 55s
3500:	learn: 20520.8016215	total: 24.9s	remaining: 7m 53s
4000:	learn: 20322.2294968	total: 28.4s	remaining: 7m 49s
4500:	learn: 20145.7474942	total: 32s	remaining: 7m 45s
5000:	learn: 19990.7441141	total: 35.6s	remaining: 7m 42s
5500:	learn: 19844.8876830	total: 39.2s	remaining: 7m 39s
6000:	learn: 19711.7947714	total: 42.8s	remaining: 7m 36s
6500:	learn: 19592.8051128	total: 46.3s	remaining: 7m 32s
7000:	learn: 19479.8945524	total: 50s	remaining: 7m 29s
7500:	learn: 19366.5710795	total: 53.6s	remaining: 7m 26s
8000:	learn: 19257.6302632	total: 57.2s	remaining: 7m 23s
8500:	learn: 19158.3956

69500:	learn: 14646.3257220	total: 8m 20s	remaining: 3.6s
69999:	learn: 14626.8342462	total: 8m 24s	remaining: 0us


In [19]:
predictions = reg.predict(df_test)
df_sub['Total Yearly Income [EUR]'] = predictions
df_sub.to_csv('CatBoost_target_enc_median_70000_iter.csv', index=False)

## Results
Preprocessing, one-hot, unmodified income, linear regression: 695,862  
Preprocessing, target encode, unmodified income, linear regression: 63,226  
Preprocessing, target encode, sqrt income, linear regression: 59296  
Preprocessing (no abs on height and Experience), target encode, sqrt income, linear regression: 59186
Preprocessing, target encode, sqrt income, Catboost: 33219  
Preprocessing, target encode, income, Catboost: 15227  
No Height & Exp processing, target encode, income, Catboost: 15116
No Height & Exp processing, target encode, log income, Catboost: 1500
No processing, target encode, normal income, Catboost: 9648  
No processing, target encode, normal income, Catboost(2000): 9270  
No processing, target encode, normal income, Catboost(3000): 9110
No processing, target encode, normal income, Catboost(10000): 8823
No processing, target encode, normal income, Catboost(3000): 8698
No processing, target encode, normal income, Catboost(3000): 8660
No processing, target encode, normal income, Catboost(3000): 8636
No processing, target encode, normal income, Catboost(3000): 8611
No processing, target encode, normal income, Catboost(40,000): 8591
No processing, target encode, normal income, Catboost(50,000): 8560
No processing, target encode, normal income, Catboost(60,000): 8544
No processing, target encode, normal income, Catboost(70,000): 8527