In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import csv
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from sklearn import impute 
from sklearn.svm import LinearSVR

In [2]:
def simplify_ages(df):
	df.Age = df.Age.fillna(-0.5)
	bins = (0,15,25,36,50,120)
	group_names = ['0-15', '15-25', '25-36', '36-50', '50-120']
	df.Age = pd.cut(df.Age, bins, labels=group_names)
	return df

def simplify_city_size(df):
	df['Size of City'] = df['Size of City'].fillna(-0.5)
	bins = (-1,0,72734,286000,506092,750000,1184501,25000000,50000000)
	group_names = ['Unknown', '1_quartile', '1b_quartile', '2_quartile', '2_quartileb', '3_quartile', '3_quartileb', '4_quartile']
	df['Size of City'] = pd.cut(df['Size of City'], bins, labels=group_names)
	return df

def simplify_yor(df):
	df['Year of Record'] = df['Year of Record'].fillna(-0.5)
	bins = (-1,1979,1990,2000,2010,2020)
	group_names = ['unknown', '1980s', '1990s', '2000s', '2010s']
	df['Year of Record'] = pd.cut(df['Year of Record'], bins, labels=group_names)
	return df

def simplify_height(df):
	df['Body Height [cm]'] = df['Body Height [cm]'].fillna(-0.5)
	bins = (-1, 90, 160, 175, 191, 270)
	group_names = ['Unknown', '90-160', '160-175', '175-191', '191-270']
	df['Body Height [cm]'] = pd.cut(df['Body Height [cm]'], bins, labels=group_names)
	return df

def drop_features(df):
	df = df.drop('Hair Color', axis=1)
	df = df.drop('Wears Glasses', axis=1)
	df = df.drop('Instance', axis=1)
	#df = df.drop('University Degree', axis=1)
	#df = df.drop('Gender', axis=1)
	#df = df.drop('Profession', axis=1)
	return df

def encode_features(df):
	features = ['University Degree', 'Age', 'Country', 'Size of City', 'Body Height [cm]', 'Year of Record', 'Gender', 'Profession']
	for feature in features:
		le = preprocessing.LabelEncoder()
		df[feature] = le.fit_transform(df[feature].astype(str))
	return df


In [3]:
train_data = pd.read_csv('tcd ml 2019-20 income prediction training (with labels).csv')

So far just imported some libraries, defined some functions and read in our csv file.

In [4]:
train_data.head()

Unnamed: 0,Instance,Year of Record,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Hair Color,Body Height [cm],Income in EUR
0,1,1997.0,0,41.0,Belarus,1239930,steel workers,Bachelor,0,Blond,193,61031.94416
1,2,1996.0,other,41.0,Singapore,1603504,safe event coordinator,Master,0,Black,186,91001.32764
2,3,2018.0,other,28.0,Norway,1298017,receivables/payables analyst,PhD,1,Brown,170,157982.1767
3,4,2006.0,other,33.0,Cuba,751903,fleet assistant,No,1,Black,171,45993.75793
4,5,2010.0,female,46.0,United Arab Emirates,95389,lead trainer,0,0,Blond,188,38022.16217


Having a look at the data...

In [5]:
#plt.figure(figsize=(12,10))
#sns.heatmap(train_data.corr(), annot=True)

In [6]:
train_data[(train_data['Gender'] == '0')] = None
train_data[(train_data['University Degree'] == '0')] = None
train_data = train_data.fillna(train_data.median())
train_data = train_data.fillna("unknown")


Changing all 0's to NAs and filling NAs with median or 'unknown'

In [7]:
train_data = simplify_ages(train_data)
train_data = simplify_city_size(train_data)
train_data = simplify_yor(train_data)
train_data = simplify_height(train_data)
train_data = drop_features(train_data)
train_data.head()

Unnamed: 0,Year of Record,Gender,Age,Country,Size of City,Profession,University Degree,Body Height [cm],Income in EUR
0,1990s,unknown,25-36,unknown,2_quartileb,unknown,unknown,160-175,57224.682505
1,1990s,other,36-50,Singapore,3_quartileb,safe event coordinator,Master,175-191,91001.32764
2,2010s,other,25-36,Norway,3_quartileb,receivables/payables analyst,PhD,160-175,157982.1767
3,2000s,other,25-36,Cuba,3_quartile,fleet assistant,No,160-175,45993.75793
4,1990s,unknown,25-36,unknown,2_quartileb,unknown,unknown,160-175,57224.682505


Simplified some columns and dropped unnecessary features


In [8]:
train_data = encode_features(train_data)


In [9]:
X = train_data[train_data.columns[:-1]]
Y = train_data[train_data.columns[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1) 


Extracting features and income and splitting them into training/testing data

In [10]:
# Linear Regression
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print("\n")
print(regr.score(X_test, y_test))



0.07624150465933854


In [11]:
# Lasso
regr = linear_model.Lasso()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print(regr.score(X_test, y_test))

0.07624151860263129


In [12]:
# Random Forest Regression
rf = RandomForestRegressor(n_estimators = 10, random_state = 42, max_depth=7)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print(rf.score(X_test, y_test))

0.23882179368484993


In [13]:
# SGD Regressor
regr = linear_model.SGDRegressor()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print(regr.score(X_test, y_test))

-2.1961923740752835e+17


In [14]:
print(y_test.head())
print("\n")
for i in range(5):
	print(y_pred[i])

82266      39776.03646
111182    262639.93520
12959      61691.45239
58024     134414.74310
65984      27224.69960
Name: Income in EUR, dtype: float64


76444805120539.39
-181179398912.44656
69811234944467.31
61381686646974.61
62410045707793.94


Some different algorithms and a comparison of the first 5 from test data vs predicted data

In [None]:
# SVR
clf = LinearSVR(gamma='scale', C=1.0, epsilon=0.2)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))