In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt


In [2]:
def FeatureExtraction(dataFrame,columnName):

    # OneHeartEncoder
    encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

    #reshape the column
    column = dataFrame[columnName]
    column = np.array(column).reshape(-1,1)

    #Extract and join the data frame
    dataFrame = dataFrame.join(pd.DataFrame(encoder.fit_transform(column),columns=encoder.categories_,index=dataFrame.index))

    #Remove the Column
    dataFrame = dataFrame.drop([columnName], axis = 1)

    return dataFrame



In [3]:
def ExtractingSplFeatures(uniques,dataFrame,columnName):
    
    # OneHeartEncoder
    encoder = OneHotEncoder(categories = [uniques],sparse = False, handle_unknown = 'ignore')

    #reshape the column
    column = dataFrame[columnName]
    column = np.array(column).reshape(-1,1)
    print(column)
    #Extract the column and join the data frame
    dataFrame = dataFrame.join(pd.DataFrame(encoder.fit_transform(column),columns=encoder.categories_,index=dataFrame.index))

    #Remove the profession Column
    dataFrame = dataFrame.drop([columnName], axis = 1)

    return dataFrame

In [16]:
data = pd.read_csv('training-data-with-labels.csv')
data_test = pd.read_csv('test-data-without-labels.csv')

data = data[(data['Income in EUR']>0) & (data['Income in EUR']<2600000)]

#data = data.drop(['Hair Color'], axis=1)
#data_test = data_test.drop(['Hair Color'], axis=1)
#data = data.drop(['Size of City'], axis=1)
#data_test = data_test.drop(['Size of City'], axis=1)
#data = data.drop(['Instance'], axis=1)
#data_test = data_test.drop(['Instance'], axis=1)


#data_Q1 = data.quantile(0.25)
#data_Q3 = data.quantile(0.75)
#data_IQR = data_Q3 - data_Q1
#data = data[~((data < (data_Q1 - 1.5 * data_IQR)) |(data > (data_Q3 + 1.5 * data_IQR))).any(axis=1)]

#data_test_Q1 = data_test.quantile(0.25)
#data_test_Q3 = data_test.quantile(0.75)
#data_test_IQR = data_test_Q3 - data_test_Q1
#data_test = data_test[~((data_test < (data_test_Q1 - 1.5 * data_test_IQR)) |(data_test > (data_test_Q3 + 1.5 * data_test_IQR))).any(axis=1)]


In [17]:
data['Year of Record'].fillna(data['Year of Record'].mean(), inplace=True)   #filling missing numeric values
data['Age'].fillna(data['Age'].mean(), inplace=True)
data['Gender'] = data['Gender'].fillna('unknown gender')
data['Profession'] = data['Profession'].fillna('Unknown Profession')
data['University Degree'] = data['University Degree'].fillna('No degree')
data['Hair Color'] = data['Hair Color'].fillna('Unknown hair')


data_test['Year of Record'].fillna(data_test['Year of Record'].mean(), inplace=True)   #filling missing numeric values
data_test['Age'].fillna(data_test['Age'].mean(), inplace=True)
data_test['Gender'] = data_test['Gender'].fillna('unknown gender')
data_test['Profession'] = data_test['Profession'].fillna('unknown Profession')
data_test['University Degree'] = data_test['University Degree'].fillna('No degree')
data_test['Hair Color'] = data_test['Hair Color'].fillna('Unknown hair')


In [18]:
data['Gender'] = data['Gender'].replace(['0','unknown' ], 'unknown gender')                   #replacing 0s with categorical values
data['University Degree'] = data['University Degree'].replace(['0','#N/A'], 'No degree')
data['Hair Color'] = data['Hair Color'].replace(['0','unknown'], 'Unknown Hair')
#data['Year of Record'] = data['Year of Record'].replace(['0','#N/A'], 'Unknown Profession')

data_test['Gender'] = data_test['Gender'].replace(['0','unknown'], 'unknown')           #replacing 0s with categorical values
data_test['University Degree'] = data_test['University Degree'].replace(['0','#N/A'], 'No degree')
data_test['Hair Color'] = data_test['Hair Color'].replace(['0','unknown'], 'Unknown')
#data_test['Year of Record'] = data_test['Year of Record'].replace(['0','#N/A'], 'Unknown')


In [19]:
data = FeatureExtraction(data, 'Gender')
data = FeatureExtraction(data, 'University Degree')
data = FeatureExtraction(data, 'Hair Color')
data_unique_prof = data['Profession'].unique()
data_unique_country = data['Country'].unique()
data = ExtractingSplFeatures(data_unique_prof, data, 'Profession')
data = ExtractingSplFeatures(data_unique_country, data, 'Country')

data_test = FeatureExtraction(data_test, 'Gender')
data_test = FeatureExtraction(data_test, 'University Degree')
data_test = FeatureExtraction(data_test, 'Hair Color')
data_test = ExtractingSplFeatures(data_unique_prof, data_test, 'Profession')
data_test = ExtractingSplFeatures(data_unique_country, data_test, 'Country')


[['steel workers']
 ['safe event coordinator']
 ['receivables/payables analyst']
 ...
 ['messenger']
 ['senior case support associate']
 ['project manager ']]
[['Belarus']
 ['Singapore']
 ['Norway']
 ...
 ['Sri Lanka']
 ['Denmark']
 ['State of Palestine']]
[['senior project analyst']
 ['greeter']
 ['liaison']
 ...
 ['neigborhood resiliency specialist']
 ['it infrastructure project manager']
 ['materials engineer']]
[['Honduras']
 ['Kyrgyzstan']
 ['Portugal']
 ...
 ['Sweden']
 ['Netherlands']
 ['Tunisia']]


In [20]:
X = data.drop(['Income in EUR'], axis = 1)
y = data['Income in EUR']

xTrain, xValidate, yTrain, yValidate = train_test_split(X, y, test_size = 0.2, random_state = 0)

X_test =  data_test.drop("Income", axis=1)




#dummy_data = data.copy
#X_train = dummy_data.drop("Income in EUR", axis=1, inplace=False)
#y_train = data['Income in EUR']
#X_test = data_test.drop("Income in EUR", axis=1, inplace=False)

In [21]:
#applying the model
#regressor = LinearRegression()
regressor = BayesianRidge()
regressor.fit(xTrain, yTrain)
#regressor.fit(X, y)
y_pred = regressor.predict(xValidate)
result = regressor.predict(X_test)
res = pd.DataFrame(X_test['Instance'])
res['Income'] = result
res.index = X_test.index # for comparison
res.to_csv("tcd ml 2019-20 income prediction submission file.csv")
#y_rmse = np.y_pred[1:22400, :]
rms = np.sqrt(mean_squared_error(yValidate, y_pred))
print("rmes is" + str(rms))
#y_pred = pd.DataFrame(y_pred, columns=['Income']).to_csv('prediction.csv')

rmes is78531.66694608485
