In [5]:
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Preprocessing

In [0]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns
import pickle
import sys
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
# from catboost import CatBoostRegressor
# from category_encoders import TargetEncoder

import lightgbm
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

## importing Dataset
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/tcd-ml-1920-group-income-train.csv', dtype={'Year of Record': object, 'Housing Situation': object, 'Work Experience in Current Job [years]': object})
test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/tcd-ml-1920-group-income-test.csv', dtype={'Year of Record': object, 'Housing Situation': object, 'Work Experience in Current Job [years]': object})
dataset = pd.concat([train,test],ignore_index=True)

def preprocessing(dataset):

    # Converting Dataset to numeric type
    dataset['Year of Record'] = pd.to_numeric(dataset['Year of Record'])
    dataset['Total Yearly Income [EUR]'] = pd.to_numeric(dataset['Total Yearly Income [EUR]'])
    
    # Year of Record
    dataset['Year of Record'] = dataset['Year of Record'].fillna(1940)

    # Housing
    dataset['Housing Situation'] = dataset['Housing Situation'].replace('0', 'Unknown House')
    dataset['Housing Situation'] = dataset['Housing Situation'].replace('nA', 'Unknown House')
    dataset['Housing Situation'] = dataset['Housing Situation'].fillna('Unknown House')

    # Crime Level
    dataset['Crime Level in the City of Employement'] = dataset['Crime Level in the City of Employement'].fillna(0)

    # Work exp
    dataset['Work Experience in Current Job [years]'] = dataset['Work Experience in Current Job [years]'].fillna(0)
    dataset["Work Experience in Current Job [years]"] = dataset["Work Experience in Current Job [years]"].replace('#NUM!',0)
    dataset['Work Experience in Current Job [years]'] = pd.to_numeric(dataset['Work Experience in Current Job [years]'])


    # Satisfaction
    dataset["Satisfation with employer"] = dataset["Satisfation with employer"].fillna("Unknown Satisfaction")


    # Gender
    dataset['Gender'] = dataset['Gender'].replace({'f':'female', '0':'unknown'})
    dataset['Gender'] = dataset['Gender'].fillna('unknown')

    # Age
    dataset['Age'] = dataset['Age'].fillna(0)

    # Country
    dataset['Country'] = dataset['Country'].replace('0', 'Unknown Country')
    dataset['Country'] = dataset['Country'].replace('#N/A', 'Unknown Country')


    # Size city
    dataset['Size of City'] = dataset['Size of City'].fillna(0)

    # Profession
    dataset['Profession'] = dataset['Profession'].replace('0', 'Unknown Profession')

    # Degree
    dataset['University Degree'] = dataset['University Degree'].replace('0', 'Unknown University').fillna('Unknown University')

    # Glasses
    dataset['Wears Glasses'] = dataset['Wears Glasses']+1

    # Hair
    dataset['Hair Color'] = dataset['Hair Color'].replace({'0':'unknown'})
    dataset['Hair Color'] = dataset['Hair Color'].fillna('unknown')

    # Height was changed using gaussian distribution height feature
    '''
    dataset.loc[(dataset['Body Height [cm]'] > 145) & (dataset['Body Height [cm]'] < 200), 'Average Height'] = 1
    dataset.loc[(dataset['Body Height [cm]'] <= 145) | (dataset['Body Height [cm]'] >= 200), 'Average Height'] = 0
    '''

    # Add salary
    dataset['Yearly Income in addition to Salary (e.g. Rental Income)'] = dataset['Yearly Income in addition to Salary (e.g. Rental Income)'].replace(' EUR', '', regex=True).astype('float')
    
    # dropping some columns
    # dataset = dataset.drop(columns=['University Degree'])
    # dataset = dataset.drop(columns=['Wears Glasses'])
    # dataset = dataset.drop(columns=['Year of Record'])
    # dataset = dataset.drop(columns=['Gender'])
    # dataset = dataset.drop(columns=['Crime Level in the City of Employement'])

    return dataset

def addCountFeatures(dataset,features):
    # print(dataset.columns)
    for feature in features:
      new_feature = "no.of_"+str(feature)
      countFeature = dataset[feature].value_counts(dropna=False,normalize=False).to_dict()
      dataset[new_feature] = dataset[feature].map(countFeature)
      dataset[new_feature] = dataset[new_feature].astype('float32')
    return dataset


def HandlingCategorical(dataset):
    for col in dataset.columns:
      col_type = dataset[col].dtype
      # print("type",dataset[col].dtype)
      if col_type.name == 'category':
        print("Handling",col)
        encoder = LabelEncoder()
        dataset[col] = encoder.fit_transform(dataset[col].astype(str))
    
    return dataset


In [7]:
## preprocessing
dataset = preprocessing(dataset)

## seperation of income column
y = train['Total Yearly Income [EUR]']
dataset.drop(['Instance', 'Total Yearly Income [EUR]'], inplace=True, axis=1)

## adding count of each feature
dataset = addCountFeatures(dataset,dataset.columns)
print(dataset.columns)

## converting to categorical data types
for c in dataset.columns:
  col_type = dataset[c].dtype
  if col_type == 'object' or col_type.name == 'category':
      dataset[c] = dataset[c].astype('category')

## Categorical Encoding
dataset = HandlingCategorical(dataset)

## splitting the dataset into Train and test again
train = dataset.iloc[:len(train),:]
test = dataset.iloc[len(train):,:]

## splitting dataset in training and validation sets
x = train
# y = np.log(y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=619)
print("\n\n   Dataset to be trained \n",x.dtypes)

# Create the LightGBM data containers
train_data = lightgbm.Dataset(x_train, label=y_train)
test_data = lightgbm.Dataset(x_test, label=y_test)



Index(['Year of Record', 'Housing Situation',
       'Crime Level in the City of Employement',
       'Work Experience in Current Job [years]', 'Satisfation with employer',
       'Gender', 'Age', 'Country', 'Size of City', 'Profession',
       'University Degree', 'Wears Glasses', 'Hair Color', 'Body Height [cm]',
       'Yearly Income in addition to Salary (e.g. Rental Income)',
       'no.of_Year of Record', 'no.of_Housing Situation',
       'no.of_Crime Level in the City of Employement',
       'no.of_Work Experience in Current Job [years]',
       'no.of_Satisfation with employer', 'no.of_Gender', 'no.of_Age',
       'no.of_Country', 'no.of_Size of City', 'no.of_Profession',
       'no.of_University Degree', 'no.of_Wears Glasses', 'no.of_Hair Color',
       'no.of_Body Height [cm]',
       'no.of_Yearly Income in addition to Salary (e.g. Rental Income)'],
      dtype='object')
Handling Housing Situation
Handling Satisfation with employer
Handling Gender
Handling Country
Handling P

# Training

In [0]:

parameters = {
    'objective': 'tweedie',
    'max_depth':30,
    'learning_rate': 0.1,
    'metric': 'mae',
    'feature_fraction': 0.8,
    'boosting': 'gbdt',
    'bagging_freq': 20,
    'verbose': -1
}

model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=[train_data,test_data],
                       verbose_eval=100,
                       num_boost_round=10000,
                       early_stopping_rounds=500)


Training until validation scores don't improve for 500 rounds.
[100]	training's l1: 9986.23	valid_1's l1: 10047.6
[200]	training's l1: 9134.61	valid_1's l1: 9247.36
[300]	training's l1: 8866.93	valid_1's l1: 9022.21
[400]	training's l1: 8668.36	valid_1's l1: 8868.04
[500]	training's l1: 8541.72	valid_1's l1: 8788.07
[600]	training's l1: 8422.34	valid_1's l1: 8709.7
[700]	training's l1: 8312.91	valid_1's l1: 8648.97
[800]	training's l1: 8223.54	valid_1's l1: 8605.74
[900]	training's l1: 8136.57	valid_1's l1: 8563.31
[1000]	training's l1: 8056.92	valid_1's l1: 8526.17
[1100]	training's l1: 7975.44	valid_1's l1: 8492.8
[1200]	training's l1: 7905.52	valid_1's l1: 8469.42
[1300]	training's l1: 7837.83	valid_1's l1: 8449.59
[1400]	training's l1: 7775.79	valid_1's l1: 8431.12
[1500]	training's l1: 7715.57	valid_1's l1: 8415.24
[1600]	training's l1: 7655.6	valid_1's l1: 8399.15
[1700]	training's l1: 7605.53	valid_1's l1: 8390.2
[1800]	training's l1: 7552.29	valid_1's l1: 8379.46
[1900]	trainin

# Prediction

In [0]:

y = model.predict(test)
print(y)
sub = pd.DataFrame(y)

index = (np.arange(1,len(sub),1).astype(np.int32))
# print(testDataset_prediction.index)
sub.index += 1
print(sub.index)

## making csv
sub.to_csv("/content/drive/My Drive/Colab Notebooks/lgbmCountGBDtABnormal3.csv",header=['Total Yearly Income [EUR]'],index_label='Instance')
