# 1. PREPROCESSING

## 1.1 Imports libraries

In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer #process null values
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# system
import itertools
# matplotlib and seaborn for plotting
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import matplotlib.pyplot as plt
#import seaborn as sns

## 1.2 Load datasets

In [2]:
#The question data
question_data = pd.read_csv('data_all/question.csv')
print(question_data.shape)
question_data.head(5)


(129, 2)


Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order ...
1,Hobby,Do you code as a hobby?
2,OpenSource,Do you contribute to open source projects?
3,Country,In which country do you currently reside?
4,Student,"Are you currently enrolled in a formal, degree..."


In [3]:
#The training data
train_data = pd.read_csv('data_all/train.csv')
#train_data.head(5)

In [4]:
#The test data
test_data = pd.read_csv('data_all/test.csv')
#test_data['SalaryType'] = test_data['SalaryType'].fillna('Yearly')
#test_data.head(5)

In [5]:
#'Student','Employment', 'FormalEducation','UndergradMajor'
option_choices_cols_name= ['Hobby', 'OpenSource',
 'CompanySize', 'JobSatisfaction', 'CareerSatisfaction',
 'HopeFiveYears', 'JobSearchStatus', 'LastNewJob', 'UpdateCV', 'Currency', 'SalaryType',
 'TimeFullyProductive','AgreeDisagree1','AgreeDisagree2', 'AgreeDisagree3',
 'OperatingSystem','NumberMonitors', 'CheckInCode', 'AdBlocker','AdBlockerDisable','AdsAgreeDisagree1',
 'AdsAgreeDisagree2', 'AdsAgreeDisagree3', 'AIDangerous','AIInteresting',
 'AIResponsible', 'AIFuture','EthicsChoice', 'EthicsReport','EthicsResponsible','EthicalImplications',
 'StackOverflowRecommend','StackOverflowVisit','StackOverflowHasAccount', 'StackOverflowParticipate',
 'StackOverflowJobs','StackOverflowDevStory','StackOverflowJobsRecommend','StackOverflowConsiderMember', 'HypotheticalTools1',
 'HypotheticalTools2', 'HypotheticalTools3', 'HypotheticalTools4', 'HypotheticalTools5','WakeTime','HoursComputer',
 'HoursOutside', 'SkipMeals', 'Exercise', 'EducationParents','Age', 'Dependents',]

# Generate and mapping features by country

In [6]:
#Generate some features by country
Gen_map_country=train_data[train_data['Salary']>0]
Gen_map_country['avg_sal'] = Gen_map_country['Salary'].groupby(Gen_map_country['Country']).transform('mean')
Gen_map_country['min_sal'] = Gen_map_country['Salary'].groupby(Gen_map_country['Country']).transform('min')
Gen_map_country['max_sal'] = Gen_map_country['Salary'].groupby(Gen_map_country['Country']).transform('max')
Gen_map_country['count_sal'] = Gen_map_country['No'].groupby(Gen_map_country['Country']).transform('count')
Gen_map_country['mode_sal'] = Gen_map_country['Salary'].groupby(Gen_map_country['Country']).transform(lambda x:x.value_counts().index[0])
Gen_map_country['std_sal']=Gen_map_country.Country.map(Gen_map_country.groupby('Country')['Salary'].std())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [7]:
#Create map for country
Map_country=Gen_map_country[['Country','avg_sal','min_sal','max_sal','count_sal','mode_sal','std_sal']]
Salmap_country=Map_country.drop_duplicates()
#Salmap

In [8]:
#Mapping new gen features to train, test base on Country
train_data=pd.merge(train_data,Salmap_country,on='Country',how='left')
test_data=pd.merge(test_data,Salmap_country,on='Country',how='left')

# Generate and mapping features by YearsCoding

In [9]:
#Generate some features by YearsCoding
Gen_map_YearsCoding=train_data[train_data['Salary']>0]
Gen_map_YearsCoding['YearsCoding_avg_sal'] = Gen_map_YearsCoding['Salary'].groupby(Gen_map_YearsCoding['YearsCoding']).transform('mean')
Gen_map_YearsCoding['YearsCoding_min_sal'] = Gen_map_YearsCoding['Salary'].groupby(Gen_map_YearsCoding['YearsCoding']).transform('min')
Gen_map_YearsCoding['YearsCoding_max_sal'] = Gen_map_YearsCoding['Salary'].groupby(Gen_map_YearsCoding['YearsCoding']).transform('max')
Gen_map_YearsCoding['YearsCoding_count_sal'] = Gen_map_YearsCoding['No'].groupby(Gen_map_YearsCoding['YearsCoding']).transform('count')
Gen_map_YearsCoding['YearsCoding_std_sal']=Gen_map_YearsCoding.YearsCoding.map(Gen_map_YearsCoding.groupby('YearsCoding')['Salary'].std())
#Gen_map_YearsCoding['mode_sal'] = Gen_map_YearsCoding['Salary'].groupby(Gen_map_YearsCoding['YearsCoding']).transform(lambda x:x.value_counts().index[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [10]:
#Create map for year coding
Map_Yearscoding=Gen_map_YearsCoding[['YearsCoding','YearsCoding_avg_sal','YearsCoding_min_sal',
                                     'YearsCoding_max_sal','YearsCoding_count_sal','YearsCoding_std_sal']]
Salmap_Yearscoding=Map_Yearscoding.drop_duplicates()
#Salmap

In [11]:
#Mapping new gen features to train, test base on Yearscoding
train_data=pd.merge(train_data,Salmap_Yearscoding,on='YearsCoding',how='left')
test_data=pd.merge(test_data,Salmap_Yearscoding,on='YearsCoding',how='left')

# Generate some features by RaceEthnicity

In [12]:
#Generate some features by RaceEthnicity
Gen_map_RaceEthnicity=train_data[train_data['Salary']>0]
Gen_map_RaceEthnicity['RaceEthnicity_avg_sal'] = Gen_map_RaceEthnicity['Salary'].groupby(Gen_map_RaceEthnicity['RaceEthnicity']).transform('mean')
Gen_map_RaceEthnicity['RaceEthnicity_min_sal'] = Gen_map_RaceEthnicity['Salary'].groupby(Gen_map_RaceEthnicity['RaceEthnicity']).transform('min')
Gen_map_RaceEthnicity['RaceEthnicity_max_sal'] = Gen_map_RaceEthnicity['Salary'].groupby(Gen_map_RaceEthnicity['RaceEthnicity']).transform('max')
Gen_map_RaceEthnicity['RaceEthnicity_count_sal'] = Gen_map_RaceEthnicity['No'].groupby(Gen_map_RaceEthnicity['RaceEthnicity']).transform('count')
Gen_map_RaceEthnicity['RaceEthnicity_std_sal']=Gen_map_RaceEthnicity.RaceEthnicity.map(Gen_map_RaceEthnicity.groupby('RaceEthnicity')['Salary'].std())
#Gen_map_YearsCoding['mode_sal'] = Gen_map_YearsCoding['Salary'].groupby(Gen_map_YearsCoding['YearsCoding']).transform(lambda x:x.value_counts().index[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [13]:
#Create map for RaceEthnicity
Map_RaceEthnicity=Gen_map_RaceEthnicity[['RaceEthnicity','RaceEthnicity_avg_sal','RaceEthnicity_min_sal',
                                         'RaceEthnicity_max_sal','RaceEthnicity_count_sal','RaceEthnicity_std_sal']]
Salmap_RaceEthnicity=Map_RaceEthnicity.drop_duplicates()
#Salmap

In [14]:
#Mapping new gen features to train, test base on RaceEthnicity
train_data=pd.merge(train_data,Salmap_RaceEthnicity,on='RaceEthnicity',how='left')
test_data=pd.merge(test_data,Salmap_RaceEthnicity,on='RaceEthnicity',how='left')

# Generate some features by YearsCodingProf

In [15]:
#Generate some features by YearsCodingProf
Gen_map_YearsCodingProf=train_data[train_data['Salary']>0]
Gen_map_YearsCodingProf['YearsCodingProf_avg_sal'] = Gen_map_YearsCodingProf['Salary'].groupby(Gen_map_YearsCodingProf['YearsCodingProf']).transform('mean')
Gen_map_YearsCodingProf['YearsCodingProf_min_sal'] = Gen_map_YearsCodingProf['Salary'].groupby(Gen_map_YearsCodingProf['YearsCodingProf']).transform('min')
Gen_map_YearsCodingProf['YearsCodingProf_max_sal'] = Gen_map_YearsCodingProf['Salary'].groupby(Gen_map_YearsCodingProf['YearsCodingProf']).transform('max')
Gen_map_YearsCodingProf['YearsCodingProf_std_sal']=Gen_map_YearsCodingProf.YearsCodingProf.map(Gen_map_YearsCodingProf.groupby('YearsCodingProf')['Salary'].std())
#Gen_map_YearsCodingProf['YearsCoding_count_sal'] = Gen_map_YearsCodingProf['No'].groupby(Gen_map_YearsCodingProf['YearsCodingProf']).transform('count')
#Gen_map_YearsCoding['mode_sal'] = Gen_map_YearsCoding['Salary'].groupby(Gen_map_YearsCoding['YearsCoding']).transform(lambda x:x.value_counts().index[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [16]:
#Create map for YearsCodingProf
Map_YearsCodingProf=Gen_map_YearsCodingProf[['YearsCodingProf','YearsCodingProf_avg_sal','YearsCodingProf_min_sal',
                                     'YearsCodingProf_max_sal','YearsCodingProf_std_sal']]
Salmap_YearsCodingProf=Map_YearsCodingProf.drop_duplicates()
#Salmap
#Mapping new gen features to train, test base on YearsCodingProf
train_data=pd.merge(train_data,Salmap_YearsCodingProf,on='YearsCodingProf',how='left')
test_data=pd.merge(test_data,Salmap_YearsCodingProf,on='YearsCodingProf',how='left')

# Generate some features by CurrencySymbol

In [17]:
#Generate some features by CurrencySymbol
Gen_map_CurrencySymbol=train_data[train_data['Salary']>0]
Gen_map_CurrencySymbol['CurrencySymbol_avg_sal'] = Gen_map_CurrencySymbol['Salary'].groupby(Gen_map_CurrencySymbol['CurrencySymbol']).transform('mean')
Gen_map_CurrencySymbol['CurrencySymbol_min_sal'] = Gen_map_CurrencySymbol['Salary'].groupby(Gen_map_CurrencySymbol['CurrencySymbol']).transform('min')
Gen_map_CurrencySymbol['CurrencySymbol_max_sal'] = Gen_map_CurrencySymbol['Salary'].groupby(Gen_map_CurrencySymbol['CurrencySymbol']).transform('max')
Gen_map_CurrencySymbol['CurrencySymbol_std_sal']=Gen_map_CurrencySymbol.CurrencySymbol.map(Gen_map_CurrencySymbol.groupby('CurrencySymbol')['Salary'].std())
#Gen_map_YearsCodingProf['YearsCoding_count_sal'] = Gen_map_YearsCodingProf['No'].groupby(Gen_map_YearsCodingProf['YearsCodingProf']).transform('count')
#Gen_map_YearsCoding['mode_sal'] = Gen_map_YearsCoding['Salary'].groupby(Gen_map_YearsCoding['YearsCoding']).transform(lambda x:x.value_counts().index[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [18]:
#Create map for YearsCodingProf
Map_CurrencySymbol=Gen_map_CurrencySymbol[['CurrencySymbol','CurrencySymbol_avg_sal','CurrencySymbol_min_sal',
                                     'CurrencySymbol_max_sal','CurrencySymbol_std_sal']]
Salmap_CurrencySymbol=Map_CurrencySymbol.drop_duplicates()
#Salmap
#Mapping new gen features to train, test base on YearsCodingProf
train_data=pd.merge(train_data,Salmap_CurrencySymbol,on='CurrencySymbol',how='left')
test_data=pd.merge(test_data,Salmap_CurrencySymbol,on='CurrencySymbol',how='left')

# Generate some features for all choice columns

In [19]:
for col in option_choices_cols_name:
    print('Start column: '+col)
    name='Gen_map_'+col
    avgname=col+'_avg_sal'
    minname=col+'_min_sal'
    maxname=col+'_max_sal'
    Mapname='Map_'+col
    Salmapname='Salmap_'+col
    name=train_data[train_data['Salary']>0]
    name[avgname] = name['Salary'].groupby(name[col]).transform('mean')
    name[minname] = name['Salary'].groupby(name[col]).transform('min')
    name[maxname] = name['Salary'].groupby(name[col]).transform('max')
    #Create map for YearsCodingProf
    Mapname=name[[col,avgname,minname,maxname]]
    Salmapname=Mapname.drop_duplicates()
    #Salmap
    #Mapping new gen features to train, test base on YearsCodingProf
    train_data=pd.merge(train_data,Salmapname,on=col,how='left')
    test_data=pd.merge(test_data,Salmapname,on=col,how='left')

Start column: Hobby


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Start column: OpenSource
Start column: CompanySize
Start column: JobSatisfaction
Start column: CareerSatisfaction
Start column: HopeFiveYears
Start column: JobSearchStatus
Start column: LastNewJob
Start column: UpdateCV
Start column: Currency
Start column: SalaryType
Start column: TimeFullyProductive
Start column: AgreeDisagree1
Start column: AgreeDisagree2
Start column: AgreeDisagree3
Start column: OperatingSystem
Start column: NumberMonitors
Start column: CheckInCode
Start column: AdBlocker
Start column: AdBlockerDisable
Start column: AdsAgreeDisagree1
Start column: AdsAgreeDisagree2
Start column: AdsAgreeDisagree3
Start column: AIDangerous
Start column: AIInteresting
Start column: AIResponsible
Start column: AIFuture
Start column: EthicsChoice
Start column: EthicsReport
Start column: EthicsResponsible
Start column: EthicalImplications
Start column: StackOverflowRecommend
Start column: StackOverflowVisit
Start column: StackOverflowHasAccount
Start column: StackOverflowParticipate
Sta

In [20]:
print(test_data.shape)
print(train_data.shape)

(11259, 307)
(33857, 308)


### Target data

In [22]:
target = train_data['Salary'].astype(int)

### Missing values on training data

In [23]:
# Function to calculate missing values by column
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [25]:
missing_values = missing_values_table(train_data)
missing_values.head()

Your selected dataframe has 308 columns.
There are 293 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
TimeAfterBootcamp,30638,90.5
MilitaryUS,25092,74.1
HackathonReasons,20939,61.8
ErgonomicDevices,18628,55.0
AdBlockerReasons,16431,48.5


In [26]:
missing_row_values = train_data.isnull().sum(axis=1)
missing_row_percentage = 100*missing_row_values/(train_data.shape[1]-1)

In [27]:
missing_row_df = pd.concat([missing_row_values,missing_row_percentage],axis=1)

In [28]:
missing_row_df_ren = missing_row_df.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})

In [29]:
missing_row_df_ren[missing_row_df_ren.iloc[:,1]!=0].sort_values(
        '% of Total Values', ascending=False).round(1).head(10)

Unnamed: 0,Missing Values,% of Total Values
15718,241,78.5
29860,239,77.9
23584,238,77.5
21537,236,76.9
15283,236,76.9
13451,235,76.5
30123,235,76.5
2173,234,76.2
23209,234,76.2
22301,234,76.2


## View the data by datatypes

Convert some category columns to numeric

In [30]:
train_data["YearsCoding_convert"] = train_data["YearsCoding"].map( {'0-2 years': 1, '3-5 years': 4,'6-8 years': 7, 
                                                            '9-11 years': 10, '12-14 years': 13, '15-17 years': 16,
                                                           '18-20 years': 19, '21-23 years': 22, '24-26 years': 25,
                                                           '27-29 years': 28, '30 or more years': 30})
test_data["YearsCoding_convert"] = test_data["YearsCoding"].map( {'0-2 years': 1, '3-5 years': 4,'6-8 years': 7, 
                                                            '9-11 years': 10, '12-14 years': 13, '15-17 years': 16,
                                                           '18-20 years': 19, '21-23 years': 22, '24-26 years': 25,
                                                           '27-29 years': 28, '30 or more years': 30})

In [31]:
train_data["YearsCodingProf_convert"] = train_data["YearsCodingProf"].map( {'0-2 years': 1, '3-5 years': 4,'6-8 years': 7, 
                                                            '9-11 years': 10, '12-14 years': 13, '15-17 years': 16,
                                                           '18-20 years': 19, '21-23 years': 22, '24-26 years': 25,
                                                           '27-29 years': 28, '30 or more years': 30})
test_data["YearsCodingProf_convert"] = test_data["YearsCodingProf"].map( {'0-2 years': 1, '3-5 years': 4,'6-8 years': 7, 
                                                            '9-11 years': 10, '12-14 years': 13, '15-17 years': 16,
                                                           '18-20 years': 19, '21-23 years': 22, '24-26 years': 25,
                                                           '27-29 years': 28, '30 or more years': 30})

In [32]:
train_data["StackOverflowRecommend_convert"] = train_data["StackOverflowRecommend"].map( {'10 (Very Likely)': 10,
                                                                                  '0 (Not Likely)': 0})
test_data["StackOverflowRecommend_convert"] = test_data["StackOverflowRecommend"].map( {'10 (Very Likely)': 10,
                                                                                  '0 (Not Likely)': 0})

In [33]:
train_data["StackOverflowJobsRecommend_convert"] = train_data["StackOverflowJobsRecommend"].map( {'10 (Very Likely)': 10,
                                                                                  '0 (Not Likely)': 0})
test_data["StackOverflowJobsRecommend_convert"] = test_data["StackOverflowJobsRecommend"].map( {'10 (Very Likely)': 10,
                                                                                  '0 (Not Likely)': 0})

In [34]:
train_data["HoursComputer_convert"] = train_data["HoursComputer"].map( {'1 - 4 hours': 3,'5 - 8 hours': 7,
                                                               '9 - 12 hours': 11,'Over 12 hours': 14})
test_data["HoursComputer_convert"] = test_data["HoursComputer"].map( {'1 - 4 hours': 3,'5 - 8 hours': 7,
                                                               '9 - 12 hours': 11,'Over 12 hours': 14})

In [35]:
train_data["HoursOutside_convert"] = train_data["HoursOutside"].map( {'Less than 30 minutes': 0.3,'30 - 59 minutes': 0.8,
                                                               '1 - 2 hours': 1.5,'3 - 4 hours': 3.5, 'Over 4 hours': 5})
test_data["HoursOutside_convert"] = test_data["HoursOutside"].map( {'Less than 30 minutes': 0.3,'30 - 59 minutes': 0.8,
                                                               '1 - 2 hours': 1.5,'3 - 4 hours': 3.5, 'Over 4 hours': 5})

In [36]:
train_data["Age_convert"] = train_data["Age"].map( {'18 - 24 years old': 21,'25 - 34 years old': 30,
                                                               '35 - 44 years old': 40,'45 - 54 years old': 50})
test_data["Age_convert"] = test_data["Age"].map( {'18 - 24 years old': 21,'25 - 34 years old': 30,
                                                               '35 - 44 years old': 40,'45 - 54 years old': 50})

In [37]:
train_data["TimeFullyProductive_convert"] = train_data["TimeFullyProductive"].map( {'Less than a month': 1,'One to three months': 3,
                                                               'Six to nine months': 9,'Nine months to a year': 12,
                                                                           'More than a year': 15, 'Three to six months':6})
test_data["TimeFullyProductive_convert"] = test_data["TimeFullyProductive"].map( {'Less than a month': 1,'One to three months': 3,
                                                               'Six to nine months': 9,'Nine months to a year': 12,
                                                                           'More than a year': 15, 'Three to six months':6})

In [38]:
train_data["Hobby_convert"] = train_data["Hobby"].map( {'Yes': 1.0,'No': 0.0})
test_data["Hobby_convert"] = test_data["Hobby"].map( {'Yes': 1.0,'No': 0.0})

In [39]:
train_data["OpenSource_convert"] = train_data["OpenSource"].map( {'Yes': 1.0,'No': 0.0})
test_data["OpenSource_convert"] = test_data["OpenSource"].map( {'Yes': 1.0,'No': 0.0})

In [40]:
train_data["Student_convert"] = train_data["Student"].map( {'No': 2.0,'Yes, part-time': 1.0,'Yes, full-time': 0.0})
test_data["Student_convert"] = test_data["Student"].map( {'No': 2.0,'Yes, part-time': 1.0,'Yes, full-time': 0.0})

In [41]:
train_data["FormalEducation_convert"] = train_data["FormalEducation"].map( {'I never completed any formal education': 0.0,
            'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 1.0,
            'Primary/elementary school': 2.0,
            'Associate degree': 3.0,
            'Some college/university study without earning a degree': 4.0,
            'Bachelor’s degree (BA, BS, B.Eng., etc.)': 5.0,
            'Master’s degree (MA, MS, M.Eng., MBA, etc.)': 6.0,
            'Other doctoral degree (Ph.D, Ed.D., etc.)': 7.0,
            'Professional degree (JD, MD, etc.)': 8.0})
test_data["FormalEducation_convert"] = test_data["FormalEducation"].map( {'I never completed any formal education': 0.0,
            'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 1.0,
            'Primary/elementary school': 2.0,
            'Associate degree': 3.0,
            'Some college/university study without earning a degree': 4.0,
            'Bachelor’s degree (BA, BS, B.Eng., etc.)': 5.0,
            'Master’s degree (MA, MS, M.Eng., MBA, etc.)': 6.0,
            'Other doctoral degree (Ph.D, Ed.D., etc.)': 7.0,
            'Professional degree (JD, MD, etc.)': 8.0})

In [42]:
train_data["UndergradMajor_convert"] = train_data["UndergradMajor"].map({'Computer science, computer engineering, or software engineering': 2.0,
            'A natural science (ex. biology, chemistry, physics)': 1.0,
            'Web development or web design': 2.0,
            'Information systems, information technology, or system administration': 2.0,
            'A social science (ex. anthropology, psychology, political science)': 0.0,
            'A business discipline (ex. accounting, finance, marketing))': 0.0,
            'Another engineering discipline (ex. civil, electrical, mechanical)': 1.0,
            'A humanities discipline (ex. literature, history, philosophy)': 0.0,
            'Mathematics or statistics)': 2.0,
            'Fine arts or performing arts (ex. graphic design, music, studio art': 1.0,
            'I never declared a major': 0.0,
            'A health science (ex. nursing, pharmacy, radiology)': 0.0})
test_data["UndergradMajor_convert"] = test_data["UndergradMajor"].map({'Computer science, computer engineering, or software engineering': 2.0,
            'A natural science (ex. biology, chemistry, physics)': 1.0,
            'Web development or web design': 2.0,
            'Information systems, information technology, or system administration': 2.0,
            'A social science (ex. anthropology, psychology, political science)': 0.0,
            'A business discipline (ex. accounting, finance, marketing))': 0.0,
            'Another engineering discipline (ex. civil, electrical, mechanical)': 1.0,
            'A humanities discipline (ex. literature, history, philosophy)': 0.0,
            'Mathematics or statistics)': 2.0,
            'Fine arts or performing arts (ex. graphic design, music, studio art': 1.0,
            'I never declared a major': 0.0,
            'A health science (ex. nursing, pharmacy, radiology)': 0.0})

In [43]:
# Number of each type of column
train_data.dtypes.value_counts()

float64    234
object      86
int64        1
dtype: int64

In [49]:
# drop int column type
train_data = train_data.drop('No',axis=1)
test_data = test_data.drop('No',axis=1)

# Data preprocessing

## Clean data

Drop the columns and the rows have more 80% missing values

In [50]:
missing_cols_drop = missing_values[missing_values.iloc[:,1]>80].index

In [51]:
train_data = train_data.drop(missing_cols_drop,axis =1)
test_data = test_data.drop(missing_cols_drop,axis =1)

In [52]:
missing_rows_drop = train_data.isnull().sum(axis=1)

In [53]:
train_data = train_data[missing_rows_drop<(train_data.shape[1]/2)]

In [54]:
print('Training data shape:',train_data.shape)
print('Testing data shape:',test_data.shape)

Training data shape: (33076, 319)
Testing data shape: (11259, 318)


In [55]:
target = train_data['Salary']
train_data = train_data.drop('Salary',axis=1)

## Transform data by datatype

### Float columns

In [56]:
# get columns name
float_cols_name_train = train_data.select_dtypes('float').columns
float_cols_name_test = test_data.select_dtypes('float').columns

In [57]:
#fill na by median
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
train_arr = imputer.fit_transform(train_data.select_dtypes('float'))
test_arr = imputer.transform(test_data.select_dtypes('float'))

In [58]:
float_dataset_train = pd.DataFrame(train_arr,columns=float_cols_name_train)
float_dataset_test = pd.DataFrame(test_arr,columns=float_cols_name_test)

In [59]:
print('Training float dataset shape: ', float_dataset_train.shape)
print('Testing dloat dataset shape: ', float_dataset_test.shape)

Training float dataset shape:  (33076, 233)
Testing dloat dataset shape:  (11259, 233)


In [60]:
float_dataset_train.head()

Unnamed: 0,AssessJob1,AssessJob2,AssessJob3,AssessJob4,AssessJob5,AssessJob6,AssessJob7,AssessJob8,AssessJob9,AssessJob10,...,StackOverflowJobsRecommend_convert,HoursComputer_convert,HoursOutside_convert,Age_convert,TimeFullyProductive_convert,Hobby_convert,OpenSource_convert,Student_convert,FormalEducation_convert,UndergradMajor_convert
0,9.0,7.0,8.0,3.0,6.0,10.0,2.0,1.0,5.0,4.0,...,10.0,7.0,5.0,40.0,3.0,1.0,0.0,2.0,5.0,2.0
1,9.0,2.0,1.0,6.0,4.0,8.0,3.0,5.0,10.0,7.0,...,10.0,14.0,0.8,40.0,15.0,1.0,1.0,2.0,6.0,1.0
2,9.0,4.0,6.0,1.0,3.0,5.0,2.0,8.0,10.0,7.0,...,10.0,7.0,1.5,40.0,3.0,1.0,1.0,2.0,4.0,2.0
3,7.0,7.0,6.0,4.0,3.0,4.0,6.0,4.0,8.0,6.0,...,10.0,7.0,0.8,30.0,3.0,1.0,0.0,2.0,5.0,2.0
4,3.0,8.0,10.0,2.0,7.0,9.0,1.0,5.0,6.0,4.0,...,10.0,14.0,0.3,30.0,3.0,0.0,1.0,2.0,2.0,2.0


In [61]:
#Check corr
#Checkcorr= float_dataset_train
#Checkcorr['label']=target

In [62]:
# The undersample dataframe
#plt.figure(figsize = (20, 20))
#corr = float_dataset_train.corr()
#sns.heatmap(corr, cmap="coolwarm_r", annot=True)

In [63]:
#corr['label']

### Categorical multiple choice columns

In [66]:
#function for check which column contains multiple-choice
def check_multiple_choices(text):
    if type(text)==str:
        if ';' in text:
            return 1
        else:
            return 0
    return 0

In [67]:
#Categorical columns is columns, which are not in float cols
categorical_cols_name=[col for col in train_data.columns if col not in float_cols_name_train]
#Multiple choices is categorical with character: ";"
mul_choices_cols_name=[]
for i in categorical_cols_name:
    if sum(train_data[i].apply(check_multiple_choices))>0:
        mul_choices_cols_name.append(i)
option_choices_cols_name=[col for col in categorical_cols_name if col not in mul_choices_cols_name]

In [69]:
#list of columns have multiple-choice on train_dataset
#mul_choices_cols_name

In [70]:
#function for count the number of choices on each column
def count_multiple_choice(text):
    if type(text)==str:
        return text.split(";")
    else:
        return []

In [71]:
num_multiple_choice={}
for i in mul_choices_cols_name:
    flat_ls = list(train_data[i].apply(count_multiple_choice))
    num_multiple_choice[i]=len(set(list(itertools.chain(*flat_ls))))

In [72]:
#display the number of choice on columns
#frame_num_choice = pd.DataFrame(num_multiple_choice.items(),columns=['Feature names','Number of choice']).sort_values('Number of choice',ascending=False)
#frame_num_choice

#### Encode multiple-choices to one-hot

In [73]:
#function for encoding multiple-choice to one hot
def encode(ls,ref):
    ini_vec = np.zeros((len(ref)))
    ini_ls=[]
    for i in ls:
        ini_ls.append(ref.index(i))
    ini_vec[ini_ls]=1
    return ini_vec.astype('int')

In [74]:
#apply the function to handle multiple-choices columns
ls_v4_train=[]
ls_v4_test=[]
ls_v2=[]
for c in mul_choices_cols_name:
    
    v1_train = train_data[c].apply(count_multiple_choice) #series
    v1_test = test_data[c].apply(count_multiple_choice) 
    
    v2 = list(set(list(itertools.chain(*v1_train)))) #list of unique of multiple-choices
    v3_train = v1_train.apply(encode,ref=v2) #numpy array
    v3_test = v1_test.apply(encode,ref=v2)
    
    v4_train = np.concatenate(v3_train.values).reshape(-1,len(v2))
    v4_test = np.concatenate(v3_test.values).reshape(-1,len(v2))
    ls_v4_train.append(v4_train)
    ls_v4_test.append(v4_test)
    ls_v2.append(v2)
v5_train = np.concatenate(ls_v4_train,axis=1)
v5_test = np.concatenate(ls_v4_test,axis=1)
v6 = list(itertools.chain(*ls_v2))
multi_choice_dataset_train = pd.DataFrame(v5_train,columns = v6)
multi_choice_dataset_test = pd.DataFrame(v5_test,columns = v6)

In [75]:
multi_choice_dataset_train.head()

Unnamed: 0,Embedded applications or devices developer,QA or test developer,DevOps specialist,Data scientist or machine learning specialist,Full-stack developer,Product manager,Front-end developer,"C-suite executive (CEO, CTO, etc.)",Game or graphics developer,Educator or academic researcher,...,Bisexual or Queer,Gay or Lesbian,Straight or heterosexual,Hispanic or Latino/Latina,White or of European descent,Black or of African descent,Middle Eastern,South Asian,East Asian,"Native American, Pacific Islander, or Indigenous Australian"
0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0


In [76]:
multi_choice_dataset_test.head()

Unnamed: 0,Embedded applications or devices developer,QA or test developer,DevOps specialist,Data scientist or machine learning specialist,Full-stack developer,Product manager,Front-end developer,"C-suite executive (CEO, CTO, etc.)",Game or graphics developer,Educator or academic researcher,...,Bisexual or Queer,Gay or Lesbian,Straight or heterosexual,Hispanic or Latino/Latina,White or of European descent,Black or of African descent,Middle Eastern,South Asian,East Asian,"Native American, Pacific Islander, or Indigenous Australian"
0,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,0,0,0,0,1,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [77]:
cols=pd.Series(multi_choice_dataset_train.columns)
for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
multi_choice_dataset_train.columns=cols
multi_choice_dataset_test.columns=cols

In [78]:
test_data['DevType'].head(5)

0    Back-end developer;Front-end developer;Full-st...
1    Back-end developer;Database administrator;DevO...
2             Front-end developer;Full-stack developer
3                                   Back-end developer
4    Front-end developer;Full-stack developer;Mobil...
Name: DevType, dtype: object

In [79]:
multi_choice_dataset_train.iloc[:5,:20]

Unnamed: 0,Embedded applications or devices developer,QA or test developer,DevOps specialist,Data scientist or machine learning specialist,Full-stack developer,Product manager,Front-end developer,"C-suite executive (CEO, CTO, etc.)",Game or graphics developer,Educator or academic researcher,Desktop or enterprise applications developer,System administrator,Student,Back-end developer,Marketing or sales professional,Database administrator,Designer,Data or business analyst,Mobile developer,Engineering manager
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,1,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0


In [80]:
multi_choice_dataset_train_generate= multi_choice_dataset_train[cols]
multi_choice_dataset_train_generate['Salary']=target
multi_choice_dataset_train_generate.head()

Unnamed: 0,Embedded applications or devices developer,QA or test developer,DevOps specialist,Data scientist or machine learning specialist,Full-stack developer,Product manager,Front-end developer,"C-suite executive (CEO, CTO, etc.)",Game or graphics developer,Educator or academic researcher,...,Gay or Lesbian,Straight or heterosexual,Hispanic or Latino/Latina,White or of European descent,Black or of African descent,Middle Eastern,South Asian,East Asian,"Native American, Pacific Islander, or Indigenous Australian",Salary
0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,10800.0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,73433.0
2,0,0,0,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,66672.0
3,1,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,31848.0
4,0,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0.0


In [81]:
for col in multi_choice_dataset_train.columns:
    print('Start column: '+col)
    name='Gen_map_'+col
    avgname=col+'_avg_sal'
    minname=col+'_min_sal'
    maxname=col+'_max_sal'
    Mapname='Map_'+col
    Salmapname='Salmap_'+col
    name=multi_choice_dataset_train_generate[multi_choice_dataset_train_generate['Salary']>0]
    name[avgname] = name['Salary'].groupby(name[col]).transform('mean')
    name[minname] = name['Salary'].groupby(name[col]).transform('min')
    name[maxname] = name['Salary'].groupby(name[col]).transform('max')
    #Create map for YearsCodingProf
    Mapname=name[[col,avgname,minname,maxname]]
    Salmapname=Mapname.drop_duplicates()
    #print(Salmapname)
    #Mapping new gen features to train, test base on YearsCodingProf
    multi_choice_dataset_train=pd.merge(multi_choice_dataset_train,Salmapname,on=col,how='left')
    multi_choice_dataset_test=pd.merge(multi_choice_dataset_test,Salmapname,on=col,how='left')

Start column: Embedded applications or devices developer


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Start column: QA or test developer
Start column: DevOps specialist
Start column: Data scientist or machine learning specialist
Start column: Full-stack developer
Start column: Product manager
Start column: Front-end developer
Start column: C-suite executive (CEO, CTO, etc.)
Start column: Game or graphics developer
Start column: Educator or academic researcher
Start column: Desktop or enterprise applications developer
Start column: System administrator
Start column: Student
Start column: Back-end developer
Start column: Marketing or sales professional
Start column: Database administrator
Start column: Designer
Start column: Data or business analyst
Start column: Mobile developer
Start column: Engineering manager
Start column: Facebook
Start column: Stack Overflow Enterprise
Start column: Office / productivity suite (Microsoft Office, Google Suite, etc.)
Start column: Google Hangouts/Chat
Start column: Other chat system (IRC, proprietary software, etc.)
Start column: Slack
Start column: 

Start column: NetBeans
Start column: TextMate
Start column: RubyMine
Start column: Coda
Start column: Vim
Start column: IntelliJ
Start column: Agile
Start column: Pair programming
Start column: Scrum
Start column: Evidence-based software engineering
Start column: Formal standard such as ISO 9001 or IEEE 12207 (aka “waterfall” methodologies)
Start column: Mob programming
Start column: Kanban
Start column: Extreme programming (XP)
Start column: Lean
Start column: PRINCE2
Start column: Copying and pasting files to network shares
Start column: Mercurial
Start column: Team Foundation Version Control
Start column: Git
Start column: Subversion
Start column: I don't use version control
Start column: Zip file back-ups
Start column: The website I was visiting has interesting ads
Start column: The website I was visiting forced me to disable it to access their content
Start column: The website I was visiting asked me to disable it
Start column: The ad-blocking software was causing display issues o

In [82]:
#multi_choice_dataset_train= multi_choice_dataset_train.drop('Salary',axis=1)
print('Training multiple-choices dataset shape: ', multi_choice_dataset_train.shape)
print('Testing multiple-choices dataset shape: ', multi_choice_dataset_test.shape)

Training multiple-choices dataset shape:  (33076, 1280)
Testing multiple-choices dataset shape:  (11259, 1280)


### Categorical "option-choices" columns type

In [83]:
option_choices_cols_name=[col for col in categorical_cols_name if col not in mul_choices_cols_name]

In [84]:
#impute missing value by majority class
imputer = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
train_data_arr = imputer.fit_transform(train_data[option_choices_cols_name])
test_data_arr = imputer.transform(test_data[option_choices_cols_name])

In [85]:
option_choices_dataset_train = pd.DataFrame(train_data_arr,columns=option_choices_cols_name)
option_choices_dataset_test = pd.DataFrame(test_data_arr,columns=option_choices_cols_name)

#### Encode option-choices to one-hot columns

In [86]:
# Create a label encoder for option columns
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in option_choices_cols_name:
    # If 2 or fewer unique categories
    if len(list(option_choices_dataset_train[col].unique())) <= 2:
        # Train on the training data
        le.fit(option_choices_dataset_train[col])
        # Transform both training and testing data
        option_choices_dataset_train[col] = le.transform(option_choices_dataset_train[col])
        option_choices_dataset_test[col] = le.transform(option_choices_dataset_test[col])    
        # Keep track of how many columns were label encoded
        le_count += 1
            
print('%d columns were label encoded.' % le_count)

4 columns were label encoded.


In [87]:
#one-hot encode for train_data
option_choices_dataset_train = pd.get_dummies(option_choices_dataset_train)
option_choices_dataset_train.shape

(33076, 587)

In [88]:
#one-hot encode for test_data
option_choices_dataset_test = pd.get_dummies(option_choices_dataset_test)
option_choices_dataset_test.shape

(11259, 547)

In [89]:
#Align the training and testing data, keep only columns present in both dataframes
option_choices_dataset_train, option_choices_dataset_test = option_choices_dataset_train.align(option_choices_dataset_test, join = 'inner', axis = 1)
print('Training option-choices dataset shape: ', option_choices_dataset_train.shape)
print('Testing option-choices dataset shape: ', option_choices_dataset_test.shape)

Training option-choices dataset shape:  (33076, 537)
Testing option-choices dataset shape:  (11259, 537)


### Combine all dataframes

In [91]:
#float_dataset_train,
train_dataset = pd.concat([float_dataset_train,multi_choice_dataset_train,option_choices_dataset_train],axis=1)

In [92]:
#float_dataset_test,
test_dataset = pd.concat([float_dataset_test,multi_choice_dataset_test,option_choices_dataset_test],axis=1)

In [93]:
for col in train_dataset.columns:
    print (col)
    namecol='squared_of_'+col
    train_dataset[namecol]=train_dataset[col]**2

AssessJob1
AssessJob2
AssessJob3
AssessJob4
AssessJob5
AssessJob6
AssessJob7
AssessJob8
AssessJob9
AssessJob10
AssessBenefits1
AssessBenefits2
AssessBenefits3
AssessBenefits4
AssessBenefits5
AssessBenefits6
AssessBenefits7
AssessBenefits8
AssessBenefits9
AssessBenefits10
AssessBenefits11
JobContactPriorities1
JobContactPriorities2
JobContactPriorities3
JobContactPriorities4
JobContactPriorities5
JobEmailPriorities1
JobEmailPriorities2
JobEmailPriorities3
JobEmailPriorities4
JobEmailPriorities5
JobEmailPriorities6
JobEmailPriorities7
AdsPriorities1
AdsPriorities2
AdsPriorities3
AdsPriorities4
AdsPriorities5
AdsPriorities6
AdsPriorities7
avg_sal
min_sal
max_sal
count_sal
mode_sal
std_sal
YearsCoding_avg_sal
YearsCoding_min_sal
YearsCoding_max_sal
YearsCoding_count_sal
YearsCoding_std_sal
RaceEthnicity_avg_sal
RaceEthnicity_min_sal
RaceEthnicity_max_sal
RaceEthnicity_count_sal
RaceEthnicity_std_sal
YearsCodingProf_avg_sal
YearsCodingProf_min_sal
YearsCodingProf_max_sal
YearsCodingProf_std

SQLite_1
Apache Hive_1
Memcached_1
Drupal
Amazon Echo
Linux
Arduino
Heroku
Android
AWS
Firebase
Google Home
Predix
Azure
iOS
Google Cloud Platform/App Engine
Raspberry Pi
Windows Phone
Serverless
SharePoint
Salesforce
Windows Desktop or Server
ESP 8266
WordPress
Gaming console
Mac OS
Mainframe
IBM Cloud or Watson
Apple Watch or Apple TV
ESP8266
Amazon Echo_1
Drupal_1
Linux_1
Arduino_1
Heroku_1
Android_1
AWS_1
Firebase_1
Google Home_1
Predix_1
Azure_1
iOS_1
Google Cloud Platform/App Engine_1
Raspberry Pi_1
Windows Phone_1
Serverless_1
SharePoint_1
Salesforce_1
Windows Desktop or Server_1
ESP 8266_1
WordPress_1
Gaming console_1
Mac OS_1
Mainframe_1
IBM Cloud or Watson_1
Apple Watch or Apple TV_1
ESP8266_1
Torch/PyTorch
Hadoop
.NET Core
Django
React
Angular
Cordova
Spring
Spark
TensorFlow
Xamarin
Node.js
Torch/PyTorch_1
Hadoop_1
.NET Core_1
Django_1
Angular_1
React_1
Spark_1
Spring_1
Cordova_1
TensorFlow_1
Xamarin_1
Node.js_1
Zend
PHPStorm
Sublime Text
Komodo
Android Studio
Xcode
PyCharm


Internal Wikis, chat rooms, or documentation set up by my company for employees_max_sal
Questions & answers on Stack Overflow_avg_sal
Questions & answers on Stack Overflow_min_sal
Questions & answers on Stack Overflow_max_sal
Pre-scheduled tutoring or mentoring sessions with a friend or colleague_avg_sal
Pre-scheduled tutoring or mentoring sessions with a friend or colleague_min_sal
Pre-scheduled tutoring or mentoring sessions with a friend or colleague_max_sal
A book or e-book from O’Reilly, Apress, or a similar publisher_avg_sal
A book or e-book from O’Reilly, Apress, or a similar publisher_min_sal
A book or e-book from O’Reilly, Apress, or a similar publisher_max_sal
The official documentation and/or standards for the technology_avg_sal
The official documentation and/or standards for the technology_min_sal
The official documentation and/or standards for the technology_max_sal
Online developer communities other than Stack Overflow (ex. forums, listservs, IRC channels, etc.)_avg_sal
O

Drupal_min_sal
Drupal_max_sal
Amazon Echo_avg_sal
Amazon Echo_min_sal
Amazon Echo_max_sal
Linux_avg_sal
Linux_min_sal
Linux_max_sal
Arduino_avg_sal
Arduino_min_sal
Arduino_max_sal
Heroku_avg_sal
Heroku_min_sal
Heroku_max_sal
Android_avg_sal
Android_min_sal
Android_max_sal
AWS_avg_sal
AWS_min_sal
AWS_max_sal
Firebase_avg_sal
Firebase_min_sal
Firebase_max_sal
Google Home_avg_sal
Google Home_min_sal
Google Home_max_sal
Predix_avg_sal
Predix_min_sal
Predix_max_sal
Azure_avg_sal
Azure_min_sal
Azure_max_sal
iOS_avg_sal
iOS_min_sal
iOS_max_sal
Google Cloud Platform/App Engine_avg_sal
Google Cloud Platform/App Engine_min_sal
Google Cloud Platform/App Engine_max_sal
Raspberry Pi_avg_sal
Raspberry Pi_min_sal
Raspberry Pi_max_sal
Windows Phone_avg_sal
Windows Phone_min_sal
Windows Phone_max_sal
Serverless_avg_sal
Serverless_min_sal
Serverless_max_sal
SharePoint_avg_sal
SharePoint_min_sal
SharePoint_max_sal
Salesforce_avg_sal
Salesforce_min_sal
Salesforce_max_sal
Windows Desktop or Server_avg_sal


Female_avg_sal
Female_min_sal
Female_max_sal
Transgender_avg_sal
Transgender_min_sal
Transgender_max_sal
Non-binary, genderqueer, or gender non-conforming_avg_sal
Non-binary, genderqueer, or gender non-conforming_min_sal
Non-binary, genderqueer, or gender non-conforming_max_sal
Male_avg_sal
Male_min_sal
Male_max_sal
Asexual_avg_sal
Asexual_min_sal
Asexual_max_sal
Bisexual or Queer_avg_sal
Bisexual or Queer_min_sal
Bisexual or Queer_max_sal
Gay or Lesbian_avg_sal
Gay or Lesbian_min_sal
Gay or Lesbian_max_sal
Straight or heterosexual_avg_sal
Straight or heterosexual_min_sal
Straight or heterosexual_max_sal
Hispanic or Latino/Latina_avg_sal
Hispanic or Latino/Latina_min_sal
Hispanic or Latino/Latina_max_sal
White or of European descent_avg_sal
White or of European descent_min_sal
White or of European descent_max_sal
Black or of African descent_avg_sal
Black or of African descent_min_sal
Black or of African descent_max_sal
Middle Eastern_avg_sal
Middle Eastern_min_sal
Middle Eastern_max_sa

CurrencySymbol_TTD
CurrencySymbol_TWD
CurrencySymbol_TZS
CurrencySymbol_UAH
CurrencySymbol_UGX
CurrencySymbol_USD
CurrencySymbol_UYU
CurrencySymbol_VND
CurrencySymbol_ZAR
TimeFullyProductive_Less than a month
TimeFullyProductive_More than a year
TimeFullyProductive_Nine months to a year
TimeFullyProductive_One to three months
TimeFullyProductive_Six to nine months
TimeFullyProductive_Three to six months
AgreeDisagree1_Agree
AgreeDisagree1_Disagree
AgreeDisagree1_Neither Agree nor Disagree
AgreeDisagree1_Strongly agree
AgreeDisagree1_Strongly disagree
AgreeDisagree2_Agree
AgreeDisagree2_Disagree
AgreeDisagree2_Neither Agree nor Disagree
AgreeDisagree2_Strongly agree
AgreeDisagree2_Strongly disagree
AgreeDisagree3_Agree
AgreeDisagree3_Disagree
AgreeDisagree3_Neither Agree nor Disagree
AgreeDisagree3_Strongly agree
AgreeDisagree3_Strongly disagree
OperatingSystem_BSD/Unix
OperatingSystem_Linux-based
OperatingSystem_MacOS
OperatingSystem_Windows
NumberMonitors_1
NumberMonitors_2
NumberMoni

In [94]:
for col in test_dataset.columns:
    print (col)
    namecol='squared_of_'+col
    test_dataset[namecol]=test_dataset[col]**2

AssessJob1
AssessJob2
AssessJob3
AssessJob4
AssessJob5
AssessJob6
AssessJob7
AssessJob8
AssessJob9
AssessJob10
AssessBenefits1
AssessBenefits2
AssessBenefits3
AssessBenefits4
AssessBenefits5
AssessBenefits6
AssessBenefits7
AssessBenefits8
AssessBenefits9
AssessBenefits10
AssessBenefits11
JobContactPriorities1
JobContactPriorities2
JobContactPriorities3
JobContactPriorities4
JobContactPriorities5
JobEmailPriorities1
JobEmailPriorities2
JobEmailPriorities3
JobEmailPriorities4
JobEmailPriorities5
JobEmailPriorities6
JobEmailPriorities7
AdsPriorities1
AdsPriorities2
AdsPriorities3
AdsPriorities4
AdsPriorities5
AdsPriorities6
AdsPriorities7
avg_sal
min_sal
max_sal
count_sal
mode_sal
std_sal
YearsCoding_avg_sal
YearsCoding_min_sal
YearsCoding_max_sal
YearsCoding_count_sal
YearsCoding_std_sal
RaceEthnicity_avg_sal
RaceEthnicity_min_sal
RaceEthnicity_max_sal
RaceEthnicity_count_sal
RaceEthnicity_std_sal
YearsCodingProf_avg_sal
YearsCodingProf_min_sal
YearsCodingProf_max_sal
YearsCodingProf_std

Salesforce_1
Windows Desktop or Server_1
ESP 8266_1
WordPress_1
Gaming console_1
Mac OS_1
Mainframe_1
IBM Cloud or Watson_1
Apple Watch or Apple TV_1
ESP8266_1
Torch/PyTorch
Hadoop
.NET Core
Django
React
Angular
Cordova
Spring
Spark
TensorFlow
Xamarin
Node.js
Torch/PyTorch_1
Hadoop_1
.NET Core_1
Django_1
Angular_1
React_1
Spark_1
Spring_1
Cordova_1
TensorFlow_1
Xamarin_1
Node.js_1
Zend
PHPStorm
Sublime Text
Komodo
Android Studio
Xcode
PyCharm
Eclipse
Atom
IPython / Jupyter
Emacs
RStudio
Visual Studio
Visual Studio Code
Light Table
Notepad++
NetBeans
TextMate
RubyMine
Coda
Vim
IntelliJ
Agile
Pair programming
Scrum
Evidence-based software engineering
Formal standard such as ISO 9001 or IEEE 12207 (aka “waterfall” methodologies)
Mob programming
Kanban
Extreme programming (XP)
Lean
PRINCE2
Copying and pasting files to network shares
Mercurial
Team Foundation Version Control
Git
Subversion
I don't use version control
Zip file back-ups
The website I was visiting has interesting ads
The websi

Perl_min_sal
Perl_max_sal
Lua_avg_sal
Lua_min_sal
Lua_max_sal
R_avg_sal
R_min_sal
R_max_sal
Hack_avg_sal
Hack_min_sal
Hack_max_sal
VB.NET_avg_sal
VB.NET_min_sal
VB.NET_max_sal
JavaScript_avg_sal
JavaScript_min_sal
JavaScript_max_sal
C++_avg_sal
C++_min_sal
C++_max_sal
Assembly_avg_sal
Assembly_min_sal
Assembly_max_sal
Objective-C_avg_sal
Objective-C_min_sal
Objective-C_max_sal
Kotlin_avg_sal
Kotlin_min_sal
Kotlin_max_sal
Java_avg_sal
Java_min_sal
Java_max_sal
HTML_avg_sal
HTML_min_sal
HTML_max_sal
SQL_avg_sal
SQL_min_sal
SQL_max_sal
Julia_avg_sal
Julia_min_sal
Julia_max_sal
C#_avg_sal
C#_min_sal
C#_max_sal
Python_avg_sal
Python_min_sal
Python_max_sal
Rust_avg_sal
Rust_min_sal
Rust_max_sal
Ocaml_avg_sal
Ocaml_min_sal
Ocaml_max_sal
Swift_1_avg_sal
Swift_1_min_sal
Swift_1_max_sal
Ruby_1_avg_sal
Ruby_1_min_sal
Ruby_1_max_sal
Scala_1_avg_sal
Scala_1_min_sal
Scala_1_max_sal
CoffeeScript_1_avg_sal
CoffeeScript_1_min_sal
CoffeeScript_1_max_sal
Bash/Shell_1_avg_sal
Bash/Shell_1_min_sal
Bash/She

Node.js_max_sal
Torch/PyTorch_1_avg_sal
Torch/PyTorch_1_min_sal
Torch/PyTorch_1_max_sal
Hadoop_1_avg_sal
Hadoop_1_min_sal
Hadoop_1_max_sal
.NET Core_1_avg_sal
.NET Core_1_min_sal
.NET Core_1_max_sal
Django_1_avg_sal
Django_1_min_sal
Django_1_max_sal
Angular_1_avg_sal
Angular_1_min_sal
Angular_1_max_sal
React_1_avg_sal
React_1_min_sal
React_1_max_sal
Spark_1_avg_sal
Spark_1_min_sal
Spark_1_max_sal
Spring_1_avg_sal
Spring_1_min_sal
Spring_1_max_sal
Cordova_1_avg_sal
Cordova_1_min_sal
Cordova_1_max_sal
TensorFlow_1_avg_sal
TensorFlow_1_min_sal
TensorFlow_1_max_sal
Xamarin_1_avg_sal
Xamarin_1_min_sal
Xamarin_1_max_sal
Node.js_1_avg_sal
Node.js_1_min_sal
Node.js_1_max_sal
Zend_avg_sal
Zend_min_sal
Zend_max_sal
PHPStorm_avg_sal
PHPStorm_min_sal
PHPStorm_max_sal
Sublime Text_avg_sal
Sublime Text_min_sal
Sublime Text_max_sal
Komodo_avg_sal
Komodo_min_sal
Komodo_max_sal
Android Studio_avg_sal
Android Studio_min_sal
Android Studio_max_sal
Xcode_avg_sal
Xcode_min_sal
Xcode_max_sal
PyCharm_avg_sal

HopeFiveYears_Retirement
HopeFiveYears_Working as a founder or co-founder of my own company
HopeFiveYears_Working as a product manager or project manager
HopeFiveYears_Working as an engineering manager or other functional manager
HopeFiveYears_Working in a career completely unrelated to software development
HopeFiveYears_Working in a different or more specialized technical role than the one I'm in now
JobSearchStatus_I am actively looking for a job
JobSearchStatus_I am not interested in new job opportunities
JobSearchStatus_I’m not actively looking, but I am open to new opportunities
LastNewJob_Between 1 and 2 years ago
LastNewJob_Between 2 and 4 years ago
LastNewJob_I've never had a job
LastNewJob_Less than a year ago
LastNewJob_More than 4 years ago
UpdateCV_A friend told me about a job opportunity
UpdateCV_A recruiter contacted me
UpdateCV_I did not receive an expected change in compensation
UpdateCV_I had a negative experience or interaction at work
UpdateCV_I received bad news abo

In [95]:
print('Training dataset shape: ', train_dataset.shape)
#print('Training label shape: ', y_train.shape)
print('Testing dataset shape: ', test_dataset.shape)

Training dataset shape:  (33076, 4100)
Testing dataset shape:  (11259, 4100)


# 2. TRAINING

In [96]:
from sklearn.utils.validation import column_or_1d

In [97]:
X_train= train_dataset
y_train= target.astype(int)
X_test= test_dataset

In [98]:
y_train.dtype

dtype('int32')

In [99]:
import re
X_train = X_train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
X_test = X_test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

### Split training set to validation set

In [100]:
params = {
    'objective'         : 'regression',    
    'metric'            : 'l2', 
    'nthread'           : 4,
    'learning_rate'     : 0.001,

    'num_leaves'        : 23,
    'feature_fraction'  : 0.106,
    'bagging_fraction'  : 0.825,
    'max_depth'         : -1,
    'lambda_l1'         : 0.1,
    'lambda_l2'         : 2.5,
    'min_split_gain'    : 0.007,
}

# # Create parameters to search
# gridParams = {
#     'lambda_l1' : [0.1,0.2,0.3],
#     'lambda_l2' : [2.5,2.7,2.9],
#     }

# # Create REGRESSOR to use. Note that parameters have to be input manually
# # not as a dict!
# mdl = lgb.LGBMRegressor(
#           objective = params['objective'],
#           metric = params['metric'],
#           nthread = params['nthread'],
#           learning_rate = params['learning_rate'],
#           num_leaves = params['num_leaves'],
#           feature_fraction = params['feature_fraction'],
#           bagging_fraction = params['bagging_fraction'],
#           max_depth = params['max_depth'],
#           lambda_l1 = params['lambda_l1'],
#           lambda_l2 = params['lambda_l2'],
#           min_split_gain = params['min_split_gain'])

# # To view the default model params:
# mdl.get_params().keys()

# # Create the grid
# grid = GridSearchCV(mdl, gridParams,
#                     verbose=0,
#                     cv=4,
#                     n_jobs=2)

# # Run the grid
# grid.fit(X_train, y_train)

# # Print the best parameters found
# print(grid.best_params_)
# print(grid.best_score_)

# # Using parameters already set above, replace in the best from the grid search
# params['lambda_l1'] = grid.best_params_['lambda_l1']
# params['lambda_l2'] = grid.best_params_['lambda_l2']

# print('Fitting with params: ')
# print(params)

In [101]:
skf = StratifiedKFold(n_splits=4, random_state=3462873, shuffle=True)
preds = 0.0
for itrain, ivalid in skf.split(X_train, y_train):
    lgb_train = lgb.Dataset(X_train.iloc[itrain], y_train.iloc[itrain])
    lgb_eval  = lgb.Dataset(X_train.iloc[ivalid], y_train.iloc[ivalid], reference = lgb_train)
    model = lgb.train(params,
                lgb_train,
                num_boost_round = 999999,  
                early_stopping_rounds = 800,
                verbose_eval = 100,
                valid_sets = [lgb_train, lgb_eval])
    pred = model.predict(X_test)
    preds += pred/skf.n_splits



Training until validation scores don't improve for 800 rounds
[100]	training's l2: 1.74247e+09	valid_1's l2: 1.71803e+09
[200]	training's l2: 1.55881e+09	valid_1's l2: 1.5352e+09
[300]	training's l2: 1.40287e+09	valid_1's l2: 1.38021e+09
[400]	training's l2: 1.27112e+09	valid_1's l2: 1.24954e+09
[500]	training's l2: 1.16134e+09	valid_1's l2: 1.14071e+09
[600]	training's l2: 1.06859e+09	valid_1's l2: 1.049e+09
[700]	training's l2: 9.90184e+08	valid_1's l2: 9.71515e+08
[800]	training's l2: 9.2295e+08	valid_1's l2: 9.05372e+08
[900]	training's l2: 8.66074e+08	valid_1's l2: 8.49719e+08
[1000]	training's l2: 8.17416e+08	valid_1's l2: 8.02303e+08
[1100]	training's l2: 7.75638e+08	valid_1's l2: 7.61656e+08
[1200]	training's l2: 7.39616e+08	valid_1's l2: 7.26743e+08
[1300]	training's l2: 7.08244e+08	valid_1's l2: 6.96512e+08
[1400]	training's l2: 6.81234e+08	valid_1's l2: 6.70561e+08
[1500]	training's l2: 6.57501e+08	valid_1's l2: 6.47855e+08
[1600]	training's l2: 6.36546e+08	valid_1's l2: 6.2

[13700]	training's l2: 3.21593e+08	valid_1's l2: 4.05767e+08
[13800]	training's l2: 3.20898e+08	valid_1's l2: 4.05639e+08
[13900]	training's l2: 3.20222e+08	valid_1's l2: 4.05514e+08
[14000]	training's l2: 3.19532e+08	valid_1's l2: 4.05379e+08
[14100]	training's l2: 3.1885e+08	valid_1's l2: 4.05243e+08
[14200]	training's l2: 3.18163e+08	valid_1's l2: 4.05128e+08
[14300]	training's l2: 3.17482e+08	valid_1's l2: 4.05003e+08
[14400]	training's l2: 3.16822e+08	valid_1's l2: 4.04888e+08
[14500]	training's l2: 3.16159e+08	valid_1's l2: 4.04777e+08
[14600]	training's l2: 3.15515e+08	valid_1's l2: 4.04654e+08
[14700]	training's l2: 3.14859e+08	valid_1's l2: 4.04558e+08
[14800]	training's l2: 3.14209e+08	valid_1's l2: 4.04448e+08
[14900]	training's l2: 3.13568e+08	valid_1's l2: 4.04347e+08
[15000]	training's l2: 3.12914e+08	valid_1's l2: 4.04246e+08
[15100]	training's l2: 3.12277e+08	valid_1's l2: 4.04182e+08
[15200]	training's l2: 3.11651e+08	valid_1's l2: 4.04093e+08
[15300]	training's l2: 3.

[27200]	training's l2: 2.54998e+08	valid_1's l2: 3.98132e+08
[27300]	training's l2: 2.5462e+08	valid_1's l2: 3.98101e+08
[27400]	training's l2: 2.54238e+08	valid_1's l2: 3.98065e+08
[27500]	training's l2: 2.53855e+08	valid_1's l2: 3.98033e+08
[27600]	training's l2: 2.53475e+08	valid_1's l2: 3.98009e+08
[27700]	training's l2: 2.53097e+08	valid_1's l2: 3.97985e+08
[27800]	training's l2: 2.52724e+08	valid_1's l2: 3.97962e+08
[27900]	training's l2: 2.52359e+08	valid_1's l2: 3.97937e+08
[28000]	training's l2: 2.51978e+08	valid_1's l2: 3.97902e+08
[28100]	training's l2: 2.51605e+08	valid_1's l2: 3.97881e+08
[28200]	training's l2: 2.51244e+08	valid_1's l2: 3.97863e+08
[28300]	training's l2: 2.50872e+08	valid_1's l2: 3.9783e+08
[28400]	training's l2: 2.50514e+08	valid_1's l2: 3.97798e+08
[28500]	training's l2: 2.50136e+08	valid_1's l2: 3.9777e+08
[28600]	training's l2: 2.4977e+08	valid_1's l2: 3.9773e+08
[28700]	training's l2: 2.49407e+08	valid_1's l2: 3.97704e+08
[28800]	training's l2: 2.4904

[40700]	training's l2: 2.10684e+08	valid_1's l2: 3.95135e+08
[40800]	training's l2: 2.10402e+08	valid_1's l2: 3.9512e+08
[40900]	training's l2: 2.10115e+08	valid_1's l2: 3.95105e+08
[41000]	training's l2: 2.09829e+08	valid_1's l2: 3.95091e+08
[41100]	training's l2: 2.09541e+08	valid_1's l2: 3.9508e+08
[41200]	training's l2: 2.09256e+08	valid_1's l2: 3.9507e+08
[41300]	training's l2: 2.0897e+08	valid_1's l2: 3.95054e+08
[41400]	training's l2: 2.08696e+08	valid_1's l2: 3.95039e+08
[41500]	training's l2: 2.08414e+08	valid_1's l2: 3.95021e+08
[41600]	training's l2: 2.08138e+08	valid_1's l2: 3.95009e+08
[41700]	training's l2: 2.07861e+08	valid_1's l2: 3.94991e+08
[41800]	training's l2: 2.07577e+08	valid_1's l2: 3.94981e+08
[41900]	training's l2: 2.07293e+08	valid_1's l2: 3.94957e+08
[42000]	training's l2: 2.07017e+08	valid_1's l2: 3.94939e+08
[42100]	training's l2: 2.06734e+08	valid_1's l2: 3.94931e+08
[42200]	training's l2: 2.06456e+08	valid_1's l2: 3.94909e+08
[42300]	training's l2: 2.061

[54200]	training's l2: 1.76358e+08	valid_1's l2: 3.93444e+08
[54300]	training's l2: 1.76136e+08	valid_1's l2: 3.93439e+08
[54400]	training's l2: 1.75908e+08	valid_1's l2: 3.93426e+08
[54500]	training's l2: 1.75684e+08	valid_1's l2: 3.93418e+08
[54600]	training's l2: 1.75459e+08	valid_1's l2: 3.93405e+08
[54700]	training's l2: 1.75242e+08	valid_1's l2: 3.93394e+08
[54800]	training's l2: 1.75021e+08	valid_1's l2: 3.93381e+08
[54900]	training's l2: 1.74795e+08	valid_1's l2: 3.93372e+08
[55000]	training's l2: 1.74573e+08	valid_1's l2: 3.93366e+08
[55100]	training's l2: 1.74351e+08	valid_1's l2: 3.93352e+08
[55200]	training's l2: 1.74126e+08	valid_1's l2: 3.93336e+08
[55300]	training's l2: 1.73905e+08	valid_1's l2: 3.93321e+08
[55400]	training's l2: 1.73686e+08	valid_1's l2: 3.93309e+08
[55500]	training's l2: 1.73469e+08	valid_1's l2: 3.93299e+08
[55600]	training's l2: 1.73255e+08	valid_1's l2: 3.93298e+08
[55700]	training's l2: 1.73029e+08	valid_1's l2: 3.93281e+08
[55800]	training's l2: 1

[67700]	training's l2: 1.48991e+08	valid_1's l2: 3.92118e+08
[67800]	training's l2: 1.48811e+08	valid_1's l2: 3.92106e+08
[67900]	training's l2: 1.48618e+08	valid_1's l2: 3.92097e+08
[68000]	training's l2: 1.48427e+08	valid_1's l2: 3.92084e+08
[68100]	training's l2: 1.48236e+08	valid_1's l2: 3.92069e+08
[68200]	training's l2: 1.48059e+08	valid_1's l2: 3.92059e+08
[68300]	training's l2: 1.47871e+08	valid_1's l2: 3.92046e+08
[68400]	training's l2: 1.47697e+08	valid_1's l2: 3.92042e+08
[68500]	training's l2: 1.4751e+08	valid_1's l2: 3.92034e+08
[68600]	training's l2: 1.47328e+08	valid_1's l2: 3.92023e+08
[68700]	training's l2: 1.4715e+08	valid_1's l2: 3.92018e+08
[68800]	training's l2: 1.46968e+08	valid_1's l2: 3.92019e+08
[68900]	training's l2: 1.46795e+08	valid_1's l2: 3.92016e+08
[69000]	training's l2: 1.46619e+08	valid_1's l2: 3.92006e+08
[69100]	training's l2: 1.46437e+08	valid_1's l2: 3.91997e+08
[69200]	training's l2: 1.46251e+08	valid_1's l2: 3.91992e+08
[69300]	training's l2: 1.4

[81200]	training's l2: 1.26787e+08	valid_1's l2: 3.91335e+08
[81300]	training's l2: 1.26637e+08	valid_1's l2: 3.91327e+08
[81400]	training's l2: 1.26493e+08	valid_1's l2: 3.91325e+08
[81500]	training's l2: 1.26349e+08	valid_1's l2: 3.91322e+08
[81600]	training's l2: 1.26196e+08	valid_1's l2: 3.91316e+08
[81700]	training's l2: 1.26049e+08	valid_1's l2: 3.91312e+08
[81800]	training's l2: 1.25901e+08	valid_1's l2: 3.91314e+08
[81900]	training's l2: 1.25753e+08	valid_1's l2: 3.91311e+08
[82000]	training's l2: 1.25606e+08	valid_1's l2: 3.91305e+08
[82100]	training's l2: 1.25458e+08	valid_1's l2: 3.91303e+08
[82200]	training's l2: 1.25308e+08	valid_1's l2: 3.91295e+08
[82300]	training's l2: 1.25164e+08	valid_1's l2: 3.91294e+08
[82400]	training's l2: 1.25018e+08	valid_1's l2: 3.91288e+08
[82500]	training's l2: 1.24876e+08	valid_1's l2: 3.91284e+08
[82600]	training's l2: 1.24729e+08	valid_1's l2: 3.91279e+08
[82700]	training's l2: 1.24593e+08	valid_1's l2: 3.91273e+08
[82800]	training's l2: 1

[94700]	training's l2: 1.08441e+08	valid_1's l2: 3.90878e+08
[94800]	training's l2: 1.08321e+08	valid_1's l2: 3.90874e+08
[94900]	training's l2: 1.08194e+08	valid_1's l2: 3.90866e+08
[95000]	training's l2: 1.08069e+08	valid_1's l2: 3.90864e+08
[95100]	training's l2: 1.07945e+08	valid_1's l2: 3.90872e+08
[95200]	training's l2: 1.07817e+08	valid_1's l2: 3.90873e+08
[95300]	training's l2: 1.07692e+08	valid_1's l2: 3.90869e+08
[95400]	training's l2: 1.0757e+08	valid_1's l2: 3.90863e+08
[95500]	training's l2: 1.07445e+08	valid_1's l2: 3.90864e+08
[95600]	training's l2: 1.0732e+08	valid_1's l2: 3.90856e+08
[95700]	training's l2: 1.07194e+08	valid_1's l2: 3.90853e+08
[95800]	training's l2: 1.07062e+08	valid_1's l2: 3.90845e+08
[95900]	training's l2: 1.06943e+08	valid_1's l2: 3.90844e+08
[96000]	training's l2: 1.06814e+08	valid_1's l2: 3.90836e+08
[96100]	training's l2: 1.0669e+08	valid_1's l2: 3.90836e+08
[96200]	training's l2: 1.06567e+08	valid_1's l2: 3.90834e+08
[96300]	training's l2: 1.06

[9400]	training's l2: 3.5096e+08	valid_1's l2: 4.39527e+08
[9500]	training's l2: 3.49924e+08	valid_1's l2: 4.39157e+08
[9600]	training's l2: 3.48899e+08	valid_1's l2: 4.38807e+08
[9700]	training's l2: 3.47899e+08	valid_1's l2: 4.38453e+08
[9800]	training's l2: 3.46902e+08	valid_1's l2: 4.3812e+08
[9900]	training's l2: 3.45929e+08	valid_1's l2: 4.3777e+08
[10000]	training's l2: 3.44964e+08	valid_1's l2: 4.37477e+08
[10100]	training's l2: 3.43994e+08	valid_1's l2: 4.37162e+08
[10200]	training's l2: 3.43041e+08	valid_1's l2: 4.36841e+08
[10300]	training's l2: 3.4211e+08	valid_1's l2: 4.36553e+08
[10400]	training's l2: 3.41181e+08	valid_1's l2: 4.3627e+08
[10500]	training's l2: 3.4026e+08	valid_1's l2: 4.35961e+08
[10600]	training's l2: 3.39356e+08	valid_1's l2: 4.35656e+08
[10700]	training's l2: 3.38476e+08	valid_1's l2: 4.35392e+08
[10800]	training's l2: 3.37589e+08	valid_1's l2: 4.35113e+08
[10900]	training's l2: 3.36694e+08	valid_1's l2: 4.34848e+08
[11000]	training's l2: 3.35818e+08	v

[22900]	training's l2: 2.65729e+08	valid_1's l2: 4.21731e+08
[23000]	training's l2: 2.65306e+08	valid_1's l2: 4.21678e+08
[23100]	training's l2: 2.64884e+08	valid_1's l2: 4.21635e+08
[23200]	training's l2: 2.64452e+08	valid_1's l2: 4.216e+08
[23300]	training's l2: 2.64036e+08	valid_1's l2: 4.21582e+08
[23400]	training's l2: 2.63607e+08	valid_1's l2: 4.21542e+08
[23500]	training's l2: 2.63181e+08	valid_1's l2: 4.21506e+08
[23600]	training's l2: 2.62751e+08	valid_1's l2: 4.21461e+08
[23700]	training's l2: 2.62325e+08	valid_1's l2: 4.21438e+08
[23800]	training's l2: 2.6191e+08	valid_1's l2: 4.21391e+08
[23900]	training's l2: 2.61494e+08	valid_1's l2: 4.21364e+08
[24000]	training's l2: 2.61088e+08	valid_1's l2: 4.21324e+08
[24100]	training's l2: 2.60685e+08	valid_1's l2: 4.21285e+08
[24200]	training's l2: 2.60273e+08	valid_1's l2: 4.21262e+08
[24300]	training's l2: 2.59872e+08	valid_1's l2: 4.21223e+08
[24400]	training's l2: 2.59469e+08	valid_1's l2: 4.21186e+08
[24500]	training's l2: 2.59

[36400]	training's l2: 2.17566e+08	valid_1's l2: 4.17906e+08
[36500]	training's l2: 2.17271e+08	valid_1's l2: 4.17877e+08
[36600]	training's l2: 2.16977e+08	valid_1's l2: 4.17847e+08
[36700]	training's l2: 2.16676e+08	valid_1's l2: 4.17834e+08
[36800]	training's l2: 2.16368e+08	valid_1's l2: 4.17805e+08
[36900]	training's l2: 2.16056e+08	valid_1's l2: 4.17801e+08
[37000]	training's l2: 2.15757e+08	valid_1's l2: 4.17799e+08
[37100]	training's l2: 2.15457e+08	valid_1's l2: 4.17786e+08
[37200]	training's l2: 2.15158e+08	valid_1's l2: 4.17778e+08
[37300]	training's l2: 2.14871e+08	valid_1's l2: 4.1776e+08
[37400]	training's l2: 2.14574e+08	valid_1's l2: 4.17747e+08
[37500]	training's l2: 2.14274e+08	valid_1's l2: 4.1773e+08
[37600]	training's l2: 2.13974e+08	valid_1's l2: 4.17702e+08
[37700]	training's l2: 2.13677e+08	valid_1's l2: 4.17696e+08
[37800]	training's l2: 2.13376e+08	valid_1's l2: 4.17669e+08
[37900]	training's l2: 2.13077e+08	valid_1's l2: 4.17655e+08
[38000]	training's l2: 2.1

[49900]	training's l2: 1.81255e+08	valid_1's l2: 4.15994e+08
[50000]	training's l2: 1.81021e+08	valid_1's l2: 4.15984e+08
[50100]	training's l2: 1.80786e+08	valid_1's l2: 4.15968e+08
[50200]	training's l2: 1.80557e+08	valid_1's l2: 4.15955e+08
[50300]	training's l2: 1.80327e+08	valid_1's l2: 4.1594e+08
[50400]	training's l2: 1.80092e+08	valid_1's l2: 4.15927e+08
[50500]	training's l2: 1.79857e+08	valid_1's l2: 4.15908e+08
[50600]	training's l2: 1.79619e+08	valid_1's l2: 4.15895e+08
[50700]	training's l2: 1.79383e+08	valid_1's l2: 4.15884e+08
[50800]	training's l2: 1.79154e+08	valid_1's l2: 4.15883e+08
[50900]	training's l2: 1.7893e+08	valid_1's l2: 4.15876e+08
[51000]	training's l2: 1.78696e+08	valid_1's l2: 4.15867e+08
[51100]	training's l2: 1.78469e+08	valid_1's l2: 4.15853e+08
[51200]	training's l2: 1.78236e+08	valid_1's l2: 4.1584e+08
[51300]	training's l2: 1.78007e+08	valid_1's l2: 4.15827e+08
[51400]	training's l2: 1.77772e+08	valid_1's l2: 4.15812e+08
[51500]	training's l2: 1.77

[63400]	training's l2: 1.5278e+08	valid_1's l2: 4.15e+08
[63500]	training's l2: 1.52591e+08	valid_1's l2: 4.15001e+08
[63600]	training's l2: 1.52398e+08	valid_1's l2: 4.14993e+08
[63700]	training's l2: 1.52204e+08	valid_1's l2: 4.14978e+08
[63800]	training's l2: 1.52014e+08	valid_1's l2: 4.14966e+08
[63900]	training's l2: 1.51828e+08	valid_1's l2: 4.1496e+08
[64000]	training's l2: 1.51645e+08	valid_1's l2: 4.14957e+08
[64100]	training's l2: 1.51452e+08	valid_1's l2: 4.14955e+08
[64200]	training's l2: 1.51264e+08	valid_1's l2: 4.14949e+08
[64300]	training's l2: 1.51072e+08	valid_1's l2: 4.14946e+08
[64400]	training's l2: 1.50881e+08	valid_1's l2: 4.14941e+08
[64500]	training's l2: 1.50695e+08	valid_1's l2: 4.14933e+08
[64600]	training's l2: 1.50506e+08	valid_1's l2: 4.14934e+08
[64700]	training's l2: 1.50323e+08	valid_1's l2: 4.14935e+08
[64800]	training's l2: 1.50139e+08	valid_1's l2: 4.14931e+08
[64900]	training's l2: 1.49955e+08	valid_1's l2: 4.1493e+08
[65000]	training's l2: 1.49771

[4200]	training's l2: 4.41886e+08	valid_1's l2: 4.66671e+08
[4300]	training's l2: 4.38882e+08	valid_1's l2: 4.64659e+08
[4400]	training's l2: 4.36052e+08	valid_1's l2: 4.62794e+08
[4500]	training's l2: 4.33292e+08	valid_1's l2: 4.61015e+08
[4600]	training's l2: 4.30609e+08	valid_1's l2: 4.59294e+08
[4700]	training's l2: 4.27988e+08	valid_1's l2: 4.57573e+08
[4800]	training's l2: 4.25493e+08	valid_1's l2: 4.56022e+08
[4900]	training's l2: 4.23074e+08	valid_1's l2: 4.54508e+08
[5000]	training's l2: 4.20713e+08	valid_1's l2: 4.53063e+08
[5100]	training's l2: 4.18397e+08	valid_1's l2: 4.51631e+08
[5200]	training's l2: 4.1623e+08	valid_1's l2: 4.50317e+08
[5300]	training's l2: 4.14091e+08	valid_1's l2: 4.49038e+08
[5400]	training's l2: 4.12018e+08	valid_1's l2: 4.47833e+08
[5500]	training's l2: 4.10013e+08	valid_1's l2: 4.46662e+08
[5600]	training's l2: 4.08016e+08	valid_1's l2: 4.45504e+08
[5700]	training's l2: 4.06113e+08	valid_1's l2: 4.4443e+08
[5800]	training's l2: 4.04217e+08	valid_1'

[17800]	training's l2: 2.94823e+08	valid_1's l2: 4.07295e+08
[17900]	training's l2: 2.94291e+08	valid_1's l2: 4.07216e+08
[18000]	training's l2: 2.9377e+08	valid_1's l2: 4.07138e+08
[18100]	training's l2: 2.93258e+08	valid_1's l2: 4.07053e+08
[18200]	training's l2: 2.92737e+08	valid_1's l2: 4.06983e+08
[18300]	training's l2: 2.92196e+08	valid_1's l2: 4.06901e+08
[18400]	training's l2: 2.91666e+08	valid_1's l2: 4.06837e+08
[18500]	training's l2: 2.91138e+08	valid_1's l2: 4.06769e+08
[18600]	training's l2: 2.9062e+08	valid_1's l2: 4.06699e+08
[18700]	training's l2: 2.90118e+08	valid_1's l2: 4.06647e+08
[18800]	training's l2: 2.89602e+08	valid_1's l2: 4.0658e+08
[18900]	training's l2: 2.89087e+08	valid_1's l2: 4.0651e+08
[19000]	training's l2: 2.88557e+08	valid_1's l2: 4.06436e+08
[19100]	training's l2: 2.88034e+08	valid_1's l2: 4.06375e+08
[19200]	training's l2: 2.87514e+08	valid_1's l2: 4.06292e+08
[19300]	training's l2: 2.87011e+08	valid_1's l2: 4.06236e+08
[19400]	training's l2: 2.865

[31300]	training's l2: 2.38171e+08	valid_1's l2: 4.01765e+08
[31400]	training's l2: 2.3783e+08	valid_1's l2: 4.0174e+08
[31500]	training's l2: 2.37481e+08	valid_1's l2: 4.01725e+08
[31600]	training's l2: 2.37145e+08	valid_1's l2: 4.01702e+08
[31700]	training's l2: 2.36808e+08	valid_1's l2: 4.01678e+08
[31800]	training's l2: 2.36464e+08	valid_1's l2: 4.01656e+08
[31900]	training's l2: 2.36128e+08	valid_1's l2: 4.01635e+08
[32000]	training's l2: 2.35813e+08	valid_1's l2: 4.01612e+08
[32100]	training's l2: 2.35479e+08	valid_1's l2: 4.01579e+08
[32200]	training's l2: 2.35138e+08	valid_1's l2: 4.01557e+08
[32300]	training's l2: 2.34812e+08	valid_1's l2: 4.01533e+08
[32400]	training's l2: 2.34469e+08	valid_1's l2: 4.01505e+08
[32500]	training's l2: 2.34141e+08	valid_1's l2: 4.01484e+08
[32600]	training's l2: 2.3381e+08	valid_1's l2: 4.01464e+08
[32700]	training's l2: 2.33475e+08	valid_1's l2: 4.01433e+08
[32800]	training's l2: 2.33138e+08	valid_1's l2: 4.01408e+08
[32900]	training's l2: 2.32

[44800]	training's l2: 1.9814e+08	valid_1's l2: 3.98987e+08
[44900]	training's l2: 1.97871e+08	valid_1's l2: 3.98973e+08
[45000]	training's l2: 1.9762e+08	valid_1's l2: 3.98952e+08
[45100]	training's l2: 1.97357e+08	valid_1's l2: 3.98937e+08
[45200]	training's l2: 1.97099e+08	valid_1's l2: 3.98919e+08
[45300]	training's l2: 1.96845e+08	valid_1's l2: 3.98902e+08
[45400]	training's l2: 1.96587e+08	valid_1's l2: 3.98883e+08
[45500]	training's l2: 1.96328e+08	valid_1's l2: 3.98865e+08
[45600]	training's l2: 1.96078e+08	valid_1's l2: 3.98842e+08
[45700]	training's l2: 1.95822e+08	valid_1's l2: 3.98821e+08
[45800]	training's l2: 1.95568e+08	valid_1's l2: 3.98807e+08
[45900]	training's l2: 1.9532e+08	valid_1's l2: 3.98797e+08
[46000]	training's l2: 1.95067e+08	valid_1's l2: 3.98788e+08
[46100]	training's l2: 1.9482e+08	valid_1's l2: 3.98766e+08
[46200]	training's l2: 1.9456e+08	valid_1's l2: 3.98754e+08
[46300]	training's l2: 1.94316e+08	valid_1's l2: 3.98741e+08
[46400]	training's l2: 1.9406

[58300]	training's l2: 1.67085e+08	valid_1's l2: 3.97191e+08
[58400]	training's l2: 1.66876e+08	valid_1's l2: 3.97193e+08
[58500]	training's l2: 1.6667e+08	valid_1's l2: 3.97186e+08
[58600]	training's l2: 1.66473e+08	valid_1's l2: 3.97183e+08
[58700]	training's l2: 1.6627e+08	valid_1's l2: 3.97175e+08
[58800]	training's l2: 1.6606e+08	valid_1's l2: 3.97161e+08
[58900]	training's l2: 1.65851e+08	valid_1's l2: 3.97155e+08
[59000]	training's l2: 1.65648e+08	valid_1's l2: 3.97155e+08
[59100]	training's l2: 1.65441e+08	valid_1's l2: 3.97146e+08
[59200]	training's l2: 1.65239e+08	valid_1's l2: 3.97134e+08
[59300]	training's l2: 1.65036e+08	valid_1's l2: 3.97121e+08
[59400]	training's l2: 1.64831e+08	valid_1's l2: 3.9712e+08
[59500]	training's l2: 1.64623e+08	valid_1's l2: 3.97111e+08
[59600]	training's l2: 1.6442e+08	valid_1's l2: 3.97098e+08
[59700]	training's l2: 1.64209e+08	valid_1's l2: 3.97078e+08
[59800]	training's l2: 1.64014e+08	valid_1's l2: 3.97071e+08
[59900]	training's l2: 1.6382

[71800]	training's l2: 1.42018e+08	valid_1's l2: 3.96208e+08
[71900]	training's l2: 1.41853e+08	valid_1's l2: 3.96202e+08
[72000]	training's l2: 1.41687e+08	valid_1's l2: 3.96189e+08
[72100]	training's l2: 1.41523e+08	valid_1's l2: 3.96181e+08
[72200]	training's l2: 1.41359e+08	valid_1's l2: 3.96163e+08
[72300]	training's l2: 1.412e+08	valid_1's l2: 3.9616e+08
[72400]	training's l2: 1.41028e+08	valid_1's l2: 3.9615e+08
[72500]	training's l2: 1.40862e+08	valid_1's l2: 3.96144e+08
[72600]	training's l2: 1.40686e+08	valid_1's l2: 3.96136e+08
[72700]	training's l2: 1.40511e+08	valid_1's l2: 3.96122e+08
[72800]	training's l2: 1.40345e+08	valid_1's l2: 3.96119e+08
[72900]	training's l2: 1.40181e+08	valid_1's l2: 3.96119e+08
[73000]	training's l2: 1.40009e+08	valid_1's l2: 3.96117e+08
[73100]	training's l2: 1.39845e+08	valid_1's l2: 3.96113e+08
[73200]	training's l2: 1.39685e+08	valid_1's l2: 3.96108e+08
[73300]	training's l2: 1.39524e+08	valid_1's l2: 3.96103e+08
[73400]	training's l2: 1.393

[85300]	training's l2: 1.21282e+08	valid_1's l2: 3.95577e+08
[85400]	training's l2: 1.21139e+08	valid_1's l2: 3.95573e+08
[85500]	training's l2: 1.20997e+08	valid_1's l2: 3.95566e+08
[85600]	training's l2: 1.20854e+08	valid_1's l2: 3.95565e+08
[85700]	training's l2: 1.20714e+08	valid_1's l2: 3.95558e+08
[85800]	training's l2: 1.20574e+08	valid_1's l2: 3.95552e+08
[85900]	training's l2: 1.2044e+08	valid_1's l2: 3.95548e+08
[86000]	training's l2: 1.20305e+08	valid_1's l2: 3.9555e+08
[86100]	training's l2: 1.20171e+08	valid_1's l2: 3.95547e+08
[86200]	training's l2: 1.20039e+08	valid_1's l2: 3.95547e+08
[86300]	training's l2: 1.19899e+08	valid_1's l2: 3.95543e+08
[86400]	training's l2: 1.19756e+08	valid_1's l2: 3.95543e+08
[86500]	training's l2: 1.19621e+08	valid_1's l2: 3.95535e+08
[86600]	training's l2: 1.19486e+08	valid_1's l2: 3.9553e+08
[86700]	training's l2: 1.19348e+08	valid_1's l2: 3.95527e+08
[86800]	training's l2: 1.19208e+08	valid_1's l2: 3.95519e+08
[86900]	training's l2: 1.19

[98800]	training's l2: 1.03883e+08	valid_1's l2: 3.95006e+08
[98900]	training's l2: 1.03768e+08	valid_1's l2: 3.95004e+08
[99000]	training's l2: 1.03645e+08	valid_1's l2: 3.95008e+08
[99100]	training's l2: 1.03518e+08	valid_1's l2: 3.95007e+08
[99200]	training's l2: 1.03405e+08	valid_1's l2: 3.95e+08
[99300]	training's l2: 1.0328e+08	valid_1's l2: 3.95002e+08
[99400]	training's l2: 1.0316e+08	valid_1's l2: 3.94994e+08
[99500]	training's l2: 1.03037e+08	valid_1's l2: 3.94991e+08
[99600]	training's l2: 1.02919e+08	valid_1's l2: 3.94992e+08
[99700]	training's l2: 1.02804e+08	valid_1's l2: 3.94988e+08
[99800]	training's l2: 1.02686e+08	valid_1's l2: 3.9498e+08
[99900]	training's l2: 1.02576e+08	valid_1's l2: 3.94986e+08
[100000]	training's l2: 1.02455e+08	valid_1's l2: 3.94981e+08
[100100]	training's l2: 1.02336e+08	valid_1's l2: 3.94975e+08
[100200]	training's l2: 1.02219e+08	valid_1's l2: 3.94969e+08
[100300]	training's l2: 1.02106e+08	valid_1's l2: 3.94968e+08
[100400]	training's l2: 1.

[112100]	training's l2: 8.96239e+07	valid_1's l2: 3.94596e+08
[112200]	training's l2: 8.95258e+07	valid_1's l2: 3.94591e+08
[112300]	training's l2: 8.94278e+07	valid_1's l2: 3.94592e+08
[112400]	training's l2: 8.93335e+07	valid_1's l2: 3.94588e+08
[112500]	training's l2: 8.9233e+07	valid_1's l2: 3.94587e+08
[112600]	training's l2: 8.91365e+07	valid_1's l2: 3.94585e+08
[112700]	training's l2: 8.90337e+07	valid_1's l2: 3.94581e+08
[112800]	training's l2: 8.89353e+07	valid_1's l2: 3.94582e+08
[112900]	training's l2: 8.88361e+07	valid_1's l2: 3.94579e+08
[113000]	training's l2: 8.8742e+07	valid_1's l2: 3.94576e+08
[113100]	training's l2: 8.86459e+07	valid_1's l2: 3.94574e+08
[113200]	training's l2: 8.85485e+07	valid_1's l2: 3.94574e+08
[113300]	training's l2: 8.84526e+07	valid_1's l2: 3.94573e+08
[113400]	training's l2: 8.83541e+07	valid_1's l2: 3.9457e+08
[113500]	training's l2: 8.82571e+07	valid_1's l2: 3.94567e+08
[113600]	training's l2: 8.81625e+07	valid_1's l2: 3.94566e+08
[113700]	tr

[125400]	training's l2: 7.74618e+07	valid_1's l2: 3.94218e+08
[125500]	training's l2: 7.73761e+07	valid_1's l2: 3.94214e+08
[125600]	training's l2: 7.72912e+07	valid_1's l2: 3.94212e+08
[125700]	training's l2: 7.72044e+07	valid_1's l2: 3.94204e+08
[125800]	training's l2: 7.71239e+07	valid_1's l2: 3.94203e+08
[125900]	training's l2: 7.70401e+07	valid_1's l2: 3.94202e+08
[126000]	training's l2: 7.69556e+07	valid_1's l2: 3.94205e+08
[126100]	training's l2: 7.68724e+07	valid_1's l2: 3.94206e+08
[126200]	training's l2: 7.67859e+07	valid_1's l2: 3.94208e+08
[126300]	training's l2: 7.67005e+07	valid_1's l2: 3.94206e+08
[126400]	training's l2: 7.6619e+07	valid_1's l2: 3.94206e+08
[126500]	training's l2: 7.65307e+07	valid_1's l2: 3.94203e+08
[126600]	training's l2: 7.64453e+07	valid_1's l2: 3.94204e+08
[126700]	training's l2: 7.6357e+07	valid_1's l2: 3.94199e+08
[126800]	training's l2: 7.62707e+07	valid_1's l2: 3.94199e+08
[126900]	training's l2: 7.61836e+07	valid_1's l2: 3.94193e+08
[127000]	t

[9600]	training's l2: 3.47213e+08	valid_1's l2: 4.45802e+08
[9700]	training's l2: 3.4623e+08	valid_1's l2: 4.45446e+08
[9800]	training's l2: 3.45269e+08	valid_1's l2: 4.45095e+08
[9900]	training's l2: 3.44293e+08	valid_1's l2: 4.44733e+08
[10000]	training's l2: 3.43329e+08	valid_1's l2: 4.44429e+08
[10100]	training's l2: 3.4237e+08	valid_1's l2: 4.4411e+08
[10200]	training's l2: 3.41437e+08	valid_1's l2: 4.43765e+08
[10300]	training's l2: 3.40521e+08	valid_1's l2: 4.43449e+08
[10400]	training's l2: 3.39595e+08	valid_1's l2: 4.43158e+08
[10500]	training's l2: 3.38696e+08	valid_1's l2: 4.42883e+08
[10600]	training's l2: 3.37786e+08	valid_1's l2: 4.42583e+08
[10700]	training's l2: 3.36897e+08	valid_1's l2: 4.42317e+08
[10800]	training's l2: 3.35997e+08	valid_1's l2: 4.42048e+08
[10900]	training's l2: 3.35114e+08	valid_1's l2: 4.4176e+08
[11000]	training's l2: 3.34225e+08	valid_1's l2: 4.41493e+08
[11100]	training's l2: 3.33369e+08	valid_1's l2: 4.41216e+08
[11200]	training's l2: 3.32517e+

[23100]	training's l2: 2.63775e+08	valid_1's l2: 4.28179e+08
[23200]	training's l2: 2.63337e+08	valid_1's l2: 4.28123e+08
[23300]	training's l2: 2.62909e+08	valid_1's l2: 4.2807e+08
[23400]	training's l2: 2.62493e+08	valid_1's l2: 4.28034e+08
[23500]	training's l2: 2.62068e+08	valid_1's l2: 4.27999e+08
[23600]	training's l2: 2.61652e+08	valid_1's l2: 4.27951e+08
[23700]	training's l2: 2.61237e+08	valid_1's l2: 4.27911e+08
[23800]	training's l2: 2.60815e+08	valid_1's l2: 4.27849e+08
[23900]	training's l2: 2.6041e+08	valid_1's l2: 4.2782e+08
[24000]	training's l2: 2.59984e+08	valid_1's l2: 4.27763e+08
[24100]	training's l2: 2.59577e+08	valid_1's l2: 4.27714e+08
[24200]	training's l2: 2.59175e+08	valid_1's l2: 4.27656e+08
[24300]	training's l2: 2.58774e+08	valid_1's l2: 4.27599e+08
[24400]	training's l2: 2.58369e+08	valid_1's l2: 4.27559e+08
[24500]	training's l2: 2.57959e+08	valid_1's l2: 4.27502e+08
[24600]	training's l2: 2.57543e+08	valid_1's l2: 4.27461e+08
[24700]	training's l2: 2.57

[36600]	training's l2: 2.15906e+08	valid_1's l2: 4.23913e+08
[36700]	training's l2: 2.15615e+08	valid_1's l2: 4.23895e+08
[36800]	training's l2: 2.15315e+08	valid_1's l2: 4.23884e+08
[36900]	training's l2: 2.15029e+08	valid_1's l2: 4.23869e+08
[37000]	training's l2: 2.14731e+08	valid_1's l2: 4.23848e+08
[37100]	training's l2: 2.14439e+08	valid_1's l2: 4.23822e+08
[37200]	training's l2: 2.1415e+08	valid_1's l2: 4.23807e+08
[37300]	training's l2: 2.13848e+08	valid_1's l2: 4.23796e+08
[37400]	training's l2: 2.13549e+08	valid_1's l2: 4.23784e+08
[37500]	training's l2: 2.13259e+08	valid_1's l2: 4.23754e+08
[37600]	training's l2: 2.12976e+08	valid_1's l2: 4.23737e+08
[37700]	training's l2: 2.12677e+08	valid_1's l2: 4.23712e+08
[37800]	training's l2: 2.1239e+08	valid_1's l2: 4.23697e+08
[37900]	training's l2: 2.12108e+08	valid_1's l2: 4.23681e+08
[38000]	training's l2: 2.11823e+08	valid_1's l2: 4.23656e+08
[38100]	training's l2: 2.11537e+08	valid_1's l2: 4.23632e+08
[38200]	training's l2: 2.1

[50100]	training's l2: 1.80748e+08	valid_1's l2: 4.21798e+08
[50200]	training's l2: 1.80516e+08	valid_1's l2: 4.21785e+08
[50300]	training's l2: 1.80286e+08	valid_1's l2: 4.21775e+08
[50400]	training's l2: 1.80058e+08	valid_1's l2: 4.21772e+08
[50500]	training's l2: 1.79838e+08	valid_1's l2: 4.21752e+08
[50600]	training's l2: 1.79614e+08	valid_1's l2: 4.21754e+08
[50700]	training's l2: 1.79391e+08	valid_1's l2: 4.21744e+08
[50800]	training's l2: 1.79167e+08	valid_1's l2: 4.2173e+08
[50900]	training's l2: 1.78939e+08	valid_1's l2: 4.21707e+08
[51000]	training's l2: 1.78717e+08	valid_1's l2: 4.21701e+08
[51100]	training's l2: 1.78487e+08	valid_1's l2: 4.21683e+08
[51200]	training's l2: 1.7827e+08	valid_1's l2: 4.21677e+08
[51300]	training's l2: 1.78051e+08	valid_1's l2: 4.21675e+08
[51400]	training's l2: 1.77837e+08	valid_1's l2: 4.21665e+08
[51500]	training's l2: 1.77624e+08	valid_1's l2: 4.21656e+08
[51600]	training's l2: 1.77401e+08	valid_1's l2: 4.21648e+08
[51700]	training's l2: 1.7

[63600]	training's l2: 1.53053e+08	valid_1's l2: 4.20429e+08
[63700]	training's l2: 1.52873e+08	valid_1's l2: 4.20427e+08
[63800]	training's l2: 1.52689e+08	valid_1's l2: 4.20423e+08
[63900]	training's l2: 1.5251e+08	valid_1's l2: 4.20422e+08
[64000]	training's l2: 1.52328e+08	valid_1's l2: 4.20416e+08
[64100]	training's l2: 1.52147e+08	valid_1's l2: 4.20412e+08
[64200]	training's l2: 1.51965e+08	valid_1's l2: 4.20396e+08
[64300]	training's l2: 1.51776e+08	valid_1's l2: 4.20388e+08
[64400]	training's l2: 1.51592e+08	valid_1's l2: 4.20384e+08
[64500]	training's l2: 1.51418e+08	valid_1's l2: 4.20371e+08
[64600]	training's l2: 1.51229e+08	valid_1's l2: 4.2037e+08
[64700]	training's l2: 1.5105e+08	valid_1's l2: 4.20361e+08
[64800]	training's l2: 1.50873e+08	valid_1's l2: 4.20352e+08
[64900]	training's l2: 1.5069e+08	valid_1's l2: 4.20339e+08
[65000]	training's l2: 1.50508e+08	valid_1's l2: 4.20328e+08
[65100]	training's l2: 1.50337e+08	valid_1's l2: 4.20328e+08
[65200]	training's l2: 1.501

[77100]	training's l2: 1.30716e+08	valid_1's l2: 4.19445e+08
[77200]	training's l2: 1.3056e+08	valid_1's l2: 4.19438e+08
[77300]	training's l2: 1.30404e+08	valid_1's l2: 4.19429e+08
[77400]	training's l2: 1.30258e+08	valid_1's l2: 4.19417e+08
[77500]	training's l2: 1.30116e+08	valid_1's l2: 4.19413e+08
[77600]	training's l2: 1.29968e+08	valid_1's l2: 4.19397e+08
[77700]	training's l2: 1.29814e+08	valid_1's l2: 4.19394e+08
[77800]	training's l2: 1.29667e+08	valid_1's l2: 4.19391e+08
[77900]	training's l2: 1.2952e+08	valid_1's l2: 4.19387e+08
[78000]	training's l2: 1.29368e+08	valid_1's l2: 4.19377e+08
[78100]	training's l2: 1.29217e+08	valid_1's l2: 4.19366e+08
[78200]	training's l2: 1.29074e+08	valid_1's l2: 4.19357e+08
[78300]	training's l2: 1.28926e+08	valid_1's l2: 4.19351e+08
[78400]	training's l2: 1.2878e+08	valid_1's l2: 4.19348e+08
[78500]	training's l2: 1.28635e+08	valid_1's l2: 4.19348e+08
[78600]	training's l2: 1.28489e+08	valid_1's l2: 4.19335e+08
[78700]	training's l2: 1.28

[90600]	training's l2: 1.11872e+08	valid_1's l2: 4.18731e+08
[90700]	training's l2: 1.11744e+08	valid_1's l2: 4.18729e+08
[90800]	training's l2: 1.11623e+08	valid_1's l2: 4.18725e+08
[90900]	training's l2: 1.11497e+08	valid_1's l2: 4.18721e+08
[91000]	training's l2: 1.1137e+08	valid_1's l2: 4.18715e+08
[91100]	training's l2: 1.11241e+08	valid_1's l2: 4.18719e+08
[91200]	training's l2: 1.11114e+08	valid_1's l2: 4.1872e+08
[91300]	training's l2: 1.10988e+08	valid_1's l2: 4.18717e+08
[91400]	training's l2: 1.10865e+08	valid_1's l2: 4.18718e+08
[91500]	training's l2: 1.10739e+08	valid_1's l2: 4.18712e+08
[91600]	training's l2: 1.10616e+08	valid_1's l2: 4.18711e+08
[91700]	training's l2: 1.10489e+08	valid_1's l2: 4.18702e+08
[91800]	training's l2: 1.10362e+08	valid_1's l2: 4.18699e+08
[91900]	training's l2: 1.10231e+08	valid_1's l2: 4.18691e+08
[92000]	training's l2: 1.10109e+08	valid_1's l2: 4.18686e+08
[92100]	training's l2: 1.09979e+08	valid_1's l2: 4.18673e+08
[92200]	training's l2: 1.0

[104000]	training's l2: 9.60237e+07	valid_1's l2: 4.18259e+08
[104100]	training's l2: 9.59253e+07	valid_1's l2: 4.1826e+08
[104200]	training's l2: 9.58237e+07	valid_1's l2: 4.18256e+08
[104300]	training's l2: 9.57223e+07	valid_1's l2: 4.1825e+08
[104400]	training's l2: 9.56148e+07	valid_1's l2: 4.18249e+08
[104500]	training's l2: 9.5507e+07	valid_1's l2: 4.18243e+08
[104600]	training's l2: 9.53999e+07	valid_1's l2: 4.1824e+08
[104700]	training's l2: 9.52936e+07	valid_1's l2: 4.18234e+08
[104800]	training's l2: 9.51824e+07	valid_1's l2: 4.1823e+08
[104900]	training's l2: 9.50817e+07	valid_1's l2: 4.18224e+08
[105000]	training's l2: 9.4978e+07	valid_1's l2: 4.18221e+08
[105100]	training's l2: 9.48757e+07	valid_1's l2: 4.1822e+08
[105200]	training's l2: 9.47694e+07	valid_1's l2: 4.18216e+08
[105300]	training's l2: 9.46626e+07	valid_1's l2: 4.18207e+08
[105400]	training's l2: 9.456e+07	valid_1's l2: 4.18201e+08
[105500]	training's l2: 9.44585e+07	valid_1's l2: 4.18197e+08
[105600]	training

[117300]	training's l2: 8.26944e+07	valid_1's l2: 4.17885e+08
[117400]	training's l2: 8.26007e+07	valid_1's l2: 4.17884e+08
[117500]	training's l2: 8.25103e+07	valid_1's l2: 4.1788e+08
[117600]	training's l2: 8.24167e+07	valid_1's l2: 4.17879e+08
[117700]	training's l2: 8.23265e+07	valid_1's l2: 4.17882e+08
[117800]	training's l2: 8.2232e+07	valid_1's l2: 4.1788e+08
[117900]	training's l2: 8.21317e+07	valid_1's l2: 4.17863e+08
[118000]	training's l2: 8.2034e+07	valid_1's l2: 4.17859e+08
[118100]	training's l2: 8.19358e+07	valid_1's l2: 4.17853e+08
[118200]	training's l2: 8.18412e+07	valid_1's l2: 4.17847e+08
[118300]	training's l2: 8.17509e+07	valid_1's l2: 4.17853e+08
[118400]	training's l2: 8.16631e+07	valid_1's l2: 4.17848e+08
[118500]	training's l2: 8.15759e+07	valid_1's l2: 4.17847e+08
[118600]	training's l2: 8.14831e+07	valid_1's l2: 4.17848e+08
[118700]	training's l2: 8.13945e+07	valid_1's l2: 4.17848e+08
[118800]	training's l2: 8.13062e+07	valid_1's l2: 4.17847e+08
[118900]	tra

# 3. PREDICTING

In [110]:
def to_submit_file(file_name, values):
    submit = test_id.copy()
    submit['Salary'] = values
    submit.loc[submit['Salary'] < 0] = 0
    submit.to_csv(file_name, index = False)
    print ('Done writting submit file: ', file_name)

In [111]:
test_id = pd.read_csv('data_all/submit.csv')

In [112]:
y_pred = preds

In [113]:
to_submit_file('submitfile.csv',y_pred)

Done writting submit file:  submitfile.csv
