In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
pd.set_option('display.max_columns', None)

In [2]:
def _encode_ordinal(values, mapping, column):
    """Translate the categories in ``values`` to integers using ``mapping``.

    Raises ``ValueError`` naming the offending categories when a value has no
    entry in ``mapping``.
    """
    codes = values.map(mapping)
    unmapped = codes.isna()
    if unmapped.any():
        unknown = sorted(set(values[unmapped].astype(str)))
        raise ValueError(
            "Unexpected categories in '{}': {}".format(column, unknown))
    return codes.astype(int)


def _experience_to_int(experience):
    """Convert the raw ``experience`` column to integers.

    '<1' becomes 0, '>20' becomes 25, missing values become -99 and every
    other entry is parsed as a number.
    """
    recode = {'<1': 0, '>20': 25}
    # Work on object dtype so integers can sit next to the remaining strings.
    recoded = experience.astype(object).map(lambda value: recode.get(value, value))
    return pd.to_numeric(recoded).fillna(-99).astype(int)


def preprocessing(df, train_path='train_jqd04QH.csv', test_path='test_GYi4Gz5.csv'):
    """Clean and encode one raw enrollee frame (train or test).

    ``df`` is modified in place and also returned.  The steps are:

    * drop ``enrollee_id``;
    * replace missing categorical values with the string 'missing';
    * strip the first five characters of ``city`` and cast the rest to int;
    * map ``education_level``, ``company_size``, ``last_new_job`` and
      ``enrolled_university`` to integer codes (-99 for 'missing'), stored in
      ``<name>_val`` columns that replace the originals;
    * number the sorted unique values of ``relevent_experience`` from 0;
    * replace ``company_type``, ``gender`` and ``major_discipline`` by their
      relative frequency in the train and test CSVs taken together;
    * convert ``experience`` to int ('<1' -> 0, '>20' -> 25, missing -> -99);
    * add ``avg_city_development_index_wrt_experience``: for each experience
      value, the sum of ``city_development_index`` over the combined CSVs
      divided by the number of rows with that experience.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with the columns listed above.
    train_path, test_path : str
        CSV files that together form the reference population for the
        frequency and average encodings.  The defaults are the file names the
        notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.

    Raises
    ------
    ValueError
        If an ordinal column holds a category missing from its mapping.
    """
    df.drop('enrollee_id', axis=1, inplace=True)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the attribute: the chained in-place form does not update ``df`` when
    # pandas Copy-on-Write is active.
    for column in ('company_type', 'company_size', 'gender', 'major_discipline',
                   'education_level', 'last_new_job', 'enrolled_university'):
        df[column] = df[column].fillna('missing')

    # PROCESSING CITY COL: keep only what follows the first five characters.
    df['city'] = df['city'].str[5:].astype(int)

    # LABEL ENCODING ORDINAL FEATURES
    ordinal_mappings = (
        ('education_level', {'Primary School': 1, 'High School': 2, 'Graduate': 3,
                             'Masters': 4, 'Phd': 5, 'missing': -99}),
        ('company_size', {'<10': 1, '10/49': 2, '50-99': 3, '100-500': 4,
                          '500-999': 5, '1000-4999': 6, '5000-9999': 7,
                          '10000+': 8, 'missing': -99}),
        ('last_new_job', {'never': 0, '1': 1, '2': 2, '3': 3, '4': 4, '>4': 5,
                          'missing': -99}),
        ('enrolled_university', {'no_enrollment': 1, 'Part time course': 2,
                                 'Full time course': 3, 'missing': -99}),
    )
    for column, mapping in ordinal_mappings:
        df[column + '_val'] = _encode_ordinal(df[column], mapping, column)
        df.drop(column, axis=1, inplace=True)

    # LABEL ENCODING BINARY FEATURE
    # Codes follow the sorted order of the values present in *this* frame.
    levels = sorted(df['relevent_experience'].unique())
    level_codes = {level: code for code, level in enumerate(levels)}
    df['relevent_experience_val'] = df['relevent_experience'].map(level_codes).astype(int)
    df.drop('relevent_experience', axis=1, inplace=True)

    # Reference population for the encodings below.  Read once; pd.concat
    # replaces DataFrame.append, which pandas 2.0 removed.
    total = pd.concat([pd.read_csv(train_path), pd.read_csv(test_path)],
                      ignore_index=True)

    # FREQUENCY ENCODING NOMINAL CATEGORICAL FEATURES
    frequency_columns = (
        ('company_type', 'company_type_freq_enc'),
        ('gender', 'gender_freq_enc'),
        ('major_discipline', 'major_discipline_enc'),
    )
    for column, encoded_name in frequency_columns:
        frequency = total[column].fillna('missing').value_counts() / len(total)
        df[encoded_name] = df[column].map(frequency)
        df.drop(column, axis=1, inplace=True)

    # CLEANING COL EXPERIENCE
    df['experience'] = _experience_to_int(df['experience'])

    # Per-experience average of city_development_index over the combined data.
    # The denominator is the row count (not the non-null count), matching the
    # sum/len arithmetic this replaces.
    reference = pd.DataFrame({
        'experience': _experience_to_int(total['experience']),
        'city_development_index': total['city_development_index'],
    })
    grouped = reference.groupby('experience')['city_development_index']
    average_index = grouped.sum() / grouped.size()
    df['avg_city_development_index_wrt_experience'] = df['experience'].map(average_index)

    return df
    

In [3]:
def _target_counts(train, column):
    """Cross-tabulate ``column`` against ``target``.

    The result always has a column for class 0 and class 1; a class that does
    not occur in ``train`` is reported with a count of zero instead of being
    absent.
    """
    counts = pd.crosstab(train[column], train['target'])
    return counts.reindex(columns=[0, 1], fill_value=0)


def FE_train(train):
    """Add target-derived features to the training frame.

    For each of ``experience``, ``city_development_index``, ``city``,
    ``company_size_val``, ``last_new_job_val``, ``major_discipline_enc`` and
    ``enrolled_university_val`` a column ``target_diff_<name>`` is added that
    holds, for the row's value of that feature, the number of rows with
    target 1 minus the number of rows with target 0.  A final column,
    ``target_ratio0_city_development_index``, holds the share of target-0 rows
    among the rows with the same ``city_development_index``.

    NOTE(review): the lookup tables are computed from, and applied to, the
    same rows, so these columns carry label information for the training set.

    Parameters
    ----------
    train : pandas.DataFrame
        Preprocessed training frame including the ``target`` column.  It is
        modified in place.

    Returns
    -------
    tuple
        ``(train, dic_all)`` where ``dic_all`` is the list of the eight
        value -> feature lookup dictionaries, in the order the columns are
        added (the order ``FE_test`` unpacks them in).
    """
    diff_columns = [
        'experience',
        'city_development_index',
        'city',
        'company_size_val',
        'last_new_job_val',
        'major_discipline_enc',
        'enrolled_university_val',
    ]

    dic_all = []
    for column in diff_columns:
        counts = _target_counts(train, column)
        difference = counts[1] - counts[0]          # want - notwant
        lookup = dict(zip(difference.index, difference))
        train['target_diff_' + column] = train[column].map(lookup).astype(int)
        dic_all.append(lookup)

    # city_development_index ratio: notwant / (want + notwant)
    counts = _target_counts(train, 'city_development_index')
    negative_share = counts[0] / (counts[1] + counts[0])
    ratio_lookup = dict(zip(negative_share.index, negative_share))
    train['target_ratio0_city_development_index'] = (
        train['city_development_index'].map(ratio_lookup))
    dic_all.append(ratio_lookup)

    return (train, dic_all)
    

In [4]:
def FE_test(test, dic_all, unseen_city_fill=103):
    """Apply the lookup tables learned by ``FE_train`` to the test frame.

    Parameters
    ----------
    test : pandas.DataFrame
        Preprocessed test frame.  It is modified in place.
    dic_all : sequence of dict
        The eight lookup dictionaries returned by ``FE_train``, in order:
        experience, city_development_index, city, company_size_val,
        last_new_job_val, major_discipline_enc, enrolled_university_val and
        the city_development_index target-0 ratio.
    unseen_city_fill : number, optional
        Value written to ``target_diff_city`` for cities missing from the
        city lookup.  Defaults to 103, the constant this function has always
        used.  NOTE(review): where 103 comes from is not visible in this
        notebook - confirm it is still the intended fallback.

    Returns
    -------
    pandas.DataFrame
        The same object as ``test`` with the eight feature columns added.

    Notes
    -----
    Values absent from a lookup are handled per column: ``experience`` raises
    on the integer cast, the two ``city_development_index`` features fall back
    to the column mean of the mapped values, ``city`` falls back to
    ``unseen_city_fill`` and the remaining columns are left as NaN.
    """
    (experience_map, cdi_diff_map, city_map, company_size_map,
     last_new_job_map, major_discipline_map, enrolled_university_map,
     cdi_ratio_map) = dic_all

    test['target_diff_experience'] = test['experience'].map(experience_map).astype(int)

    # Assign the filled Series back: fillna(inplace=True) on an attribute
    # access is a chained operation and does not update ``test`` when pandas
    # Copy-on-Write is active.
    cdi_diff = test['city_development_index'].map(cdi_diff_map)
    test['target_diff_city_development_index'] = cdi_diff.fillna(cdi_diff.mean())

    test['target_diff_city'] = test['city'].map(city_map).fillna(unseen_city_fill)

    test['target_diff_company_size_val'] = test['company_size_val'].map(company_size_map)

    test['target_diff_last_new_job_val'] = test['last_new_job_val'].map(last_new_job_map)

    test['target_diff_major_discipline_enc'] = (
        test['major_discipline_enc'].map(major_discipline_map))

    test['target_diff_enrolled_university_val'] = (
        test['enrolled_university_val'].map(enrolled_university_map))

    # only for RF
    cdi_ratio = test['city_development_index'].map(cdi_ratio_map)
    test['target_ratio0_city_development_index'] = cdi_ratio.fillna(cdi_ratio.mean())

    return test

In [5]:
# Load each raw CSV and pass it straight through the shared
# cleaning/encoding step; both frames end up with the same encoded columns
# (train additionally keeps `target`).
train = preprocessing(pd.read_csv('train_jqd04QH.csv'))
test = preprocessing(pd.read_csv('test_GYi4Gz5.csv'))

In [6]:
# Build the target-based features on the training frame and keep the lookup
# dictionaries that were used to create them.
train,dic_all=FE_train(train)
# Replay those same lookup dictionaries on the test frame.
test=FE_test(test,dic_all)

In [7]:
# Split the engineered training frame into the label vector `y` and the
# feature matrix `x`.
# `drop` without `inplace` returns a new frame, so `train` keeps its `target`
# column and this cell can be re-run without raising a KeyError.
y = train['target']
x = train.drop(['target'], axis=1)

In [8]:
%%time
# ENSEMBLING RF WITH BAGGING 
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor

# Seed-bagging: fit the same forest configuration `bags` times, changing only
# the random seed, and average the predictions on the test frame.
bags = 10
seed = 1

# The hyper-parameters are identical for every bag, so they are set once here;
# only `random_state` is updated inside the loop.
model = RandomForestRegressor(
    n_estimators=50,
    n_jobs=-1,
    max_features=2,
    min_samples_leaf=30,
    max_depth=10,
)

bagged_prediction = np.zeros(test.shape[0])
for n in range(bags):
    model.set_params(random_state=seed + n)
    model.fit(x, y)
    pred = model.predict(test)
    bagged_prediction += pred

# Turn the running sum into the mean over all bags.
bagged_prediction /= bags

Wall time: 6.44 s


In [9]:

# Load the sample submission and overwrite its `target` column with the
# averaged forest predictions.
# NOTE(review): the values are assigned by position, which assumes the sample
# submission lists enrollees in the same order as the test CSV - confirm.
submission = pd.read_csv("sample_submission_sxfcbdx.csv").assign(target=bagged_prediction)

# Save the finished submission as CSV, without the row index.
submission.to_csv('Verify_with_redownload_data_RF2.csv', index=False)

In [10]:
# Show the first rows of the submission frame as a quick visual check.
submission.head()

Unnamed: 0,enrollee_id,target
0,16548,0.316501
1,12036,0.073949
2,11061,0.29901
3,5032,0.062413
4,17599,0.097948
