In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression

import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline 

from sklearn.model_selection import train_test_split 
from catboost import CatBoostRegressor , Pool,metrics ,cv

In [2]:
train= pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('submission.csv')

In [3]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,52,Private,98926,HS-grad,9,Never-married,Other-service,Unmarried,White,Male,-3,0,44,United-States,<=50K
1,76,Self-emp-not-inc,132607,Bachelors,12,Married-civ-spouse,Sales,Husband,White,Male,7527,0,44,United-States,>50K
2,40,Private,243258,Bachelors,13,Married-civ-spouse,Transport-moving,Husband,Amer-Indian-Eskimo,Male,19,0,57,United-States,>50K
3,76,State-gov,181259,Bachelors,12,Married-civ-spouse,Prof-specialty,Husband,White,Male,7720,0,44,United-States,>50K
4,36,Self-emp-inc,115379,Masters,13,Divorced,Exec-managerial,Not-in-family,White,Male,12,0,58,United-States,<=50K


In [4]:
train.shape

(18944, 15)

In [5]:
train.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,18944.0,18944.0,18944.0,18944.0,18944.0,18944.0
mean,39.26647,164292.035156,9.479043,577.47651,33.082823,40.119299
std,13.395878,93425.986084,2.830047,2414.473217,248.521486,11.124649
min,14.0,8019.0,2.0,-58.0,-2.0,1.0
25%,28.0,98530.75,8.0,-5.0,0.0,35.0
50%,40.0,152895.5,9.0,5.0,0.0,39.0
75%,47.0,215541.0,12.0,17.0,0.0,44.0
max,99.0,772988.0,16.0,16193.0,2437.0,102.0


In [6]:
#checking for null values in our dataset train

train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [7]:
# separating into features and targets 
X= train.drop('hours-per-week',axis=1)
y= train['hours-per-week']

In [8]:
# categorical feature indices 
categorical_feature_indices = np.where(X.dtypes=='object')[0]
print(categorical_feature_indices)

[ 1  3  5  6  7  8  9 12 13]


In [9]:
#split our data into train and test datasets 
X_train,X_validation,y_train,y_validation = train_test_split(X,y,test_size=0.20,random_state=23)
X_test = test

In [10]:
# Model training  1st iteration 

model = CatBoostRegressor(
    loss_function = 'RMSE',
    random_seed=45,
    logging_level='Silent'
)

model.fit(
    X_train,
    y_train,
    cat_features= categorical_feature_indices,
    eval_set = (X_validation,y_validation),
    plot=True

)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x1abfe3db7c0>

In [11]:
cv_params = model.get_params()
print(cv_params)

{'loss_function': 'RMSE', 'random_seed': 45, 'logging_level': 'Silent'}


In [12]:
# attempting cross_validation 
cv_data = cv(
    Pool(X,y,cat_features=categorical_feature_indices),
    cv_params,
    nfold=3,
    iterations =1500,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [13]:
cv_data.head()

Unnamed: 0,iterations,test-RMSE-mean,test-RMSE-std,train-RMSE-mean,train-RMSE-std
0,0,40.472676,0.06373,40.471699,0.028517
1,1,39.349237,0.064035,39.347314,0.0246
2,2,38.258996,0.060141,38.25702,0.023042
3,3,37.204355,0.059453,37.201644,0.020356
4,4,36.183546,0.06027,36.180218,0.016791


In [14]:

print(
'Best validation RMSE score is {:.2f} ± {:.2f} on step {}'.format(
    np.max(cv_data['test-RMSE-mean']),
    cv_data['test-RMSE-std'][np.argmax(cv_data['test-RMSE-mean'])],
    np.argmax(cv_data['test-RMSE-mean'])
))

Best validation RMSE score is 40.47 ± 0.06 on step 0


In [15]:
print('Precise  RMSE score: {}'.format(np.max(cv_data['test-RMSE-mean'])))

Precise  RMSE score: 40.472675538110146


In [16]:
sub.head()

Unnamed: 0,hours-per-week
0,0
1,0
2,0
3,0
4,0


In [17]:
# Model applying 
predictions = model.predict(X_test)

In [19]:
submission = pd.DataFrame(data={
    'hours-per-week': predictions
})

submission.to_csv('submission.csv',index=False)
