In [2]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore")

In [3]:
df_train=pd.read_csv('train.csv')
print(df_train.shape)
print(df_train.head())

(4209, 378)
   ID       y  X0 X1  X2 X3 X4 X5 X6 X8  ...  X375  X376  X377  X378  X379  \
0   0  130.81   k  v  at  a  d  u  j  o  ...     0     0     1     0     0   
1   6   88.53   k  t  av  e  d  y  l  o  ...     1     0     0     0     0   
2   7   76.26  az  w   n  c  d  x  j  x  ...     0     0     0     0     0   
3   9   80.62  az  t   n  f  d  x  l  e  ...     0     0     0     0     0   
4  13   78.02  az  v   n  f  d  h  d  n  ...     0     0     0     0     0   

   X380  X382  X383  X384  X385  
0     0     0     0     0     0  
1     0     0     0     0     0  
2     0     1     0     0     0  
3     0     0     0     0     0  
4     0     0     0     0     0  

[5 rows x 378 columns]


In [4]:
y_train = df_train['y'].values
cols = [c for c in df_train.columns if 'X' in c]
print('Number of features: {}'.format(len(cols)))
print('Feature types:')
df_train[cols].dtypes.value_counts()

Number of features: 376
Feature types:


int64     368
object      8
dtype: int64

In [5]:
counts = [[], [], []]
for c in cols:
    typ = df_train[c].dtype
    uniq = len(np.unique(df_train[c]))
    if uniq == 1:
        counts[0].append(c)
    elif uniq == 2 and typ == np.int64:
        counts[1].append(c)
    else:
        counts[2].append(c)
print('Constant features: {} Binary features: {} Categorical features: {}\n'.format(*[len(c) for c in counts]))
print('Constant features:', counts[0])
print('Categorical features:', counts[2])

Constant features: 12 Binary features: 356 Categorical features: 8

Constant features: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
Categorical features: ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


In [6]:
df_test = pd.read_csv('test.csv')
usable_columns = list(set(df_train.columns) -set(['ID', 'y']))
y_train = df_train['y'].values
id_test = df_test['ID'].values
x_train = df_train[usable_columns]
x_test = df_test[usable_columns]

In [7]:
# Step7: Check for null and unique values for test and train sets
def check_missing_values(df):
    if df.isnull().any().any():
        print("There are missing values in the dataframe")
    else:
        print("There are no missing values in the dataframe")
check_missing_values(x_train)
check_missing_values(x_test)

There are no missing values in the dataframe
There are no missing values in the dataframe


In [8]:
# Step8: If for any column(s), the variance is equal to zero,then you need to remove those variable(s)and Apply label encoder
for column in usable_columns:
    cardinality = len(np.unique(x_train[column]))
    if cardinality == 1:
        x_train.drop(column, axis=1) # Column with only one, value is useless so we drop it
        x_test.drop(column, axis=1)
    if cardinality > 2: # Column is categorical
        mapper = lambda x: sum([ord(digit) for digit in x])
        x_train[column] = x_train[column].apply(mapper)
        x_test[column] = x_test[column].apply(mapper)
x_train.head()

Unnamed: 0,X192,X340,X164,X71,X227,X257,X290,X200,X270,X77,...,X3,X137,X27,X201,X183,X285,X271,X274,X369,X90
0,0,0,0,0,0,0,0,0,0,0,...,97,1,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,101,0,1,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,99,1,1,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,0,0,...,102,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,102,0,1,0,0,0,0,0,0,0


In [9]:
# Step9: Make sure the data is now changed into numericals
print('Feature types:')
x_train[cols].dtypes.value_counts()

Feature types:


int64    376
dtype: int64

In [10]:
# Step10: Perform dimensionality reduction
n_comp = 12
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(x_train)
pca2_results_test = pca.transform(x_test)


In [11]:
# Step11: Training using xgboost
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(pca2_results_train, y_train, test_size=0.2, random_state=42)
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
#d_test = xgb.DMatrix(x_test)
d_test = xgb.DMatrix(pca2_results_test)
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['max_depth'] = 4
def xgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, feval=xgb_r2_score, maximize=True, verbose_eval=10)

[0]	train-rmse:98.99695	train-r2:-59.49733	valid-rmse:98.88884	valid-r2:-61.82690
[10]	train-rmse:81.14410	train-r2:-39.64492	valid-rmse:81.07848	valid-r2:-41.23399
[20]	train-rmse:66.59753	train-r2:-26.37844	valid-rmse:66.55611	valid-r2:-27.45948
[30]	train-rmse:54.75785	train-r2:-17.50910	valid-rmse:54.73342	valid-r2:-18.24670
[40]	train-rmse:45.14008	train-r2:-11.57816	valid-rmse:45.13793	valid-r2:-12.08984
[50]	train-rmse:37.34669	train-r2:-7.60987	valid-rmse:37.35501	valid-r2:-7.96497
[60]	train-rmse:31.04961	train-r2:-4.95120	valid-rmse:31.08431	valid-r2:-5.20774
[70]	train-rmse:25.98398	train-r2:-3.16777	valid-rmse:26.03102	valid-r2:-3.35345
[80]	train-rmse:21.93871	train-r2:-1.97108	valid-rmse:21.99811	valid-r2:-2.10901
[90]	train-rmse:18.73534	train-r2:-1.16678	valid-rmse:18.81639	valid-r2:-1.27470
[100]	train-rmse:16.22450	train-r2:-0.62493	valid-rmse:16.33550	valid-r2:-0.71442
[110]	train-rmse:14.28491	train-r2:-0.25964	valid-rmse:14.42725	valid-r2:-0.33727
[120]	train-rmse:

In [12]:
# Step12: Predict your test_df values using xgboost
p_test = clf.predict(d_test)
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = p_test
sub.to_csv('xgb.csv', index=False)
sub.head()

Unnamed: 0,ID,y
0,1,79.712715
1,2,95.915001
2,3,81.350365
3,4,77.470039
4,5,109.934296
