# Step1: Import the required libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# Step2: Read the data from train.csv

In [6]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# let us understand the data
print('Size of training set: {} rows and {} columns'
      .format(*df_train.shape))

Size of training set: 4209 rows and 378 columns


# Step3: Read the test.csv data

In [7]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [8]:
# let us understand the data
print('Size of testing set: {} rows and {} columns'
      .format(*df_test.shape))

Size of testing set: 4209 rows and 377 columns


# Step4: Collect the Y values into an array

In [9]:
# seperate the y from the data as we will use this to learn as 
# the prediction output

y_train = df_train['y'].values

# Step5: Understand the data types we have

In [10]:
# iterate through all the columns which has X in the name of the column
cols = [c for c in df_train.columns if 'X' in c]
print('Number of features: {}'.format(len(cols)))

Number of features: 376


In [11]:
print('Feature types:')
df_train[cols].dtypes.value_counts()

Feature types:


int64     368
object      8
dtype: int64

# Step6: Count the data in each of the columns

In [13]:
counts = [[], [], []]
for c in cols:
    typ = df_train[c].dtype
    uniq = len(np.unique(df_train[c]))
    if uniq == 1:
        counts[0].append(c)
    elif uniq == 2 and typ == np.int64:
        counts[1].append(c)
    else:
        counts[2].append(c)

print('Constant features: {} Binary features: {} Categorical features: {}\n'
      .format(*[len(c) for c in counts]))
print('Constant features:', counts[0])
print('Categorical features:', counts[2])

Constant features: 12 Binary features: 356 Categorical features: 8

Constant features: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
Categorical features: ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


Remove columns ID and Y from the data as they are not used for learning

In [14]:
usable_columns = list(set(df_train.columns) - set(['ID', 'y']))
y_train = df_train['y'].values
id_test = df_test['ID'].values

x_train = df_train[usable_columns]
x_test = df_test[usable_columns]

# Step7: Check for null and unique values for test and train sets

In [15]:
def check_missing_values(df):
    if df.isnull().any().any():
        print("There are missing values in the dataframe")
    else:
        print("There are no missing values in the dataframe")
check_missing_values(x_train)
check_missing_values(x_test)

There are no missing values in the dataframe
There are no missing values in the dataframe


# Step8: If for any column(s), the variance is equal to zero, then you need to remove those variable(s).

Apply label encoder

In [16]:
for column in usable_columns:
    cardinality = len(np.unique(x_train[column]))
    if cardinality == 1:
        x_train.drop(column, axis=1) # Column with only one 
        # value is useless so we drop it
        x_test.drop(column, axis=1)
    if cardinality > 2: # Column is categorical
        mapper = lambda x: sum([ord(digit) for digit in x])
        x_train[column] = x_train[column].apply(mapper)
        x_test[column] = x_test[column].apply(mapper)
x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[column] = x_train[column].apply(mapper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[column] = x_test[column].apply(mapper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[column] = x_train[column].apply(mapper)
A value is trying to be set on a copy of a slice from a DataFra

Unnamed: 0,X62,X295,X141,X172,X56,X44,X291,X214,X153,X365,...,X337,X154,X116,X34,X77,X306,X368,X100,X255,X38
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Step9: Make sure the data is now changed into numericals

In [17]:
print('Feature types:')
x_train[cols].dtypes.value_counts()

Feature types:


int64    376
dtype: int64

# Step10: Perform dimensionality reduction

Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space.

In [18]:
n_comp = 12
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(x_train)
pca2_results_test = pca.transform(x_test)

# Step11: Training using XGBoost

In [19]:
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [20]:
x_train, x_valid, y_train, y_valid = train_test_split(
        pca2_results_train, 
        y_train, test_size=0.2, 
        random_state=4242)

In [21]:
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
d_test = xgb.DMatrix(pca2_results_test)

In [22]:
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['max_depth'] = 4

In [23]:
def xgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)

In [24]:
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [25]:
clf = xgb.train(params, d_train, 
                1000, watchlist, early_stopping_rounds=50, 
                feval=xgb_r2_score, maximize=True, verbose_eval=10)

[0]	train-rmse:99.14834	train-r2:-58.35295	valid-rmse:98.26297	valid-r2:-67.63754




[10]	train-rmse:81.27653	train-r2:-38.88428	valid-rmse:80.36433	valid-r2:-44.91014
[20]	train-rmse:66.71610	train-r2:-25.87403	valid-rmse:65.77334	valid-r2:-29.75260
[30]	train-rmse:54.86956	train-r2:-17.17751	valid-rmse:53.88963	valid-r2:-19.64393
[40]	train-rmse:45.24492	train-r2:-11.35979	valid-rmse:44.21995	valid-r2:-12.90012
[50]	train-rmse:37.44736	train-r2:-7.46669	valid-rmse:36.37245	valid-r2:-8.40431
[60]	train-rmse:31.14759	train-r2:-4.85761	valid-rmse:30.01883	valid-r2:-5.40574
[70]	train-rmse:26.08677	train-r2:-3.10877	valid-rmse:24.90901	valid-r2:-3.41057
[80]	train-rmse:22.04666	train-r2:-1.93465	valid-rmse:20.83109	valid-r2:-2.08465
[90]	train-rmse:18.84414	train-r2:-1.14400	valid-rmse:17.60480	valid-r2:-1.20315
[100]	train-rmse:16.34037	train-r2:-0.61211	valid-rmse:15.08374	valid-r2:-0.61733
[110]	train-rmse:14.40188	train-r2:-0.25230	valid-rmse:13.14901	valid-r2:-0.22905
[120]	train-rmse:12.92206	train-r2:-0.00817	valid-rmse:11.69021	valid-r2:0.02854
[130]	train-rmse:1

# Step12: Predict your test_df values using XGBoost.

In [26]:
p_test = clf.predict(d_test)

In [27]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = p_test
sub.to_csv('xgb.csv', index=False)

In [28]:
sub.head()

Unnamed: 0,ID,y
0,1,82.870979
1,2,97.369026
2,3,83.09156
3,4,77.119469
4,5,112.567146
