In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
train = pd.read_csv('/content/drive/MyDrive/Simplilearn_Assignments/ML/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Simplilearn_Assignments/ML/test.csv')

In [None]:
train.head()

In [None]:
train.describe()

In [6]:
# 2. Check for Null & Unique values in train & test sets

isNan = train.isnull().sum().any()
isDup = train.duplicated().sum().any()

if isNan == True:
    print("There are NaN values in the dataset")
else:
    print("There are No NaN values in the dataset")

if isDup == True:
    print("There are Duplicate rows in the dataset")
else:
    print("There are No Duplicate rows in the dataset")

There are No NaN values in the dataset
There are No Duplicate rows in the dataset


In [7]:
# Feature Analysis:
cols = [c for c in train.columns if 'X' in c]
print('Number of features: {}'.format(len(cols)))

print('Feature types:')
train[cols].dtypes.value_counts()

Number of features: 376
Feature types:


int64     368
object      8
dtype: int64

In [8]:
counts = [[], [], []]
for c in cols:
    typ = train[c].dtype
    uniq = len(np.unique(train[c]))
    if uniq == 1: counts[0].append(c)
    elif uniq == 2 and typ == np.int64: counts[1].append(c)
    else: counts[2].append(c)

print('Constant features: {}    Binary features: {}     Categorical features: {}\n'.format(*[len(c) for c in counts]))

print('Constant features:', counts[0])
print('Categorical features:', counts[2])
# There are 12 columns with 0 Variance and are redudant for model training so they need to be removed

Constant features: 12    Binary features: 356     Categorical features: 8

Constant features: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
Categorical features: ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


In [None]:
# Visualizations for Binary Features
import matplotlib.pyplot as plt
pal = sns.color_palette()

binary_means = [np.mean(train[c]) for c in counts[1]]
binary_names = np.array(counts[1])[np.argsort(binary_means)]
binary_means = np.sort(binary_means)

fig, ax = plt.subplots(1, 3, figsize=(12,30))
ax[0].set_ylabel('Feature name')
ax[1].set_title('Mean values of binary variables')
for i in range(3):
    names, means = binary_names[i*119:(i+1)*119], binary_means[i*119:(i+1)*119]
    ax[i].barh(range(len(means)), means, color=pal[2])
    ax[i].set_xlabel('Mean value')
    ax[i].set_yticks(range(len(means)))
    ax[i].set_yticklabels(names, rotation='horizontal')
plt.show()

In [None]:
for c in counts[2]:
    value_counts = train[c].value_counts()
    fig, ax = plt.subplots(figsize=(10, 5))
    plt.title('Categorical feature {} - Cardinality {}'.format(c, len(np.unique(train[c]))))
    plt.xlabel('Feature value')
    plt.ylabel('Occurences')
    plt.bar(range(len(value_counts)), value_counts.values, color=pal[1])
    ax.set_xticks(range(len(value_counts)))
    ax.set_xticklabels(value_counts.index, rotation='vertical')
    plt.show()

In [None]:
usable_columns = list(set(train.columns) - set(['ID', 'y']))

y_train = train['y'].values
id_test = test['ID'].values
x_train = train[usable_columns]
x_test = test[usable_columns]

for column in usable_columns:
    cardinality = len(np.unique(x_train[column]))
    # Feature Selection [ One of Dimension Reduction Technique ]
    if cardinality == 1:
        x_train.drop(column, axis=1) # Column with only one value is useless so we drop it
        x_test.drop(column, axis=1)
    # Label Encoding [ To easy process data ]
    if cardinality > 2: # Column is categorical
        mapper = lambda x: sum([ord(digit) for digit in x])
        x_train[column] = x_train[column].apply(mapper)
        x_test[column] = x_test[column].apply(mapper)
        
x_train.head(20)

In [16]:
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size = 0.25, random_state = 5)
std_scale = StandardScaler().fit(x_train)
 
x_train_std = std_scale.transform(x_train)
x_test_std = std_scale.transform(x_test)
x_val_std = std_scale.transform(x_valid)

d_train = xgb.DMatrix(x_train_std, label = y_train)
d_valid = xgb.DMatrix(x_val_std, label = y_valid)
d_test = xgb.DMatrix(x_test_std)

params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.026
params['max_depth'] = 6

def xgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

clf = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, feval=xgb_r2_score, maximize=True, verbose_eval=5)

[0]	train-rmse:98.6534	valid-rmse:98.3388	train-r2:-63.3827	valid-r2:-60.8201
Multiple eval metrics have been passed: 'valid-r2' will be used for early stopping.

Will train until valid-r2 hasn't improved in 50 rounds.
[5]	train-rmse:86.5946	valid-rmse:86.2948	train-r2:-48.6051	valid-r2:-46.6047
[10]	train-rmse:76.0317	valid-rmse:75.7449	train-r2:-37.2415	valid-r2:-35.6764
[15]	train-rmse:66.7823	valid-rmse:66.5066	train-r2:-28.5031	valid-r2:-27.2755
[20]	train-rmse:58.6864	valid-rmse:58.4204	train-r2:-21.7835	valid-r2:-20.8177
[25]	train-rmse:51.6041	valid-rmse:51.3465	train-r2:-16.6163	valid-r2:-15.854
[30]	train-rmse:45.4128	valid-rmse:45.1625	train-r2:-12.6428	valid-r2:-12.0388
[35]	train-rmse:40.0054	valid-rmse:39.7615	train-r2:-9.58723	valid-r2:-9.1066
[40]	train-rmse:35.288	valid-rmse:35.0497	train-r2:-7.23757	valid-r2:-6.85325
[45]	train-rmse:31.1787	valid-rmse:30.9454	train-r2:-5.43074	valid-r2:-5.12172
[50]	train-rmse:27.6057	valid-rmse:27.3769	train-r2:-4.0413	valid-r2:-3.79

In [13]:
p_test = clf.predict(d_test)

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = p_test
sub.to_csv('xgb.csv', index=False)

In [None]:
sub.head()