In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD

In [3]:
from sklearn.preprocessing import LabelEncoder
# Load the raw training and test tables from the working directory.
train = pd.read_csv("mertrain.csv")
test = pd.read_csv("mertest.csv")

# Integer-encode every object-dtype column. Each encoder is fitted on the
# train and test values together, so a category that occurs in only one of
# the two files still receives a code.
object_columns = [col for col in train.columns if train[col].dtype == 'object']
for col in object_columns:
    encoder = LabelEncoder()
    train_values = list(train[col].values)
    test_values = list(test[col].values)
    encoder.fit(train_values + test_values)
    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

# Report the table dimensions after encoding.
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

Shape train: (4209, 378)
Shape test: (4209, 377)


In [4]:
 n_comp = 320
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

In [5]:
# Inspect the projected training matrix (one row per sample, one column per
# retained component); the array repr is the cell output.
pca2_results_train

array([[ -4.20591806e+03,   3.74113607e-03,  -3.95586587e-02, ...,
          7.32289837e-15,  -1.25343517e-14,   2.16010548e-14],
       [ -4.19990961e+03,  -5.32835499e-02,   1.78049019e+00, ...,
          1.70867124e-13,   8.98967538e-13,  -2.44578876e-13],
       [ -4.19891168e+03,   1.64742769e+01,   1.38064828e+01, ...,
          3.16373645e-13,  -3.42292934e-13,   6.36730991e-13],
       ..., 
       [  4.20599752e+03,   3.08713176e+01,   1.60994784e+01, ...,
          3.39659353e-14,  -7.57125093e-15,   1.82220286e-13],
       [  4.20899773e+03,   2.54358916e+01,   2.87739952e+00, ...,
          9.09304388e-15,  -2.72912398e-13,  -1.42980857e-14],
       [  4.21099776e+03,  -1.90654871e+01,  -1.08794911e+01, ...,
          1.57660816e-14,   1.42139750e-14,   6.94884393e-14]])

In [6]:
# Target vector and its mean (the mean is used as the XGBoost base score).
y = train["y"].values
y_mean = y.mean()

# Test identifiers for the submission files, plus the raw (un-projected)
# test matrix.
id_test = test["ID"].values
X_test = test.values

In [7]:
from sklearn.model_selection import KFold
# 5-fold splitter with shuffling; the fixed seed keeps the folds reproducible.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)
# Number of folds the splitter will produce (shown as the cell output).
kf.get_n_splits(pca2_results_train)

5

In [8]:
# Single hold-out split: only the LAST of the five KFold partitions is used.
# A loop that reassigns these names on every fold ends in exactly this state,
# so the final fold is selected directly instead of slicing four folds whose
# results are immediately overwritten.
# NOTE: every score computed from X_valid / y_valid is therefore a one-split
# hold-out estimate, not a cross-validated one.
train_index, test_index = list(kf.split(pca2_results_train))[-1]
X_train, y_train = pca2_results_train[train_index], y[train_index]
X_valid, y_valid = pca2_results_train[test_index], y[test_index]

In [9]:
# Shallow random forest (100 trees, depth capped at 5) with a fixed seed,
# trained on the training fold. The fitted estimator is the cell output.
forest = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=0,
)
forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [10]:
from sklearn.metrics import r2_score
# R^2 of the forest on the held-out fold and on the fold it was fitted on.
forest_r2_valid = r2_score(y_valid, forest.predict(X_valid))
forest_r2_train = r2_score(y_train, forest.predict(X_train))
print("R^2 valid: {: }".format(forest_r2_valid))
print("R^2 train: {: }".format(forest_r2_train))

R^2 valid:  0.5153884375342284
R^2 train:  0.5600522270114556


In [11]:
# R^2 over ALL training rows, i.e. the training fold and the held-out fold
# together.
forest_pred_all = forest.predict(pca2_results_train)
print(r2_score(y, forest_pred_all))

0.55180961231


In [12]:
# Forest predictions for the PCA-projected test set.
y_pred = forest.predict(pca2_results_test)

In [13]:
# Two-column submission table (test ID, predicted target), written without
# the pandas index.
sub = pd.DataFrame({'ID': id_test, 'y': y_pred}, columns=['ID', 'y'])
sub.to_csv('forest-eliminate320.csv', index=False)


In [14]:
import xgboost as xgb

# Parameters handed to the XGBoost booster.
# The number of trees is deliberately NOT configured here: with the native
# API it is controlled by `num_boost_round`. ('n_trees' is not an XGBoost
# parameter, so listing it had no effect on training.)
xgb_params = {
    'eta': 0.005,                # learning rate
    'max_depth': 4,
    'subsample': 0.95,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean,        # base prediction = mean(target)
    'silent': 1
}

# DMatrices for XGBoost: training fold, held-out fold, and projected test set.
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_valid, y_valid)
dtest = xgb.DMatrix(pca2_results_test)

# Internal cross-validation on the training fold, used only to choose the
# number of boosting rounds (stops if the CV metric has not improved for
# 50 rounds).
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=500,  # increase to have better results (~700)
                   early_stopping_rounds=50,
                   verbose_eval=50,
                   show_stdv=False
                  )

# The CV history has one row per retained boosting round.
num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# Train the final booster on the training fold with that many rounds;
# `silent=0` re-enables library logging for this run.
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)



[0]	train-rmse:12.7448	test-rmse:12.7404
[50]	train-rmse:11.6024	test-rmse:11.7509
[100]	train-rmse:10.7695	test-rmse:11.0695
[150]	train-rmse:10.087	test-rmse:10.5525
[200]	train-rmse:9.51706	test-rmse:10.137
[250]	train-rmse:9.07776	test-rmse:9.85514
[300]	train-rmse:8.74722	test-rmse:9.66948
[350]	train-rmse:8.46847	test-rmse:9.55657
[400]	train-rmse:8.21994	test-rmse:9.47563
[450]	train-rmse:7.99696	test-rmse:9.41142
500


In [15]:
from sklearn.metrics import r2_score
# R^2 of the booster on the held-out fold and on the fold it was trained on.
xgb_r2_valid = r2_score(y_valid, model.predict(dvalid))
xgb_r2_train = r2_score(y_train, model.predict(dtrain))
print("R^2 valid: {: }".format(xgb_r2_valid))
print("R^2 train: {: }".format(xgb_r2_train))

R^2 valid:  0.5393131853561562
R^2 train:  0.5941373213718615


In [16]:
# Same training-fold R^2, but scored against the labels stored inside the
# DMatrix rather than against `y_train`.
dtrain_labels = dtrain.get_label()
print(r2_score(dtrain_labels, model.predict(dtrain)))

0.594137324516


In [17]:
# Booster predictions for the projected test set.
y_pred = model.predict(dtest)

# Submission table. The identifier column is named 'ID' (not 'id') so that
# this file has the same header as the forest, ridge and SVR submissions.
output = pd.DataFrame({'ID': test['ID'].astype(np.int32), 'y': y_pred})

# The tree depth is embedded in the file name.
output.to_csv('xgboost-depth{}-elimination321.csv'.format(xgb_params['max_depth']), index=False)


In [18]:
from sklearn.linear_model import Ridge

In [19]:
# Ridge regression with default settings, fitted on the training fold.
# `fit` has to be called on an instance: `Ridge.fit(X_train, y_train)` would
# bind X_train as `self` and fail with a TypeError for the missing `y`.
ridge = Ridge().fit(X_train, y_train)

In [20]:
from sklearn.metrics import r2_score
# R^2 of the ridge model on the held-out fold and on the fold it was fitted on.
ridge_r2_valid = r2_score(y_valid, ridge.predict(X_valid))
ridge_r2_train = r2_score(y_train, ridge.predict(X_train))
print("R^2 valid: {: }".format(ridge_r2_valid))
print("R^2 train: {: }".format(ridge_r2_train))

R^2 valid:  0.5587575283297722
R^2 train:  0.5905363822984453


In [21]:
# Ridge predictions for the PCA-projected test set.
y_pred = ridge.predict(pca2_results_test)

In [22]:
# Two-column submission table (test ID, predicted target), written without
# the pandas index.
sub = pd.DataFrame({'ID': id_test, 'y': y_pred}, columns=['ID', 'y'])
sub.to_csv('ridge-eliminate320.csv', index=False)


In [23]:
# R^2 of the ridge model over ALL training rows (training fold and held-out
# fold together).
ridge_pred_all = ridge.predict(pca2_results_train)
print("R^2 train{}".format(r2_score(y, ridge_pred_all)))

R^2 train0.584701782065228


In [24]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

In [25]:
# Scaler that divides each feature by its maximum absolute value.
scaler = MaxAbsScaler()

In [26]:
# Learn the per-feature scaling factors from the training fold only; the
# fitted scaler is the cell output.
scaler.fit(X_train)

MaxAbsScaler(copy=True)

In [27]:
# Apply the scaling learned on the training fold to all three splits.
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(pca2_results_test)

In [28]:
from sklearn.svm import SVR

In [29]:
# Support-vector regressor with C=100 and gamma='auto'; all other settings
# are left at the library defaults.
# NOTE: this rebinds `model`, which previously held the XGBoost booster.
model = SVR(C=100, gamma='auto')

In [30]:
# Fit the SVR on the scaled training fold; the fitted estimator is the cell
# output.
model.fit(X_train_scaled, y_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [31]:
from sklearn.metrics import r2_score
# R^2 of the SVR on the scaled training fold and on the scaled held-out fold.
svr_r2_train = r2_score(y_train, model.predict(X_train_scaled))
svr_r2_valid = r2_score(y_valid, model.predict(X_valid_scaled))
print("R^2 train:{}".format(svr_r2_train))
print("R^2 valid:{}".format(svr_r2_valid))

R^2 train:0.5471745832995186
R^2 valid:0.5548875927654109


In [32]:
# SVR predictions for the scaled, PCA-projected test set.
y_pred = model.predict(X_test_scaled)

In [33]:
# Two-column submission table (test ID, predicted target), written without
# the pandas index.
sub = pd.DataFrame({'ID': id_test, 'y': y_pred}, columns=['ID', 'y'])
sub.to_csv('svr-eliminate320.csv', index=False)
