In [186]:
import os
import numpy as np
import pandas as pd

import tensorflow as tf
import tflearn

import xgboost as xgb
#import autosklearn.regression

from sklearn.metrics import r2_score
from scipy import stats
from tpot import TPOTRegressor

from sklearn import model_selection
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import VotingClassifier


# Reading the Data

In [187]:
datapath = 'Data/'
test_csv = 'test.csv'
train_csv = 'train.csv'
sample_csv = 'sample_submission.csv'

In [188]:
data = pd.read_csv(os.path.join(datapath,train_csv))

# Understanding the Data

In [189]:
print('Variables with letters are categorical. Variables with 0/1 are binary values.')
data.head(3)

Variables with letters are categorical. Variables with 0/1 are binary values.


Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0


In [190]:
tp_int = []
tp_float = []
tp_obj = []
tp_other = []

#Getting all variables names, less ID and looking for it types 
for i in data.columns.to_series()[1:]:
    if data[i].dtype == 'int64':
        tp_int.append(i)
    elif data[i].dtype == 'float64':
        tp_float.append(i)
    elif data[i].dtype == 'object':
        tp_obj.append(i)
    else:
        tp_other.append(i)
        
dic = {'tp_int': tp_int, 'tp_float': tp_float, 'tp_obj': tp_obj, 'tp_other': tp_other}

print('Categorical:', tp_obj)
print('Float:', tp_float)
print('Has other type?', tp_other)

Categorical: ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']
Float: ['y']
Has other type? []


In [191]:
drop = []
binarys = []
for i in tp_int:
    print('Variable: {}, Min: {}, Max: {} , Unique: {}'.format(i, data[i].min(), data[i].max() ,data[i].unique()))
    if data[i].max() == 0:
        drop.append(i)
    elif data[i].max() == 1 and data[i].min() == 0 and len(data[i].unique()) == 2:
        binarys.append(i)
    else:
        pass 

Variable: X10, Min: 0, Max: 1 , Unique: [0 1]
Variable: X11, Min: 0, Max: 0 , Unique: [0]
Variable: X12, Min: 0, Max: 1 , Unique: [0 1]
Variable: X13, Min: 0, Max: 1 , Unique: [1 0]
Variable: X14, Min: 0, Max: 1 , Unique: [0 1]
Variable: X15, Min: 0, Max: 1 , Unique: [0 1]
Variable: X16, Min: 0, Max: 1 , Unique: [0 1]
Variable: X17, Min: 0, Max: 1 , Unique: [0 1]
Variable: X18, Min: 0, Max: 1 , Unique: [1 0]
Variable: X19, Min: 0, Max: 1 , Unique: [0 1]
Variable: X20, Min: 0, Max: 1 , Unique: [0 1]
Variable: X21, Min: 0, Max: 1 , Unique: [1 0]
Variable: X22, Min: 0, Max: 1 , Unique: [0 1]
Variable: X23, Min: 0, Max: 1 , Unique: [0 1]
Variable: X24, Min: 0, Max: 1 , Unique: [0 1]
Variable: X26, Min: 0, Max: 1 , Unique: [0 1]
Variable: X27, Min: 0, Max: 1 , Unique: [0 1]
Variable: X28, Min: 0, Max: 1 , Unique: [0 1]
Variable: X29, Min: 0, Max: 1 , Unique: [0 1]
Variable: X30, Min: 0, Max: 1 , Unique: [0 1]
Variable: X31, Min: 0, Max: 1 , Unique: [1 0]
Variable: X32, Min: 0, Max: 1 , Uniq

Variable: X206, Min: 0, Max: 1 , Unique: [0 1]
Variable: X207, Min: 0, Max: 1 , Unique: [0 1]
Variable: X208, Min: 0, Max: 1 , Unique: [0 1]
Variable: X209, Min: 0, Max: 1 , Unique: [1 0]
Variable: X210, Min: 0, Max: 1 , Unique: [0 1]
Variable: X211, Min: 0, Max: 1 , Unique: [0 1]
Variable: X212, Min: 0, Max: 1 , Unique: [0 1]
Variable: X213, Min: 0, Max: 1 , Unique: [0 1]
Variable: X214, Min: 0, Max: 1 , Unique: [0 1]
Variable: X215, Min: 0, Max: 1 , Unique: [0 1]
Variable: X216, Min: 0, Max: 1 , Unique: [0 1]
Variable: X217, Min: 0, Max: 1 , Unique: [0 1]
Variable: X218, Min: 0, Max: 1 , Unique: [0 1]
Variable: X219, Min: 0, Max: 1 , Unique: [0 1]
Variable: X220, Min: 0, Max: 1 , Unique: [1 0]
Variable: X221, Min: 0, Max: 1 , Unique: [0 1]
Variable: X222, Min: 0, Max: 1 , Unique: [0 1]
Variable: X223, Min: 0, Max: 1 , Unique: [0 1]
Variable: X224, Min: 0, Max: 1 , Unique: [0 1]
Variable: X225, Min: 0, Max: 1 , Unique: [0 1]
Variable: X226, Min: 0, Max: 1 , Unique: [0 1]
Variable: X22

In [192]:
print('Variables only with 0:', drop)

if len(drop) + len(binarys) == len(tp_int):
    print('All other integer variables are binary')

Variables only with 0: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
All other integer variables are binary


In [193]:
#Dropping variables only with zero
data = data.drop(drop, axis = 1)

# One hot enconding

In [194]:
#Best way to work with categorical variables? I don't think is the best in all case. I will study about that

for each in tp_obj:
    dummies = pd.get_dummies(data[each], prefix=each, drop_first=False)
    data = pd.concat([data, dummies], axis=1)

data = data.drop(tp_obj, axis=1)
data.head()

Unnamed: 0,ID,y,X10,X12,X13,X14,X15,X16,X17,X18,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,130.81,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,6,88.53,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,7,76.26,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,9,80.62,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Test Data

In [195]:
data_test = pd.read_csv(os.path.join(datapath,test_csv))

In [196]:
#Dropping variables only with zero
data_test = data_test.drop(drop, axis = 1)

In [197]:
for each in tp_obj:
    dummies = pd.get_dummies(data_test[each], prefix=each, drop_first=False)
    data_test = pd.concat([data_test, dummies], axis=1)

data_test = data_test.drop(tp_obj, axis=1)
data_test.head()

Unnamed: 0,ID,X10,X12,X13,X14,X15,X16,X17,X18,X19,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Keeping only columns who are in both data sets

In [198]:
keep = np.intersect1d(data.columns.to_series(), data_test.columns.to_series())

In [199]:
data = data[np.append(keep,'y')]

In [200]:
data_test = data_test[keep]

# Join Train and Test

In [201]:
len_train = len(data)
len_test = len(data_test)

data_all = pd.concat([data.drop(['y'],axis=1),data_test])


In [202]:
print(len_train)
print(len_test)
print(len(data_all))

4209
4209
8418


# PCA, ICA, Clustering, etc

In [203]:
## Got this from https://www.kaggle.com/frednavruzov/baselines-to-start-with-lb-0-56
## https://www.kaggle.com/linux18/kernel-0-5686/output

from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import TruncatedSVD

n_comp = 10

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results = pca.fit_transform(data_all.drop(["ID"], axis=1))

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results = ica.fit_transform(data_all.drop(["ID"], axis=1))

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results = tsvd.fit_transform(data_all.drop(["ID"], axis=1))

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results = grp.fit_transform(data_all.drop(["ID"], axis=1))

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results = srp.fit_transform(data_all.drop(["ID"], axis=1))





In [204]:
#Still doing this

from sklearn import cluster

from scipy.spatial import distance
from sklearn import metrics

# Doing DBSCAN
#D = distance.squareform(distance.pdist(data_all.drop(["ID"],axis=1), metric = 'hamming'))
D = distance.squareform(distance.pdist(data_all.drop(["ID"],axis=1)))

S = 1 - (D / np.max(D))

#Chosen to get 5-15 clusters. Getting 9 currently (label=-1 is "noise").
eps = 1
min_samples = 40

#db = cluster.DBSCAN(eps = 100000,min_samples=100,n_jobs=-1).fit(data.drop(["ID"],axis=1))
db = cluster.DBSCAN(eps = eps,min_samples=min_samples,n_jobs=-1).fit(S)
core_samples = db.core_sample_indices_
labels = db.labels_

In [205]:
print(labels)
print(len(labels))
print(core_samples)

n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print(n_clusters_)



[-1 -1 -1 ..., -1 -1 -1]
8418
[  52   97  260  271  297  373  529  608  633  637  648  694  710  726  761
  827  911  917  921  922  929  938  943  948  961 1017 1018 1068 1089 1105
 1110 1114 1122 1128 1133 1137 1161 1162 1176 1183 1199 1230 1249 1281 1286
 1317 1338 1380 1397 1398 1410 1412 1449 1458 1516 1518 1530 1553 1557 1586
 1592 1608 1612 1622 1635 1649 1676 1805 1836 1841 1863 1876 1883 1911 1951
 1952 1984 1995 2003 2020 2056 2064 2098 2126 2138 2153 2159 2187 2201 2213
 2297 2335 2348 2371 2384 2401 2403 2407 2410 2414 2415 2429 2430 2458 2463
 2464 2495 2508 2519 2521 2548 2549 2558 2572 2618 2624 2625 2648 2683 2735
 2754 2762 2779 2792 2861 2959 2987 3005 3013 3014 3018 3028 3034 3048 3101
 3111 3162 3169 3177 3233 3284 3293 3383 3397 3400 3429 3441 3447 3493 3514
 3519 3526 3531 3580 3583 3604 3662 3682 3693 3705 3737 3778 3783 3864 3890
 3893 3904 3909 3967 4010 4011 4032 4058 4071 4124 4165 4202 4217 4219 4225
 4302 4324 4327 4332 4344 4357 4388 4443 4496 4566 4582 46

In [206]:
# Append decomposition components to datasets
for i in range(1, n_comp+1):
    data_all['pca_' + str(i)] = pca2_results[:,i-1]
    
    data_all['ica_' + str(i)] = ica2_results[:,i-1]
    
    
    data_all['tsvd_' + str(i)] = tsvd_results[:,i-1]

    data_all['grp_' + str(i)] = grp_results[:,i-1]

    data_all['srp_' + str(i)] = srp_results[:,i-1]

data_all['cluster'] = labels
# Append cluters to datasets

In [207]:
#The guy in the link is only adding new colunms. I think this doesnt make sense (and barely changed the result). 

#I also created more PCA/ICA features than just 10

#data = data.drop(labels = pca_drop, axis = 1)
#data_test = data_test.drop(labels = pca_drop, axis = 1)

#print(data.head())
#print(data_test.head())

#This got worse results


In [208]:
print(data_all.head())
len(data_all)

   ID  X0_a  X0_ad  X0_af  X0_ai  X0_aj  X0_ak  X0_al  X0_am  X0_ao   ...     \
0   0     0      0      0      0      0      0      0      0      0   ...      
1   6     0      0      0      0      0      0      0      0      0   ...      
2   7     0      0      0      0      0      0      0      0      0   ...      
3   9     0      0      0      0      0      0      0      0      0   ...      
4  13     0      0      0      0      0      0      0      0      0   ...      

      ica_9    tsvd_9     grp_9     srp_9    pca_10    ica_10   tsvd_10  \
0  0.006600  0.614259  1.666618  0.000000 -0.298623 -0.019325  0.170974   
1  0.005459  0.750283  0.498245  1.525103  0.610756  0.002209 -0.088325   
2 -0.000203 -0.344924 -2.278078  0.000000 -0.298661 -0.011307 -1.098066   
3  0.000465  0.399228 -3.151707  1.525103  0.128562 -0.002679 -1.400843   
4 -0.000757  0.155938 -4.238580  1.525103 -0.365441 -0.000968 -1.992382   

     grp_10    srp_10  cluster  
0  3.441077  0.000000       -1  
1 

8418

# One Hot Enconding... In the clusters!

In [209]:
data_all.head()

Unnamed: 0,ID,X0_a,X0_ad,X0_af,X0_ai,X0_aj,X0_ak,X0_al,X0_am,X0_ao,...,ica_9,tsvd_9,grp_9,srp_9,pca_10,ica_10,tsvd_10,grp_10,srp_10,cluster
0,0,0,0,0,0,0,0,0,0,0,...,0.0066,0.614259,1.666618,0.0,-0.298623,-0.019325,0.170974,3.441077,0.0,-1
1,6,0,0,0,0,0,0,0,0,0,...,0.005459,0.750283,0.498245,1.525103,0.610756,0.002209,-0.088325,3.804237,0.0,-1
2,7,0,0,0,0,0,0,0,0,0,...,-0.000203,-0.344924,-2.278078,0.0,-0.298661,-0.011307,-1.098066,4.337093,-3.050207,-1
3,9,0,0,0,0,0,0,0,0,0,...,0.000465,0.399228,-3.151707,1.525103,0.128562,-0.002679,-1.400843,4.861759,-1.525103,-1
4,13,0,0,0,0,0,0,0,0,0,...,-0.000757,0.155938,-4.23858,1.525103,-0.365441,-0.000968,-1.992382,4.93272,-3.050207,-1


In [210]:
dummies = pd.get_dummies(data_all['cluster'], prefix='cluster', drop_first=False)
data_all = pd.concat([data_all, dummies], axis=1)

data_all = data_all.drop('cluster', axis=1)


In [211]:
data_all.head()

Unnamed: 0,ID,X0_a,X0_ad,X0_af,X0_ai,X0_aj,X0_ak,X0_al,X0_am,X0_ao,...,cluster_-1,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,6,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,9,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,13,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


# Separate again in train and test

In [212]:
data2 = data_all.head(len_train)
data_test2 = data_all.tail(len_test)
data2['y'] = data['y']

print(len(data2))
print(len(data_test2))

print(data.head(5))
print(data2.head(5))


print(data_test.head(5))
print(data_test2.head(5))

data = data2
data_test = data_test2


del data2, data_test2

4209
4209
   ID  X0_a  X0_ad  X0_af  X0_ai  X0_aj  X0_ak  X0_al  X0_am  X0_ao   ...    \
0   0     0      0      0      0      0      0      0      0      0   ...     
1   6     0      0      0      0      0      0      0      0      0   ...     
2   7     0      0      0      0      0      0      0      0      0   ...     
3   9     0      0      0      0      0      0      0      0      0   ...     
4  13     0      0      0      0      0      0      0      0      0   ...     

   X90  X91  X92  X94  X95  X96  X97  X98  X99       y  
0    0    0    0    0    0    0    0    0    0  130.81  
1    0    0    0    0    0    1    0    1    0   88.53  
2    0    0    0    0    0    1    0    1    0   76.26  
3    0    0    0    0    0    1    0    1    0   80.62  
4    0    0    0    0    0    1    0    1    0   78.02  

[5 rows x 543 columns]
   ID  X0_a  X0_ad  X0_af  X0_ai  X0_aj  X0_ak  X0_al  X0_am  X0_ao   ...    \
0   0     0      0      0      0      0      0      0      0      0   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# Keeping colunms in both train and test (needed due to new cluster colunms)

In [213]:
keep = np.intersect1d(data.columns.to_series(), data_test.columns.to_series())
data = data[np.append(keep,'y')]
data_test = data_test[keep]

In [214]:
# Droping noise cluster
#data = data.drop('cluster_-1',axis=1)
#data_test = data_test.drop('cluster_-1',axis=1)
# Made things worse, for some reason.

# Splitting data into training(validation)/testing

In [215]:
train_features, test_features, train_y, test_y = model_selection.train_test_split(
    data, data['y'], test_size = 0.3, random_state = 0)

In [216]:
train_features = train_features.drop(['ID', 'y'], axis = 1).values
test_features = test_features.drop(['ID', 'y'], axis = 1).values

# Rede neural
train_y_rn = train_y[:, np.newaxis]
test_y_rn = test_y[:, np.newaxis]

train_y = train_y.values
test_y = test_y.values

In [217]:
#del data

# Using all Data available

In [218]:
train_y = data['y'].values

train_features = data.drop(['ID', 'y'], axis = 1).values

In [219]:
train_y.shape

(4209,)

# xgboost

In [220]:
rg_xgb = xgb.sklearn.XGBRegressor(base_score=train_y.mean(),
                                        learning_rate = 0.005,
                                        n_estimators = 800,
                                        subsample = 0.95,
                                        max_depth = 4,
                                        objective = 'reg:linear',
                                        silent = 1).fit(train_features, train_y)

In [221]:
rg_xgb_score = rg_xgb.score(test_features, test_y)
print('Xgboost Score:', rg_xgb_score)

predict = rg_xgb.predict(test_features)

slope, intercept, r_value, p_value, std_err = stats.linregress(predict, test_y)
print('Xgboost Square:', r_value**2)

Xgboost Score: 0.676217086364
Xgboost Square: 0.686090355993


In [222]:
predict_test = rg_xgb.predict(data_test.iloc[:, 1:].values)

submission = pd.DataFrame({'ID': data_test['ID'], 'y': predict_test})


In [223]:
#submission.to_csv(os.path.join(datapath,'submissionrg_xgb2.csv'), index = False)
submission.to_csv('submissionrg_xgb2.csv', index = False)

# Random Forest

In [None]:
randomforest = RandomForestRegressor(n_estimators=200, max_features='auto', bootstrap=False, 
                                   oob_score=False, n_jobs=-1, random_state=0).fit(train_features, train_y)

In [None]:
randomforest_score = randomforest.score(test_features, test_y)
print('RF Score:', randomforest_score)

predict = randomforest.predict(test_features)

slope, intercept, r_value, p_value, std_err = stats.linregress(predict, test_y)
print('R Square:', r_value**2)

In [None]:
predict_test = randomforest.predict(data_test.iloc[:, 1:].values)

submission = pd.DataFrame({'ID': data_test['ID'], 'y': predict_test})
submission.to_csv(os.path.join(datapath,'submissionrf.csv'), index = False)

# Tpot

In [None]:
rg_tpot = TPOTRegressor(verbosity=2, 
                        max_time_mins=300, 
                        max_eval_time_mins=5, 
                        population_size=50,
                        generations=5,
                        n_jobs = -1,
                        scoring = 'neg_mean_squared_error')

rg_tpot.fit(train_features, train_y)

Optimization Progress:   0%|          | 0/50 [00:00<?, ?pipeline/s]

In [None]:
rg_tpot_score = rg_tpot.score(test_features, test_y)
print('rg_tpot Score:', rg_tpot_score)

predict = rg_tpot.predict(test_features)

slope, intercept, r_value, p_value, std_err = stats.linregress(predict, test_y)
print('Tpot Square:', r_value**2)

In [None]:
predict_test = rg_tpot.predict(data_test.iloc[:, 1:].values)

submission = pd.DataFrame({'ID': data_test['ID'], 'y': predict_test})
submission.to_csv(os.path.join(datapath,'submissionrg_tpot_terceira_interacao.csv'), index = False)

In [None]:
''' ~0.54995 No Kaggle

Model Tpot Param:
verbosity=2, 
max_time_mins=60, 
max_eval_time_mins=4, 
population_size=50,
generations=5,
n_jobs = -1,
scoring = neg_mean_squared_error

ExtraTreesRegressor(input_matrix, ExtraTreesRegressor__bootstrap=True, 
                    ExtraTreesRegressor__max_features=0.45, ExtraTreesRegressor__min_samples_leaf=18,
                    ExtraTreesRegressor__min_samples_split=18, ExtraTreesRegressor__n_estimators=100)
                    
'''

In [None]:
''' 0.54865
rg_tpot = TPOTRegressor(verbosity=2, 
                        max_time_mins=300, 
                        max_eval_time_mins=5, 
                        population_size=50,
                        generations=5,
                        n_jobs = 1,
                        scoring = 'neg_mean_squared_error')

Best pipeline: XGBRegressor(input_matrix, XGBRegressor__learning_rate=DEFAULT, 
XGBRegressor__max_depth=2, XGBRegressor__min_child_weight=17, 
XGBRegressor__n_estimators=100, XGBRegressor__nthread=1, XGBRegressor__subsample=0.95)

In [None]:
''' 0.54841 No Kaggle
Vale ressaltar que parece que não rodou todo

rg_tpot = TPOTRegressor(verbosity=2, 
                        max_time_mins=440, 
                        max_eval_time_mins=6, 
                        population_size=75,
                        generations=10,
                        n_jobs = -1,
                        scoring = 'neg_mean_squared_error')

Best pipeline: RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=DEFAULT, RandomForestRegressor__max_features=0.7, 
                                     RandomForestRegressor__min_samples_leaf=20, RandomForestRegressor__min_samples_split=13, 
                                     RandomForestRegressor__n_estimators=100)
                                     
                                     

# Adaboost

In [None]:
rg_adboost = AdaBoostRegressor(base_estimator=None,
                             n_estimators=400,
                             learning_rate=0.1,
                             random_state=0).fit(train_features, train_y)

In [None]:
rg_adboost_score = rg_adboost.score(test_features, test_y)
print('rg_adboost Score:', rg_adboost_score)

predict = rg_adboost.predict(test_features)

slope, intercept, r_value, p_value, std_err = stats.linregress(predict, test_y)
print('Adaboost Square:', r_value**2)

# Rede Neural

In [None]:
# Não está convergindo, provavelmente problema com muitas variáveis pouco explicativas

def build_model():
    tf.reset_default_graph()
    
    net = tflearn.input_data([None, train_features.shape[1]])

    net = tflearn.fully_connected(net, 1440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 2440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 2440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 2440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 2440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 3440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 2440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 1440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 2440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 2440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 2440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 1440, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 740, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 320, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 80, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    net = tflearn.fully_connected(net, 10, activation='ReLU') 
    net = tflearn.dropout(net, 0.80)
    
    net = tflearn.fully_connected(net, 1, activation='linear')
    net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1, loss="mean_square")
    
    model = tflearn.DNN(net)
    return model

model = build_model()

In [None]:
model.fit(train_features, train_y_rn, validation_set=0.2, show_metric=True, batch_size=512, n_epoch=2000)

In [None]:
predict = model.predict(test_features)
predict = [predict[i][0] for i in range(0, len(predict))]

slope, intercept, r_value, p_value, std_err = stats.linregress(predict, test_y)
print('R Square:', r_value**2)

In [None]:
predict_test = model.predict(data_test.iloc[:, 1:])
predict_test = [predict_test[i][0] for i in range(0, len(predict_test))]

In [None]:
submission = pd.DataFrame({'ID': data_test['ID'], 'y': predict_test})
print(submission.head())
submission.to_csv(os.path.join(datapath,'submissiondp.csv'), index = False)

# Voting

In [None]:
predict = rg_xgb.predict(test_features) + rg_tpot.predict(test_features)
predict = predict / 2

slope, intercept, r_value, p_value, std_err = stats.linregress(predict, test_y)
print('Voting Square:', r_value**2)

In [None]:
#Melhor entrada

predict_test = rg_xgb.predict(data_test.iloc[:, 1:].values) + rg_tpot.predict(data_test.iloc[:, 1:].values) 
predict_test = predict_test / 2

submission = pd.DataFrame({'ID': data_test['ID'], 'y': predict_test})
submission.to_csv(os.path.join(datapath,'submissionvoting3.csv'), index = False)

# Autosklearn

In [None]:
autoskl = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=60,
                                                               per_run_time_limit=30,
                                                               seed=0)

autoskl.fit(train_features, train_y)

In [None]:
autoskl_score = autoskl.score(test_features, test_y)
print('autoskl Score:', autoskl_score)

predict = autoskl.predict(test_features)

slope, intercept, r_value, p_value, std_err = stats.linregress(predict, test_y)
print('Autosklearn Square:', r_value**2)

# XGboost - Kaggle Kernel

In [None]:
 ()# mmm, xgboost, loved by everyone ^-^
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 500, 
    'eta': 0.005,
    'max_depth': 4,
    'subsample': 0.95,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': train_y.mean(), # base prediction = mean(target)
    'silent': 1
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(train_features, train_y)
dtest = xgb.DMatrix(data_test.iloc[:, 1:].values)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=700, # increase to have better results (~700)
                   early_stopping_rounds=50,
                   verbose_eval=50, 
                   show_stdv=False
                  )

num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

In [None]:
print(r2_score(dtrain.get_label(), model.predict(dtrain)))

In [None]:
# make predictions and save results
y_pred = model.predict(dtest)
output = pd.DataFrame({'id': data_test['ID'].astype(np.int32), 'y': y_pred})
output.to_csv(os.path.join(datapath,'submissionxgb_kaggle.csv'), index = False)

# Old PCA ICA

In [None]:
## Got this from https://www.kaggle.com/frednavruzov/baselines-to-start-with-lb-0-56

from sklearn.decomposition import PCA, FastICA
n_comp = 5

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(data.drop(["ID","y"], axis=1))
pca2_results_test = pca.transform(data_test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(data.drop(["ID","y"], axis=1))
ica2_results_test = ica.transform(data_test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    data['pca_' + str(i)] = pca2_results_train[:,i-1]
    data_test['pca_' + str(i)] = pca2_results_test[:, i-1]
    
    data['ica_' + str(i)] = ica2_results_train[:,i-1]
    data_test['ica_' + str(i)] = ica2_results_test[:, i-1]
    
