In [None]:
import os
import numpy as np
import pandas as pd

import tensorflow as tf
import tflearn

import xgboost as xgb
#import autosklearn.regression

from sklearn.metrics import r2_score
from scipy import stats
from tpot import TPOTRegressor

from sklearn import model_selection
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import VotingRegressor


# Reading the Data

In [None]:
# Folder that holds the competition files, followed by the individual CSV names.
datapath = 'Data/'
train_csv, test_csv = 'train.csv', 'test.csv'
sample_csv = 'sample_submission.csv'

In [None]:
# Read the training CSV from the data folder into the working frame.
train_path = os.path.join(datapath, train_csv)
data = pd.read_csv(train_path)

# Understanding the Data

In [None]:
# Reading hint for the preview below, then the first three rows as the cell output.
print('Variables with letters are categorical. Variables with 0/1 are binary values.')
data.head(n=3)

In [None]:
# Column names grouped by storage dtype; anything unexpected lands in tp_other.
tp_int, tp_float, tp_obj, tp_other = [], [], [], []
dtype_buckets = {'int64': tp_int, 'float64': tp_float, 'object': tp_obj}

# Every column except the first one (ID) is routed to the bucket matching its dtype name.
for column in data.columns[1:]:
    dtype_name = str(data[column].dtype)
    dtype_buckets.get(dtype_name, tp_other).append(column)

dic = {'tp_int': tp_int, 'tp_float': tp_float, 'tp_obj': tp_obj, 'tp_other': tp_other}

print('Categorical:', tp_obj)
print('Float:', tp_float)
print('Has other type?', tp_other)

In [None]:
# drop    -> integer columns whose maximum is 0
# binarys -> integer columns that take exactly the two values 0 and 1
drop = []
binarys = []
for name in tp_int:
    column = data[name]
    lowest, highest, distinct = column.min(), column.max(), column.unique()
    print('Variable: {}, Min: {}, Max: {} , Unique: {}'.format(name, lowest, highest, distinct))
    if highest == 0:
        drop.append(name)
        continue
    if highest == 1 and lowest == 0 and len(distinct) == 2:
        binarys.append(name)

In [None]:
print('Variables only with 0:', drop)

# If the two lists together account for every integer column, nothing else is left over.
all_accounted_for = len(tp_int) == len(drop) + len(binarys)
if all_accounted_for:
    print('All other integer variables are binary')

In [None]:
# Remove the columns collected in `drop` (integer columns whose maximum is 0).
data = data.drop(columns=drop)

# One-hot encoding

In [None]:
# Open question: is one-hot encoding the best way to handle categorical variables?
# Probably not in every case; to be studied further.

# One indicator frame per categorical column, prefixed with the column name.
train_dummies = [pd.get_dummies(data[col], prefix=col, drop_first=False) for col in tp_obj]

# Original columns (minus the raw categoricals) first, indicator columns appended after.
data = pd.concat([data.drop(tp_obj, axis=1)] + train_dummies, axis=1)
data.head()

# Test Data

In [None]:
# Read the test CSV from the same data folder.
test_path = os.path.join(datapath, test_csv)
data_test = pd.read_csv(test_path)

In [None]:
# Apply to the test frame the same column removal that was decided on the training data.
data_test = data_test.drop(columns=drop)

In [None]:
# Same one-hot encoding as for the training frame: one indicator frame per categorical
# column, appended after the remaining original columns.
test_dummies = [pd.get_dummies(data_test[col], prefix=col, drop_first=False) for col in tp_obj]

data_test = pd.concat([data_test.drop(tp_obj, axis=1)] + test_dummies, axis=1)
data_test.head()

# Keeping only columns that are in both data sets

In [None]:
# Sorted, de-duplicated column names present in both the train and the test frame.
keep = np.intersect1d(data.columns.values, data_test.columns.values)

In [None]:
# Shared columns plus the target, which only exists in the training frame.
train_columns = list(keep) + ['y']
data = data[train_columns]

In [None]:
# Restrict the test frame to the shared columns, in the same (sorted) order as `keep`.
test_columns = list(keep)
data_test = data_test[test_columns]

# PCA and ICA

In [None]:
## Got this from https://www.kaggle.com/frednavruzov/baselines-to-start-with-lb-0-56

from sklearn.decomposition import PCA, FastICA
n_comp = 10

# Columns fed to the decompositions: every column of the test frame except
#   * 'ID'  - the model cells drop it from the features, so it must not leak back in
#             through the components, and
#   * previously appended pca_/ica_ columns - so that re-running this cell does not
#             decompose its own earlier output.
# The same list is used for both frames so train and test see identical column order.
decomp_cols = [col for col in data_test.columns
               if col != 'ID' and not str(col).startswith(('pca_', 'ica_'))]
decomp_train = data[decomp_cols]
decomp_test = data_test[decomp_cols]

# PCA: fitted on the training rows only, then applied to the test rows.
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(decomp_train)
pca2_results_test = pca.transform(decomp_test)

# ICA: same protocol as PCA.
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(decomp_train)
ica2_results_test = ica.transform(decomp_test)

# Append decomposition components to datasets.
# The pca_i / ica_i interleaving is applied identically to both frames, which keeps the
# column order of the train features and of data_test.iloc[:, 1:] aligned.
for i in range(1, n_comp+1):
    data['pca_' + str(i)] = pca2_results_train[:, i-1]
    data_test['pca_' + str(i)] = pca2_results_test[:, i-1]

    data['ica_' + str(i)] = ica2_results_train[:, i-1]
    data_test['ica_' + str(i)] = ica2_results_test[:, i-1]
    


In [None]:
#pca_drop = list(data)[1:-101]
#print(pca_drop)
#print(type(pca_drop))

In [None]:
# The author of the linked kernel only appends the new columns to the existing ones. I don't think this makes sense (and it barely changed the result).

#I also created more PCA/ICA features than just 10

#data = data.drop(labels = pca_drop, axis = 1)
#data_test = data_test.drop(labels = pca_drop, axis = 1)

#print(data.head())
#print(data_test.head())

#This got worse results


# Splitting data into training/testing

In [None]:
# 70/30 split with a fixed seed. The full frame is split (ID and y still included) together
# with the target column; the identifier and target are removed from the features afterwards.
split_parts = model_selection.train_test_split(
    data, data['y'], random_state=0, test_size=0.3)
train_features, test_features, train_y, test_y = split_parts

In [None]:
# Neither the row identifier nor the target may be used as a model input.
non_feature_cols = ['ID', 'y']

# Convert both feature frames and both target series to plain numpy arrays.
train_features = train_features.drop(columns=non_feature_cols).values
test_features = test_features.drop(columns=non_feature_cols).values

train_y, test_y = train_y.values, test_y.values

# Using all Data available

In [None]:
# Re-point the training set at ALL labelled rows (used for the submission models).
# NOTE: test_features / test_y from the split cells stay defined and are a subset of these
# rows, so any score computed on them after this cell is measured on data the model has seen.
train_features = data.drop(columns=['ID', 'y']).values

train_y = data['y']

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 200 trees fitted on the current training set.
# max_features=1.0 (use every feature at each split) is what the former 'auto' setting
# meant for regressors; 'auto' itself is deprecated and rejected by recent scikit-learn.
# NOTE(review): with bootstrap=False and all features available, every tree is grown on
# the same rows and columns, so the trees can only differ through random tie-breaking.
randomforest = RandomForestRegressor(n_estimators=200, max_features=1.0, bootstrap=False,
                                     oob_score=False, n_jobs=-1, random_state=0).fit(train_features, train_y)

In [None]:
# Hold-out predictions and the estimator's own score() on the same rows.
predict = randomforest.predict(test_features)
randomforest_score = randomforest.score(test_features, test_y)
print('RF Score:', randomforest_score)

# Squared correlation coefficient of a least-squares line through (prediction, target).
r_value = stats.linregress(predict, test_y).rvalue
print('R Square:', r_value**2)

In [None]:
# Everything after the first column of data_test is used as the feature matrix
# (the first column is presumably ID - it is the column written to the submission below).
test_matrix = data_test.iloc[:, 1:].values
predict_test = randomforest.predict(test_matrix)

# Two-column submission file: ID and predicted y, without the index.
submission = pd.DataFrame({'ID': data_test['ID'], 'y': predict_test})
submission_path = os.path.join(datapath, 'submissionrf.csv')
submission.to_csv(submission_path, index=False)

# xgboost

In [None]:
# Gradient-boosted regressor fitted on the current training set.
#   * base_score: start every prediction from the mean of the target.
#   * objective:  'reg:squarederror' is the current name of the squared-error objective
#                 formerly called 'reg:linear' (deprecated and rejected by recent xgboost).
#   * verbosity:  replaces the deprecated `silent` flag; 0 keeps the library quiet.
rg_xgb = xgb.sklearn.XGBRegressor(base_score=float(train_y.mean()),
                                  learning_rate=0.005,
                                  n_estimators=600,
                                  subsample=0.95,
                                  max_depth=4,
                                  objective='reg:squarederror',
                                  verbosity=0).fit(train_features, train_y)

In [None]:
# Hold-out predictions and the estimator's own score() on the same rows.
predict = rg_xgb.predict(test_features)
rg_xgb_score = rg_xgb.score(test_features, test_y)
print('Xgboost Score:', rg_xgb_score)

# Squared correlation coefficient of a least-squares line through (prediction, target).
r_value = stats.linregress(predict, test_y).rvalue
print('Xgboost Square:', r_value**2)

In [None]:
# Everything after the first column of data_test is used as the feature matrix.
test_matrix = data_test.iloc[:, 1:].values
predict_test = rg_xgb.predict(test_matrix)

# Two-column submission file: ID and predicted y, without the index.
submission = pd.DataFrame({'ID': data_test['ID'], 'y': predict_test})
submission_path = os.path.join(datapath, 'submissionrg_xgb.csv')
submission.to_csv(submission_path, index=False)

# Tpot

In [None]:
# Automated pipeline search with TPOT.
# The search stops at whichever limit is hit first: the one-minute wall-clock budget or
# the generation count. random_state is fixed like for the other models in this notebook,
# so the evolutionary search starts from the same population on every run (the time budget
# can still cut the search at a different point from run to run).
rg_tpot = TPOTRegressor(verbosity=2,
                        max_time_mins=1,
                        max_eval_time_mins=0.1,
                        population_size=100,
                        generations=200,
                        n_jobs=1,
                        random_state=0)

rg_tpot.fit(train_features, train_y)

In [None]:
# Hold-out predictions and the fitted TPOT object's own score() on the same rows.
predict = rg_tpot.predict(test_features)
rg_tpot_score = rg_tpot.score(test_features, test_y)
print('rg_tpot Score:', rg_tpot_score)

# Squared correlation coefficient of a least-squares line through (prediction, target).
r_value = stats.linregress(predict, test_y).rvalue
print('Tpot Square:', r_value**2)

# Adaboost

In [None]:
# AdaBoost regressor fitted on the current training set.
# Two arguments of the earlier call were invalid and are removed:
#   * base_estimator must be an estimator object, not a number (the target mean was passed);
#     leaving it out uses scikit-learn's default weak learner.
#   * algorithm='SAMME.R' is an AdaBoostClassifier option; AdaBoostRegressor does not accept
#     it and raises a TypeError.
rg_adboost = AdaBoostRegressor(n_estimators=200,
                               learning_rate=0.1,
                               random_state=0).fit(train_features, train_y)

In [None]:
# Hold-out predictions and the estimator's own score() on the same rows.
predict = rg_adboost.predict(test_features)
rg_adboost_score = rg_adboost.score(test_features, test_y)
print('rg_adboost Score:', rg_adboost_score)

# Squared correlation coefficient of a least-squares line through (prediction, target).
r_value = stats.linregress(predict, test_y).rvalue
print('Adaboost Square:', r_value**2)

# Autosklearn

In [None]:
# The notebook's top-level auto-sklearn import is commented out, so the package is
# imported here to keep this cell runnable on a fresh kernel.
import autosklearn.regression

# Automated model search limited to 60 s overall and 30 s per candidate, with a fixed seed.
autoskl = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=60,
                                                      per_run_time_limit=30,
                                                      seed=0)

# No metric is passed: accuracy is a classification metric and does not apply to a
# continuous target. NOTE(review): auto-sklearn is expected to fall back to its default
# regression metric (R^2) - confirm against the installed version.
autoskl.fit(train_features, train_y)

In [None]:
# Hold-out predictions and the fitted auto-sklearn object's own score() on the same rows.
predict = autoskl.predict(test_features)
autoskl_score = autoskl.score(test_features, test_y)
print('autoskl Score:', autoskl_score)

# Squared correlation coefficient of a least-squares line through (prediction, target).
r_value = stats.linregress(predict, test_y).rvalue
print('Autosklearn Square:', r_value**2)

# Voting

In [None]:
# VotingRegressor is already imported in the notebook's first cell.
# Docs: https://scikit-learn.org/stable/modules/ensemble.html#voting-regressor

# Weighted average of the individual regressors' predictions; one weight per estimator,
# in the order they are listed. VotingRegressor has no `voting` option (hard/soft voting
# only exists for VotingClassifier), so passing voting='soft' raised a TypeError.
# rg_tpot is intentionally left out of the ensemble.
ensemble = VotingRegressor(estimators=[('randomforest', randomforest),
                                       ('rg_xgb', rg_xgb),
                                       ('rg_adboost', rg_adboost),
                                       ('autoskl', autoskl),
                                       ],
                           weights=[1, 2, 1, 2]).fit(train_features, train_y)

In [None]:
# Hold-out predictions and the ensemble's own score() on the same rows.
predict = ensemble.predict(test_features)
ensemble_score = ensemble.score(test_features, test_y)
print('ensemble Score:', ensemble_score)

# Squared correlation coefficient of a least-squares line through (prediction, target).
r_value = stats.linregress(predict, test_y).rvalue
print('ensemble Square:', r_value**2)

# Neural Network

In [None]:
import tensorflow as tf
import tflearn

# Define the neural network
def build_model():
    """Build a fresh tflearn regression network sized to the current training matrix.

    The input width is read from the notebook-level ``train_features`` array at call time.
    Architecture: two fully connected ReLU layers of 512 units, one dropout layer, and a
    single linear output unit trained with SGD on a mean-square loss.

    Returns:
        tflearn.DNN: the untrained model wrapper.
    """
    # Start from an empty graph so repeated calls do not accumulate variables.
    tf.reset_default_graph()

    n_inputs = train_features.shape[1]
    hidden_units = 512

    # Input placeholder followed by the two hidden layers.
    layers = tflearn.input_data([None, n_inputs])
    for _ in range(2):
        layers = tflearn.fully_connected(layers, hidden_units, activation='ReLU')
    layers = tflearn.dropout(layers, 0.80)

    # Single linear output and the training configuration.
    layers = tflearn.fully_connected(layers, 1, activation='linear')
    layers = tflearn.regression(layers, optimizer='sgd', learning_rate=0.1, loss="mean_square")

    return tflearn.DNN(layers)

model = build_model()

In [None]:
# The network ends in a single output unit, so its target placeholder is two-dimensional
# (rows x 1). train_y is one-dimensional (a Series or a flat array, depending on which
# cell defined it), so it is reshaped into a column before being fed to tflearn.
train_y_column = np.asarray(train_y).reshape(-1, 1)

# 20% of the rows are held back by tflearn for validation during training.
model.fit(train_features, train_y_column, validation_set=0.2, show_metric=True, batch_size=512, n_epoch=10000)

In [None]:
# Each prediction row holds a single value; keep just that value per row.
predict = [row[0] for row in model.predict(test_features)]

# Squared correlation coefficient of a least-squares line through (prediction, target).
r_value = stats.linregress(predict, test_y).rvalue
print('R Square:', r_value**2)

In [None]:
# Predict on every test column after the first one, then keep the single value of each row.
raw_test_predictions = model.predict(data_test.iloc[:, 1:])
predict_test = [row[0] for row in raw_test_predictions]

In [None]:
# Two-column submission frame: ID and predicted y.
submission = pd.DataFrame({'ID': data_test['ID'], 'y': predict_test})
print(submission.head())

# Written next to the input data, without the index column.
submission_path = os.path.join(datapath, 'submissiondp.csv')
submission.to_csv(submission_path, index=False)

# XGboost - Kaggle Kernel

In [None]:
 ()# mmm, xgboost, loved by everyone ^-^
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 500, 
    'eta': 0.005,
    'max_depth': 4,
    'subsample': 0.95,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': train_y.mean(), # base prediction = mean(target)
    'silent': 1
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(train_features, train_y)
dtest = xgb.DMatrix(data_test.iloc[:, 1:].values)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=700, # increase to have better results (~700)
                   early_stopping_rounds=50,
                   verbose_eval=50, 
                   show_stdv=False
                  )

num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

In [None]:
# R^2 of the booster on the very rows it was trained on (labels come from the DMatrix).
train_labels = dtrain.get_label()
train_predictions = model.predict(dtrain)
print(r2_score(train_labels, train_predictions))