In [79]:
%matplotlib inline
# problem worked on: AI Hack Tunisia #6 - Predictive analytics challenge #3
# url of the problem: https://zindi.africa/competitions/ai-hack-tunisia-6-predictive-analytics-challenge-3/leaderboard

In [91]:
# imports:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [92]:
df = pd.read_csv("data/train.csv")
t_df = pd.read_csv("data/test.csv")

In [93]:
# used methods in this notebook:
# data preparation: OneHotEncoding
# Feature selection: using wrapper : exhaustive feature selection
# ESTIMATORS = {
#    "Extra trees": ExtraTreesRegressor(n_estimators=10,
#                                       max_features=1,     # Out of 20000
#                                       random_state=0),
#    "K-nn": KNeighborsRegressor(),                          # Accept default parameters
#    "Linear regression": LinearRegression(),
#    "Ridge": RidgeCV(),
#    "Lasso": Lasso(),
#    "ElasticNet": ElasticNet(random_state=0),
#    "RandomForestRegressor": RandomForestRegressor(max_depth=4, random_state=2),
#    "Decision Tree Regressor":DecisionTreeRegressor(max_depth=5),
#    "MultiO/P GBR" :GradientBoostingRegressor(),
#    "MultiO/P AdaB" :AdaBoostRegressor(),
#    "GBR" : GradientBoostingRegressor(),
#     "SVR" : SVR(),
#     "xgb": xgboost.XGBRegressor(n_estimators=100, learning_rate=0.1, gamma=0, subsample=0.75, colsample_bytree=1, max_depth=7)
# }
# will work with all these estimators for feature selection and modeling:

In [94]:
df = df.drop('id', axis=1)
t_df = t_df.drop('id', axis=1)

In [95]:
df = pd.get_dummies(df)
t_df = pd.get_dummies(t_df)

In [96]:
df = df.drop('CTR_CATEGO_X_N', axis=1) # this column does not exist on test df

In [97]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso, ElasticNet
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [98]:
# Feature selection: 
# 1- working with wrappers: 
# Step forward feature selection, 
# Step backwards feature selection and 
# Exhaustive feature selection

In [99]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# OneHotEncoding: 
df = pd.get_dummies(df)

# Splitting data: 
train_features, test_features, train_labels, test_labels = train_test_split(
    df.drop(labels=['target'], axis=1),
    df['target'],
    test_size=0.25,
    random_state=41)

# remove highly correlated features: 
correlated_features = set()
correlation_matrix = df.corr()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)


train_features.drop(labels=correlated_features, axis=1, inplace=True)
test_features.drop(labels=correlated_features, axis=1, inplace=True)

train_features.shape, test_features.shape

((18789, 98), (6264, 98))

In [100]:
# Step forward feature selection:
# KNNRegressor with r2_score evaluation
from sklearn.metrics import mean_absolute_error
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.metrics import r2_score
methods = [KNeighborsRegressor, 
           XGBRegressor,
           SVR,
           DecisionTreeRegressor,
           LinearRegression, 
           RidgeCV,
           Lasso,
           ElasticNet,
           ExtraTreesRegressor, 
           RandomForestRegressor, 
           GradientBoostingRegressor, 
           AdaBoostRegressor]

feature_selector = SequentialFeatureSelector(KNeighborsRegressor(),
           k_features=98,
           forward=True,
           verbose=2,
           scoring='r2',
           cv=4)

# features = feature_selector.fit(np.array(train_features.fillna(0)), train_labels)


In [101]:
filtered_features= train_features.columns[list(features.k_feature_idx_)]

In [103]:
# run with XGBRegressor:
# df = df[filtered_features]
df.shape
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.20, random_state=0)
xgb = XGBRegressor(colsample_bytree=0.7,learning_rate=0.03,max_depth=7,min_child_weight=4,
n_estimators=500,
nthread=4,
objective='reg:linear',
silent=1,
subsample=0.7)
xgb.fit(X_train, y_train)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0,
             importance_type='gain', learning_rate=0.03, max_delta_step=0,
             max_depth=7, min_child_weight=4, missing=None, n_estimators=500,
             n_jobs=1, nthread=4, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=1,
             subsample=0.7, verbosity=1)

In [106]:
y_pred_train = xgb.predict(X_train)
y_pred_test = xgb.predict(X_test)

In [108]:
from sklearn.metrics import mean_squared_error
from math import sqrt

mse_train = sqrt(mean_squared_error(y_train, y_pred_train))
mse_test = sqrt(mean_squared_error(y_test, y_pred_test))
print(" xgboost train set: %.2f%%" %(mse_train))
print(" xgboost test set : %.2f%%" %(mse_test))

 xgboost train set: 4.38%
 xgboost test set : 5.81%


In [73]:
from sklearn.metrics import mean_squared_error
from math import sqrt

mse_train = sqrt(mean_squared_error(y_train, y_pred_train))
mse_test = sqrt(mean_squared_error(y_test, y_pred_test))
print(" xgboost: %.2f%%" %(mse_train*100))
print(" xgboost: %.2f%%" %(mse_test*100))

NameError: name 'y_train' is not defined