In [26]:
import pandas as pd
import numpy as np

import lightgbm as lgb

# Evaluation of the model
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

In [2]:
# Load the derived feature matrix and turn +/-infinity into NaN so that
# infinite values are handled as ordinary missing values further down.
fm = (
    pd.read_csv('../input/costa-rican-poverty-derived-data/ft_2000_important.csv')
    .replace([np.inf, -np.inf], np.nan)
)
fm.shape

  interactivity=interactivity, compiler=compiler, result=result)


(10307, 2016)

In [14]:
# Keep only the row identifier 'Id' and the 'idhogar' key from the competition
# test file; `model_gbm` later merges predictions onto this frame by 'idhogar'.
test_base = pd.read_csv('../input/costa-rican-household-poverty-prediction/test.csv').loc[:, ['Id', 'idhogar']]
test_base.shape

(23856, 2)

In [3]:
from imblearn.over_sampling import SMOTE, ADASYN

In [4]:
# Over-samplers built with their library defaults (no random_state is set).
# NOTE(review): `ada` is not used by any cell visible in this notebook.
smote, ada = SMOTE(), ADASYN()

In [16]:
# Rows with a Target form the labelled training set; rows without one are the test set.
has_target = fm['Target'].notnull()
train = fm[has_target].copy()
test = fm[~has_target].copy()

# Pull the labels and the household ids out of the feature columns.
train_labels = np.array(train.pop('Target')).ravel()
train_ids = train.pop('idhogar')
test_ids = test.pop('idhogar')
test = test.drop(columns = ['Target'])

# Columns that were read as `object` are cast to float32 in both frames.
object_columns = [name for name in train.columns if train[name].dtype == 'object']
for name in object_columns:
    train[name] = train[name].astype(np.float32)
    test[name] = test[name].astype(np.float32)

# Remember the column order, then continue with plain float arrays.
feature_names = train.columns.tolist()
train, test = np.array(train, dtype = 'float'), np.array(test, dtype = 'float')

In [17]:
# Median imputation of missing values.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22; `sklearn.impute.SimpleImputer` is the replacement. The fallback keeps
# the cell working on installations that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer as SimpleImputer

imputer = SimpleImputer(strategy = 'median')

# Medians are learned from the training rows only and then re-used for the test
# rows, so no statistic is computed from the test set.
train = imputer.fit_transform(train.astype(np.float32))
test = imputer.transform(test.astype(np.float32))



In [18]:
# Replace any NaN / infinity still present in either matrix with finite numbers.
train, test = (np.nan_to_num(matrix) for matrix in (train, test))

In [19]:
# Store the labels as a float ndarray before they are handed to the over-sampler.
train_labels = np.asarray(train_labels).astype('float')

In [20]:
# Fit the SMOTE sampler on the imputed training matrix and its labels; the
# sampler's repr is echoed as this cell's output.
# NOTE(review): the following cell fits and resamples in a single call, so this
# separate fit looks redundant — confirm before removing.
smote.fit(train, train_labels)

  return umr_sum(a, axis, dtype, out, keepdims)


SMOTE(k_neighbors=5, kind='regular', m_neighbors=10, n_jobs=1, out_step=0.5,
   random_state=None, ratio=None, sampling_strategy='auto',
   svm_estimator=None)

In [21]:
# Over-sample the training set with SMOTE.
# `fit_resample` is the supported method name; `fit_sample` was only kept as a
# deprecated alias and is absent from later imbalanced-learn releases.
train_over, train_labels_over = smote.fit_resample(train, train_labels)

  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)


In [27]:
def macro_f1_score(labels, predictions):
    """Custom LightGBM evaluation metric: macro-averaged F1 score.

    Parameters
    ----------
    labels : array-like of shape (n_samples,)
        True class labels, encoded so that they match the class positions
        (0 .. n_classes - 1) of `predictions`.
    predictions : numpy array
        Per-class scores in one of two layouts:
        * 2-D, shape (n_samples, n_classes); or
        * 1-D, flattened class-major (all scores of class 0 first, then
          class 1, ...), i.e. length n_samples * n_classes.

    Returns
    -------
    tuple
        ('macro_f1', metric_value, True) — name, value, is_higher_better.
    """
    labels = np.asarray(labels).reshape(-1)
    predictions = np.asarray(predictions)
    n_samples = len(labels)

    if predictions.ndim == 2 and predictions.shape[0] == n_samples:
        # One row per sample: the predicted class is the best column.
        predicted_classes = predictions.argmax(axis = 1)
    else:
        # Class-major layout. The class count is taken from the size of the
        # prediction array rather than from the labels, so a fold that happens
        # to lack one class is still reshaped correctly.
        if n_samples:
            n_classes = max(predictions.size // n_samples, 1)
        else:
            n_classes = len(np.unique(labels))
        predicted_classes = predictions.reshape(n_classes, -1).argmax(axis = 0)

    metric_value = f1_score(labels, predicted_classes, average = 'macro')

    # Return is name, value, is_higher_better
    return 'macro_f1', metric_value, True

def model_gbm(hyperparameters, features, labels, 
              test_features, test_ids, submission_base, 
              nfolds = 5, return_preds = False, random_state = None):
    """Cross-validated LightGBM multiclass model with per-fold test predictions.

    For each stratified fold an `LGBMClassifier` is fit with early stopping on
    the custom `macro_f1_score` metric, the validation score is recorded, and
    class probabilities are predicted for `test_features`.

    Parameters
    ----------
    hyperparameters : dict
        Keyword arguments forwarded to `lgb.LGBMClassifier`. `objective`,
        `n_jobs`, `n_estimators` and `metric` are set here and must not be
        included.
    features : pandas.DataFrame
        Training features; column names are used for the importance table.
    labels : array-like
        Training labels, flattened to one dimension.
    test_features : array-like
        Test features with the same columns as `features`.
    test_ids : array-like
        One 'idhogar' value per row of `test_features`, in the same order.
    submission_base : pandas.DataFrame
        Frame with an 'idhogar' column onto which predictions are merged.
    nfolds : int, default 5
        Number of stratified folds.
    return_preds : bool, default False
        If True, return the un-averaged per-fold predictions.
    random_state : int or None, default None
        Seed for the fold shuffling; None keeps the shuffling unseeded.

    Returns
    -------
    (predictions, feature_importances) when `return_preds` is True, where
    `predictions` holds one row per test row and fold with columns 1-4,
    'idhogar', 'fold', 'Target' and 'confidence'.

    (submission, feature_importances, valid_scores) otherwise, where
    `submission` is `submission_base` with a 'Target' column (missing values
    filled with 4, stored as int8) and without 'idhogar', and `valid_scores`
    is an array with one validation score per fold.
    """
    
    feature_names = list(features.columns)
    
    model = lgb.LGBMClassifier(**hyperparameters, 
                               objective = 'multiclass', n_jobs = -1, 
                               n_estimators = 10000, metric = 'None')
    
    # Using stratified kfold cross validation
    strkfold = StratifiedKFold(n_splits = nfolds, shuffle = True, 
                               random_state = random_state)
    importances = np.zeros(len(feature_names))
    
    # Convert to arrays for indexing
    features = np.array(features)
    test_features = np.array(test_features)
    labels = np.array(labels).reshape((-1 ))
    
    # Plain array of ids: assigning a pandas Series to a DataFrame column aligns
    # on the index, which would attach ids to the wrong rows (or leave NaN)
    # whenever the Series index is not 0..n-1. An array is assigned by position.
    test_ids = np.asarray(test_ids).reshape(-1)
    
    fold_frames = []
    valid_scores = []
    
    # Iterate through the folds
    for i, (train_indices, valid_indices) in enumerate(strkfold.split(features, labels)):
        # Training and validation data
        X_train = features[train_indices]
        X_valid = features[valid_indices]
        y_train = labels[train_indices]
        y_valid = labels[valid_indices]
        
        # Train with early stopping
        model.fit(X_train, y_train, early_stopping_rounds = 100, 
                  eval_metric = macro_f1_score,
                  eval_set = [(X_train, y_train), (X_valid, y_valid)],
                  eval_names = ['train', 'valid'],
                  verbose = 200)
        
        # Record the validation fold score
        valid_scores.append(model.best_score_['valid']['macro_f1'])
        
        # Make predictions from the fold
        fold_probabilities = model.predict_proba(test_features)
        
        # Record each prediction for each class as a column (classes 1-4)
        fold_predictions = pd.DataFrame()
        for j in range(4):
            fold_predictions[(j + 1)] = fold_probabilities[:, j]
            
        fold_predictions['idhogar'] = test_ids
        fold_predictions['fold'] = (i+1)
        
        # Collected in a list and concatenated once after the loop; growing a
        # DataFrame with `append` copies it on every fold and the method is
        # gone from current pandas.
        fold_frames.append(fold_predictions)
        
        importances += model.feature_importances_ / nfolds   
        
        display(f'Fold {i + 1}, Validation Score: {round(valid_scores[i], 5)}, Estimators Trained: {model.best_iteration_}')

    predictions = pd.concat(fold_frames)
    
    feature_importances = pd.DataFrame({'feature': feature_names,
                                        'importance': importances})
    valid_scores = np.array(valid_scores)
    display(f'{nfolds} cross validation score: {round(valid_scores.mean(), 5)} with std: {round(valid_scores.std(), 5)}.')
    
    # If we want to examine predictions don't average over folds
    if return_preds:
        predictions['Target'] = predictions[[1, 2, 3, 4]].idxmax(axis = 1)
        predictions['confidence'] = predictions[[1, 2, 3, 4]].max(axis = 1)
        return predictions, feature_importances
    
    # Average the predictions over folds
    predictions = predictions.groupby('idhogar', as_index = False).mean()
    
    # Find the class and associated probability
    predictions['Target'] = predictions[[1, 2, 3, 4]].idxmax(axis = 1)
    predictions['confidence'] = predictions[[1, 2, 3, 4]].max(axis = 1)
    predictions = predictions.drop(columns = ['fold'])
    
    # Merge with the base to have one prediction for each individual
    submission = submission_base.merge(predictions[['idhogar', 'Target']], on = 'idhogar', how = 'left').drop(columns = ['idhogar'])
        
    # Rows whose 'idhogar' received no prediction default to class 4
    submission['Target'] = submission['Target'].fillna(4).astype(np.int8)
    
    # return the submission and feature importances
    return submission, feature_importances, valid_scores

In [28]:
# Hyperparameters forwarded to `lgb.LGBMClassifier` by `model_gbm`.
# NOTE(review): LightGBM is understood not to apply early stopping with 'dart'
# boosting; if so, every fold would train the full 10000 estimators set in
# `model_gbm` — confirm against the LightGBM documentation.
hyp_OPTaaS = dict(
    boosting_type = 'dart',
    colsample_bytree = 0.9843467236959204,
    learning_rate = 0.11598629586769524,
    min_child_samples = 44,
    num_leaves = 49,
    reg_alpha = 0.35397370408131534,
    reg_lambda = 0.5904910774606467,
    subsample = 0.6299872254632797,
    subsample_for_bin = 60611,
)

In [29]:
# Wrap the over-sampled matrix in a DataFrame: `model_gbm` reads the feature
# names from `features.columns`.
train_over = pd.DataFrame(data = train_over, columns = feature_names)

In [None]:
# Cross-validate on the over-sampled training data and build the submission.
submission, feature_importances, valid_scores = model_gbm(
    hyperparameters = hyp_OPTaaS,
    features = train_over,
    labels = train_labels_over,
    test_features = test,
    test_ids = test_ids,
    submission_base = test_base,
)

In [None]:
# Sanity check: number of labels after over-sampling.
train_labels_over.shape

In [None]:
# Sanity check: rows and columns of the over-sampled feature frame.
train_over.shape