A simple random forest (RF) model for the classification problem

https://www.hackerearth.com/challenge/competitive/brainwaves-17-1/

## Load datasets

In [1]:
import pandas as pd
import numpy as np

# Read the train and test CSVs from ../input and report each frame's
# (rows, columns) so the sizes are visible right after loading.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
for frame in (train, test):
    print(frame.shape)

(348978, 51)
(523466, 50)


## Preprocess

In [2]:
# Set aside the test ids (needed for the submission file) and the label,
# then remove those columns so both frames contain predictors only.
ID_COL = 'transaction_id'
TARGET_COL = 'target'

sub_ids = test[ID_COL]
y = train[TARGET_COL]

train = train.drop([ID_COL, TARGET_COL], axis=1)
test = test.drop([ID_COL], axis=1)

In [3]:
# Relative frequency of each label value (fractions rather than raw counts)
class_shares = y.value_counts(normalize=True)
class_shares

0    0.892922
1    0.107078
Name: target, dtype: float64

In [4]:
# Treat every column whose name contains 'cat_' as a categorical feature,
# then show how many there are.
cat_vars = [name for name in train.columns if 'cat_' in name]
len(cat_vars)

42

In [5]:
# Collect (and print) the categorical columns that have exactly one
# distinct value in the train set.
# NOTE(review): nunique() leaves missing values out of the count by default,
# so a column holding one value plus NaNs also matches here, even though a
# later cell encodes NaN as its own 'NaN' label. Confirm that is intended.
cat_to_drop_train = []
for col in cat_vars:
    n_levels = train[col].nunique()
    if n_levels != 1:
        continue
    print(col, n_levels)
    cat_to_drop_train.append(col)

cat_var_31 1
cat_var_35 1
cat_var_36 1
cat_var_37 1
cat_var_38 1
cat_var_40 1
cat_var_42 1


In [6]:
# Collect (and print) the categorical columns that have exactly one
# distinct value in the test set.
# NOTE(review): nunique() leaves missing values out of the count by default,
# so a column holding one value plus NaNs also matches here. Confirm that
# is intended.
cat_to_drop_test = []
for col in cat_vars:
    n_levels = test[col].nunique()
    if n_levels != 1:
        continue
    print(col, n_levels)
    cat_to_drop_test.append(col)

cat_var_38 1
cat_var_41 1
cat_var_42 1


In [7]:
# Remove every categorical column that was single-valued in either split.
# Going through a set de-duplicates names flagged in both train and test.
cat_to_drop = list(set(cat_to_drop_train + cat_to_drop_test))
train, test = (frame.drop(cat_to_drop, axis=1) for frame in (train, test))

# Both frames should now report the same number of columns.
for frame in (train, test):
    print(frame.shape)

(348978, 41)
(523466, 41)


In [8]:
# Recompute the categorical column list from the columns still in train,
# then show how many are left.
cat_vars = [name for name in train.columns if 'cat_' in name]
len(cat_vars)

34

In [9]:
# Integer-encode each categorical column. Missing entries are first replaced
# by the literal string 'NaN' so they get a code of their own, and the encoder
# is fitted on the values seen in train and test together so that transform()
# knows every value in both frames.
from sklearn.preprocessing import LabelEncoder

MISSING_LABEL = 'NaN'
for col in cat_vars:
    train_values = train[col].fillna(MISSING_LABEL)
    test_values = test[col].fillna(MISSING_LABEL)
    encoder = LabelEncoder().fit(list(set(train_values) | set(test_values)))
    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

In [10]:
# Shapes after encoding, printed for train and then test
for frame in (train, test):
    print(frame.shape)

(348978, 41)
(523466, 41)


## Evaluate a RandomForest model

In [11]:
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Estimate generalisation with 3-fold cross-validation: every training row
# gets a probability predicted by a forest that did not see it during fitting.
forest_clf = RandomForestClassifier(random_state=7)

y_probas_forest = cross_val_predict(
    estimator=forest_clf,
    X=train,
    y=y,
    cv=3,
    method='predict_proba',
)
# Second column of the probability matrix; with the 0/1 target shown above
# this should be the score for class 1.
y_scores_forest = y_probas_forest[:, 1]

# ROC AUC of the pooled out-of-fold scores against the true labels
roc_auc_score(y_true=y, y_score=y_scores_forest)

0.71618729280361049

## Predict on test set

In [12]:
# Train a fresh forest on all training rows for the final predictions.
# Apart from the seed, it relies on the library's default hyperparameters.
# fit() returns the estimator itself, so echoing it shows the fitted model.
forest_clf = RandomForestClassifier(random_state=7).fit(train, y)
forest_clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)

In [13]:
# Score the test rows and keep the second probability column as the prediction
test_probas = forest_clf.predict_proba(test)
preds = test_probas[:, 1]

## Prepare submission

In [14]:
from IPython.display import FileLink

filename = 'sub1.csv'

# Two-column submission frame: ids first, predicted score second.
sub = pd.DataFrame(
    {'transaction_id': sub_ids, 'target': preds},
    columns=['transaction_id', 'target'],
)

# Write without the index column, then show a clickable link to the file.
sub.to_csv(filename, index=False)
FileLink(filename)  # lb 0.72336