In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

# Let pandas show wide frames and long cell values without truncation
pd.set_option(
    "display.max_columns", 500,
    "display.max_colwidth", 500,
    "display.expand_frame_repr", True,
)

# Seaborn style hook, called with its defaults
sns.set_style()

# NOTE(review): this hides every warning for the whole session, including
# deprecation notices from pandas / scikit-learn — consider narrowing it
warnings.filterwarnings(action="ignore")

In [2]:
# Read the credit applications and print each column's dtype and non-null count
credit = pd.read_csv(filepath_or_buffer="data/credit.csv")
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
checking_status           1000 non-null object
duration                  1000 non-null int64
credit_history            1000 non-null object
purpose                   1000 non-null object
credit_amount             1000 non-null int64
savings_status            1000 non-null object
employment                1000 non-null object
installment_commitment    1000 non-null int64
personal_status           1000 non-null object
other_parties             1000 non-null object
residence_since           1000 non-null int64
property_magnitude        1000 non-null object
age                       1000 non-null int64
other_payment_plans       1000 non-null object
housing                   1000 non-null object
existing_credits          1000 non-null int64
job                       1000 non-null object
num_dependents            1000 non-null int64
own_telephone             1000 non-null object
foreign_

In [3]:
# Preview the first rows of the credit data while the categoricals are still strings
credit.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,'<0',6,'critical/other existing credit',buy_radio_tv,1169,'no known savings','>=7',4,'male single',none,4,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,'0<=X<200',48,'existing paid',buy_radio_tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,2,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,3,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,'<0',42,'existing paid',buy_furniture_equipment,7882,'<100','4<=X<7',2,'male single',guarantor,4,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,'<0',24,'delayed previously',buy_new_car,4870,'<100','1<=X<4',3,'male single',none,4,'no known property',53,none,'for free',2,skilled,2,none,yes,bad


### Feature engineering
You are tasked with predicting whether a new cohort of loan applicants is likely to default on their loans. You have a historical dataset and wish to train a classifier on it. You notice that many features are in string format, which is a problem for your classifiers. You hence decide to encode the string columns numerically using `LabelEncoder()`. The function has been preloaded for you from the preprocessing submodule of sklearn. The dataset `credit` is also preloaded, as is a list of all column names whose data types are string, stored in `non_numeric_columns`.

In [4]:
non_numeric_columns = [
    'checking_status',
    'credit_history',
    'purpose',
    'savings_status',
    'employment',
    'personal_status',
    'other_parties',
    'property_magnitude',
    'other_payment_plans',
    'housing',
    'job',
    'own_telephone',
    'foreign_worker',
]

# Give every string column its own encoder and overwrite the column with
# the integer codes that encoder assigns
for column in non_numeric_columns:
    le = LabelEncoder()
    le.fit(credit[column])
    credit[column] = le.transform(credit[column])

# Show the resulting dtype of every column
print(credit.dtypes)

checking_status            int64
duration                   int64
credit_history             int64
purpose                    int64
credit_amount              int64
savings_status             int64
employment                 int64
installment_commitment     int64
personal_status            int64
other_parties              int64
residence_since            int64
property_magnitude         int64
age                        int64
other_payment_plans        int64
housing                    int64
existing_credits           int64
job                        int64
num_dependents             int64
own_telephone              int64
foreign_worker             int64
class                     object
dtype: object


In [5]:
# Spot-check the first three rows after label encoding
credit.head(3)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,1,6,1,4,1169,4,3,4,3,2,4,2,67,1,1,2,3,1,1,1,good
1,0,48,3,4,5951,2,0,2,0,2,2,2,22,1,1,1,3,1,0,1,bad
2,3,12,1,6,2096,2,1,2,3,2,3,2,49,1,1,1,2,2,0,1,good


### Your first pipeline
Your colleague has used `AdaBoostClassifier` for the credit scoring dataset. You want to also try out a random forest classifier. In this exercise, you will fit this classifier to the data and compare it to `AdaBoostClassifier`. Make sure to use train/test data splitting to avoid overfitting. The data is preloaded and transformed so that all features are numeric. The features are available as X and the labels as y. The module `RandomForestClassifier` has also been preloaded.

In [6]:
# (rows, columns) of the credit frame, target column included
credit.shape

(1000, 21)

In [7]:
def build_data(dataframe):
    """Split a frame into a feature matrix and a target vector.

    The last column is treated as the target; every column before it is a
    feature. Selection is purely positional.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose final column holds the labels.

    Returns
    -------
    tuple
        ``(X, y)``: all columns but the last, and the last column.
    """
    target = dataframe.iloc[:, -1]
    features = dataframe.iloc[:, :-1]
    return features, target

In [8]:
# Separate the credit features from the class label and confirm their shapes
X, y = build_data(dataframe=credit)
(X.shape, y.shape)

((1000, 20), (1000,))

In [9]:
# Hold out 20% of the rows for testing; the split seed is fixed at 1
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Build a random forest with its seed fixed to 2, then fit it on the training rows
rf_model = RandomForestClassifier(random_state=2)
rf_model.fit(X_train, y_train)

# Predict the labels of the held-out rows
rf_predictions = rf_model.predict(X_test)

# Record the forest's held-out accuracy under the 'rf' key
accuracies = {'rf': accuracy_score(y_test, rf_predictions)}

In [10]:
# Show the accuracy recorded for each model so far
accuracies

{'rf': 0.74}

### Grid search CV for model complexity
In the last slide, you saw how most classifiers have one or more hyperparameters that control their complexity. You also learned to tune them using `GridSearchCV()`. In this exercise, you will perfect this skill. You will experiment with:

- The number of trees, `n_estimators`, in a `RandomForestClassifier`.
- The number of estimators, `n_estimators`, in an `AdaBoostClassifier`.
- The number of nearest neighbors, `n_neighbors`, in `KNeighborsClassifier`.

In [11]:
# Set a range for n_estimators from 10 to 40 in steps of 10
param_grid = {'n_estimators': range(10, 50, 10)}

# Optimize for a RandomForestClassifier() using GridSearchCV.
# The forest is seeded so the selected n_estimators is the same on every
# rerun; an unseeded forest can pick a different winner each time.
# NOTE(review): the search runs on all of X, y rather than a training split.
grid = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)
grid.fit(X, y)
grid.best_params_

{'n_estimators': 20}

In [12]:
# Define a grid for n_estimators ranging from 1 to 10
param_grid = {'n_estimators': range(1, 11)}

# Optimize for a AdaBoostClassifier() using GridSearchCV.
# The booster is seeded so repeated runs select the same n_estimators;
# its default tree base learners break ties between equally good splits
# at random unless a random_state is fixed.
grid = GridSearchCV(AdaBoostClassifier(random_state=1), param_grid, cv=3)
grid.fit(X, y)
grid.best_params_

{'n_estimators': 10}

In [13]:
# Neighbourhood sizes to compare: 10, 50 and 100
param_grid = {'n_neighbors': [10, 50, 100]}

# 3-fold grid search over a k-nearest-neighbours classifier
grid = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid, cv=3)
grid.fit(X, y)
grid.best_params_

{'n_neighbors': 50}

### Categorical encodings
Your colleague has converted the columns in the credit dataset to numeric values using `LabelEncoder()`. He left one out: `credit_history`, which records the credit history of the applicant. You want to create two versions of the dataset. One will use `LabelEncoder()` and another one-hot encoding, for comparison purposes. The feature matrix is available to you as credit. You have `LabelEncoder()` preloaded and pandas as pd.

In [14]:
# Create numeric encoding for credit_history
credit_history_num = LabelEncoder().fit_transform(credit['credit_history'])

# Create a new feature matrix including the numeric encoding.
# `axis` is passed by keyword: pandas 2.0 made every pd.concat argument after
# `objs` keyword-only, so a positional `1` raises TypeError there.
# The Series carries credit's index so rows align on concat, and it is named
# so the new column is not labelled with the integer 0 among string names.
X_num = pd.concat(
    [X, pd.Series(credit_history_num, index=credit.index, name='credit_history_num')],
    axis=1,
)

# Create new feature matrix with dummies for credit_history; the prefix keeps
# the dummy column labels as strings
X_hot = pd.concat(
    [X, pd.get_dummies(credit['credit_history'], prefix='credit_history')],
    axis=1,
)

# Compare the number of features of the resulting DataFrames
X_hot.shape[1] > X_num.shape[1]

True

### Feature transformations
You are discussing the credit dataset with the bank manager. She suggests that the safest loan applications tend to request mid-range credit amounts. Values that are either too low or too high suggest high risk. This means that a non-linear relationship might exist between this variable and the class. You want to test this hypothesis. You will construct a non-linear transformation of the feature. Then, you will assess which of the two features is better at predicting the class using `SelectKBest()` and the `chi2()` metric, both of which have been preloaded.

The data is available as a pandas DataFrame called credit, with the class contained in the column class. You also have preloaded pandas as pd and numpy as np.

In [15]:
# Distance of every value from the mean of its own column
def abs_diff(x):
    """Return the element-wise absolute deviation of ``x`` from its mean."""
    center = np.mean(x)
    return np.abs(x - center)

# Store the distance-from-mean version of credit_amount as a new column
credit['diff'] = abs_diff(credit['credit_amount'])

# Selector that keeps only the single best-scoring feature under chi2
sk = SelectKBest(score_func=chi2, k=1)

# Score the raw amount against its transformed version, with class as target
sk.fit(
    credit[['credit_amount', 'diff']],
    credit['class'],
)

# Boolean mask over [credit_amount, diff]; True marks the selected feature
sk.get_support()

array([ True, False])

### Bringing it all together
You just joined an arrhythmia detection startup and want to train a model on the arrhythmias dataset arrh. You noticed that random forests tend to win quite a few Kaggle competitions, so you want to try that out with a maximum depth of 2, 5, or 10, using grid search. You also observe that the dimension of the dataset is quite high so you wish to consider the effect of a feature selection method.

To make sure you don't overfit by mistake, you have already split your data. You will use X_train and y_train for the grid search, and X_test and y_test to decide if feature selection helps. All four dataset folds are preloaded in your environment. You also have access to `GridSearchCV()`, `train_test_split()`, `SelectKBest()`, `chi2()` and `RandomForestClassifier` as rfc.

In [16]:
# Read the arrhythmia dataset and print a summary of its columns and dtypes
arrh = pd.read_csv(filepath_or_buffer="data/arrh.csv")
arrh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Columns: 280 entries, age to class
dtypes: float64(120), int64(160)
memory usage: 988.8 KB


In [17]:
# Preview the first rows of the arrhythmia data
arrh.head()

Unnamed: 0,age,sex,height,weight,QRSduration,PRinterval,Q-Tinterval,Tinterval,Pinterval,QRS,T,P,QRST,J,heartrate,chDI_Qwave,chDI_Rwave,chDI_Swave,chDI_RPwave,chDI_SPwave,chDI_intrinsicReflecttions,chDI_RRwaveExists,chDI_DD_RRwaveExists,chDI_RPwaveExists,chDI_DD_RPwaveExists,chDI_RTwaveExists,chDI_DD_RTwaveExists,chDII_Qwave,chDII_Rwave,chDII_Swave,chDII_RPwave,chDII_SPwave,chDII_intrinsicReflecttions,chDII_RRwaveExists,chDII_DD_RRwaveExists,chDII_RPwaveExists,chDII_DD_RPwaveExists,chDII_RTwaveExists,chDII_DD_RTwaveExists,chDIII_Qwave,chDIII_Rwave,chDIII_Swave,chDIII_RPwave,chDIII_SPwave,chDIII_intrinsicReflecttions,chDIII_RRwaveExists,chDIII_DD_RRwaveExists,chDIII_RPwaveExists,chDIII_DD_RPwaveExists,chDIII_RTwaveExists,chDIII_DD_RTwaveExists,chAVR_Qwave,chAVR_Rwave,chAVR_Swave,chAVR_RPwave,chAVR_SPwave,chAVR_intrinsicReflecttions,chAVR_RRwaveExists,chAVR_DD_RRwaveExists,chAVR_RPwaveExists,chAVR_DD_RPwaveExists,chAVR_RTwaveExists,chAVR_DD_RTwaveExists,chAVL_Qwave,chAVL_Rwave,chAVL_Swave,chAVL_RPwave,chAVL_SPwave,chAVL_intrinsicReflecttions,chAVL_RRwaveExists,chAVL_DD_RRwaveExists,chAVL_RPwaveExists,chAVL_DD_RPwaveExists,chAVL_RTwaveExists,chAVL_DD_RTwaveExists,chAVF_Qwave,chAVF_Rwave,chAVF_Swave,chAVF_RPwave,chAVF_SPwave,chAVF_intrinsicReflecttions,chAVF_RRwaveExists,chAVF_DD_RRwaveExists,chAVF_RPwaveExists,chAVF_DD_RPwaveExists,chAVF_RTwaveExists,chAVF_DD_RTwaveExists,chV1_Qwave,chV1_Rwave,chV1_Swave,chV1_RPwave,chV1_SPwave,chV1_intrinsicReflecttions,chV1_RRwaveExists,chV1_DD_RRwaveExists,chV1_RPwaveExists,chV1_DD_RPwaveExists,chV1_RTwaveExists,chV1_DD_RTwaveExists,chV2_Qwave,chV2_Rwave,chV2_Swave,chV2_RPwave,chV2_SPwave,chV2_intrinsicReflecttions,chV2_RRwaveExists,chV2_DD_RRwaveExists,chV2_RPwaveExists,chV2_DD_RPwaveExists,chV2_RTwaveExists,chV2_DD_RTwaveExists,chV3_Qwave,chV3_Rwave,chV3_Swave,chV3_RPwave,chV3_SPwave,chV3_intrinsicReflecttions,chV3_RRwaveExists,chV3_DD_RRwaveExists,chV3_RPwaveExists,chV3_DD_RPwaveExists,chV3_RTwaveExists,chV3_DD_RTwaveExists,chV4_Q
wave,chV4_Rwave,chV4_Swave,chV4_RPwave,chV4_SPwave,chV4_intrinsicReflecttions,chV4_RRwaveExists,chV4_DD_RRwaveExists,chV4_RPwaveExists,chV4_DD_RPwaveExists,chV4_RTwaveExists,chV4_DD_RTwaveExists,chV5_Qwave,chV5_Rwave,chV5_Swave,chV5_RPwave,chV5_SPwave,chV5_intrinsicReflecttions,chV5_RRwaveExists,chV5_DD_RRwaveExists,chV5_RPwaveExists,chV5_DD_RPwaveExists,chV5_RTwaveExists,chV5_DD_RTwaveExists,chV6_Qwave,chV6_Rwave,chV6_Swave,chV6_RPwave,chV6_SPwave,chV6_intrinsicReflecttions,chV6_RRwaveExists,chV6_DD_RRwaveExists,chV6_RPwaveExists,chV6_DD_RPwaveExists,chV6_RTwaveExists,chV6_DD_RTwaveExists,chDI_JJwaveAmp,chDI_QwaveAmp,chDI_RwaveAmp,chDI_SwaveAmp,chDI_RPwaveAmp,chDI_SPwaveAmp,chDI_PwaveAmp,chDI_TwaveAmp,chDI_QRSA,chDI_QRSTA,chDII_JJwaveAmp,chDII_QwaveAmp,chDII_RwaveAmp,chDII_SwaveAmp,chDII_RPwaveAmp,chDII_SPwaveAmp,chDII_PwaveAmp,chDII_TwaveAmp,chDII_QRSA,chDII_QRSTA,chDIII_JJwaveAmp,chDIII_QwaveAmp,chDIII_RwaveAmp,chDIII_SwaveAmp,chDIII_RPwaveAmp,chDIII_SPwaveAmp,chDIII_PwaveAmp,chDIII_TwaveAmp,chDIII_QRSA,chDIII_QRSTA,chAVR_JJwaveAmp,chAVR_QwaveAmp,chAVR_RwaveAmp,chAVR_SwaveAmp,chAVR_RPwaveAmp,chAVR_SPwaveAmp,chAVR_PwaveAmp,chAVR_TwaveAmp,chAVR_QRSA,chAVR_QRSTA,chAVL_JJwaveAmp,chAVL_QwaveAmp,chAVL_RwaveAmp,chAVL_SwaveAmp,chAVL_RPwaveAmp,chAVL_SPwaveAmp,chAVL_PwaveAmp,chAVL_TwaveAmp,chAVL_QRSA,chAVL_QRSTA,chAVF_JJwaveAmp,chAVF_QwaveAmp,chAVF_RwaveAmp,chAVF_SwaveAmp,chAVF_RPwaveAmp,chAVF_SPwaveAmp,chAVF_PwaveAmp,chAVF_TwaveAmp,chAVF_QRSA,chAVF_QRSTA,chV1_JJwaveAmp,chV1_QwaveAmp,chV1_RwaveAmp,chV1_SwaveAmp,chV1_RPwaveAmp,chV1_SPwaveAmp,chV1_PwaveAmp,chV1_TwaveAmp,chV1_QRSA,chV1_QRSTA,chV2_JJwaveAmp,chV2_QwaveAmp,chV2_RwaveAmp,chV2_SwaveAmp,chV2_RPwaveAmp,chV2_SPwaveAmp,chV2_PwaveAmp,chV2_TwaveAmp,chV2_QRSA,chV2_QRSTA,chV3_JJwaveAmp,chV3_QwaveAmp,chV3_RwaveAmp,chV3_SwaveAmp,chV3_RPwaveAmp,chV3_SPwaveAmp,chV3_PwaveAmp,chV3_TwaveAmp,chV3_QRSA,chV3_QRSTA,chV4_JJwaveAmp,chV4_QwaveAmp,chV4_RwaveAmp,chV4_SwaveAmp,chV4_RPwaveAmp,chV4_SPwaveAmp,chV4_PwaveAmp,chV4_TwaveAmp,chV4
_QRSA,chV4_QRSTA,chV5_JJwaveAmp,chV5_QwaveAmp,chV5_RwaveAmp,chV5_SwaveAmp,chV5_RPwaveAmp,chV5_SPwaveAmp,chV5_PwaveAmp,chV5_TwaveAmp,chV5_QRSA,chV5_QRSTA,chV6_JJwaveAmp,chV6_QwaveAmp,chV6_RwaveAmp,chV6_SwaveAmp,chV6_RPwaveAmp,chV6_SPwaveAmp,chV6_PwaveAmp,chV6_TwaveAmp,chV6_QRSA,chV6_QRSTA,class
0,75,0,190,80,91,193,371,174,121,-16,70,78,11,69,29,0,52,44,0,0,32,0,0,0,0,0,0,0,44,20,36,0,28,0,0,0,0,0,0,52,40,0,0,0,60,0,0,0,0,0,0,52,0,0,0,0,0,0,0,0,0,0,0,0,56,36,0,0,32,0,0,0,0,0,0,48,32,0,0,0,56,0,0,0,0,0,0,80,0,0,0,0,0,0,0,0,0,0,0,0,40,52,0,0,28,0,0,0,0,0,0,0,48,48,0,0,32,0,0,0,0,0,0,0,52,52,0,0,36,0,0,0,0,0,0,0,52,48,0,0,32,0,0,0,0,0,0,0,56,44,0,0,32,0,0,0,0,0,0,-0.2,0.0,6.1,-1.0,0.0,0.0,0.6,2.1,13.6,30.8,0.0,0.0,1.7,-1.0,0.6,0.0,1.3,1.5,3.7,14.5,0.1,-5.2,1.4,0.0,0.0,0.0,0.8,-0.6,-10.7,-15.6,0.4,-3.9,0.0,0.0,0.0,0.0,-0.8,-1.7,-10.1,-22.0,0.0,0.0,5.7,-1.0,0.0,0.0,-0.1,1.2,14.1,22.5,0.0,-2.5,0.8,0.0,0.0,0.0,1.0,0.4,-4.8,-2.7,0.1,-6.0,0.0,0.0,0.0,0.0,-0.8,-0.6,-24.0,-29.7,0.0,0.0,2.0,-6.4,0.0,0.0,0.2,2.9,-12.6,15.2,-0.1,0.0,8.4,-10.0,0.0,0.0,0.6,5.9,-3.9,52.7,-0.3,0.0,15.2,-8.4,0.0,0.0,0.9,5.1,17.7,70.7,-0.4,0.0,13.5,-4.0,0.0,0.0,0.9,3.9,25.5,62.9,-0.3,0.0,9.0,-0.9,0.0,0.0,0.9,2.9,23.3,49.4,0
1,56,1,165,64,81,174,401,149,39,25,113,3,69,69,19,0,48,0,0,0,24,0,0,0,0,0,0,0,64,0,0,0,24,0,0,0,0,0,0,32,24,0,0,0,40,0,0,0,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,0,44,20,0,0,24,0,0,0,0,0,0,0,60,0,0,0,20,0,0,0,0,0,0,0,24,52,0,0,16,0,0,0,0,0,0,0,32,52,0,0,20,0,0,0,0,0,0,0,44,48,0,0,32,0,0,0,0,0,0,0,48,44,0,0,32,0,0,0,0,0,0,0,48,40,0,0,28,0,0,0,0,0,0,0,48,0,0,0,28,0,0,0,0,0,0,-0.6,0.0,7.2,0.0,0.0,0.0,0.4,1.5,17.2,26.5,0.0,0.0,5.5,0.0,0.0,0.0,0.1,1.7,17.6,29.5,0.3,-1.6,0.9,0.0,0.0,0.0,-0.3,0.4,-1.5,1.3,0.1,-6.4,0.0,0.0,0.0,0.0,-0.3,-1.6,-15.3,-25.5,-0.3,0.0,4.2,-0.9,0.0,0.0,0.4,0.7,8.3,12.3,0.2,0.0,2.2,0.0,0.0,0.0,-0.2,0.8,6.6,11.7,0.4,0.0,1.0,-8.8,0.0,0.0,0.5,-0.6,-21.6,-26.8,0.4,0.0,2.6,-7.9,0.0,0.0,0.8,2.0,-16.4,1.2,0.0,0.0,5.8,-7.7,0.0,0.0,0.9,3.8,-5.7,27.7,-0.2,0.0,9.5,-5.0,0.0,0.0,0.5,2.6,11.8,34.6,-0.4,0.0,11.0,-2.4,0.0,0.0,0.4,2.6,21.6,43.4,-0.5,0.0,8.5,0.0,0.0,0.0,0.2,2.1,20.4,38.8,0
2,54,0,172,95,138,163,386,185,102,96,110,85,106,61,41,0,40,80,0,0,24,0,0,0,0,0,0,20,56,52,0,0,40,0,0,0,0,0,0,28,116,0,0,0,52,0,0,0,0,0,0,52,64,0,0,0,88,0,0,0,0,0,0,0,36,92,0,0,24,0,0,0,0,0,0,0,128,0,0,0,24,0,1,0,0,0,0,0,24,36,76,0,100,0,0,0,0,0,0,0,40,28,60,0,96,0,0,0,0,0,0,0,48,20,56,24,32,0,0,0,0,0,0,0,44,88,0,0,28,0,0,0,0,0,0,0,44,76,0,0,28,0,0,0,0,0,0,0,44,72,0,0,24,0,0,0,0,0,0,1.0,0.0,4.5,-2.8,0.0,0.0,0.3,2.5,-2.2,19.8,0.8,-0.4,6.4,-1.3,0.0,0.0,0.7,2.7,14.2,37.9,-0.2,-0.6,4.4,0.0,0.0,0.0,0.5,0.2,24.7,26.2,-1.0,-5.3,1.8,0.0,0.0,0.0,-0.5,-2.5,-8.0,-28.5,0.5,0.0,1.7,-2.7,0.0,0.0,-0.2,1.0,-9.4,-1.2,0.4,0.0,4.9,0.0,0.0,0.0,0.6,1.4,31.3,42.7,-0.8,0.0,0.7,-3.8,6.5,0.0,0.3,-3.3,18.7,-13.6,-0.9,0.0,2.2,-4.1,7.4,0.0,0.5,-2.4,20.9,-2.6,0.0,0.0,5.8,-4.1,4.0,-0.5,0.4,0.3,20.4,23.3,0.7,0.0,10.0,-5.7,0.0,0.0,0.5,2.2,-3.0,20.7,1.3,0.0,11.1,-3.4,0.0,0.0,0.4,3.4,11.5,48.2,0.9,0.0,9.5,-2.4,0.0,0.0,0.3,3.4,12.3,49.0,0
3,55,0,175,94,100,202,380,179,143,28,58,10,57,69,37,0,72,20,0,0,48,0,0,0,0,0,0,0,64,36,0,0,36,0,0,0,0,0,0,20,52,48,0,0,56,0,0,0,0,0,0,64,32,0,0,0,72,0,0,0,0,0,0,0,60,12,0,0,44,0,0,0,0,0,0,0,60,44,0,0,32,0,0,0,0,0,0,56,0,0,0,0,0,0,0,0,0,0,0,0,40,44,0,0,20,0,0,0,0,0,0,0,52,40,0,0,32,0,0,0,0,0,0,0,56,48,0,0,36,0,0,0,0,0,0,0,60,48,0,0,36,0,0,0,0,0,0,0,64,40,0,0,40,0,0,0,0,0,0,0.9,0.0,7.8,-0.7,0.0,0.0,1.1,1.9,27.3,45.1,0.1,0.0,9.1,-2.6,0.0,0.0,0.4,1.5,24.5,36.8,-0.4,-0.4,1.6,-2.2,0.0,0.0,-1.0,-0.9,-1.5,-9.2,-0.4,-8.2,1.8,0.0,0.0,0.0,-0.7,-1.7,-23.4,-35.6,0.9,0.0,3.2,-0.4,0.0,0.0,0.7,1.2,9.4,18.0,-0.1,0.0,5.1,-2.5,0.0,0.0,0.3,0.6,9.8,12.6,1.6,-6.5,0.0,0.0,0.0,0.0,-0.4,-0.4,-18.2,-22.4,2.1,0.0,1.2,-6.9,0.0,0.0,-0.5,2.9,-12.7,18.0,0.7,0.0,9.0,-7.9,0.0,0.0,0.1,4.1,7.6,51.0,0.4,0.0,15.0,-5.5,0.0,0.0,0.1,3.3,28.8,63.1,0.1,0.0,15.2,-3.7,0.0,0.0,0.6,3.0,36.8,68.0,0.1,0.0,12.2,-2.2,0.0,0.0,0.4,2.6,34.6,61.6,1
4,75,0,190,80,88,181,360,177,103,-16,70,75,67,69,63,0,48,40,0,0,28,0,0,0,0,0,0,0,40,24,0,0,24,0,0,0,0,0,0,52,36,0,0,0,60,0,0,0,0,0,0,48,28,0,0,0,56,0,0,0,0,0,0,0,48,36,0,0,28,0,0,0,0,0,0,44,0,0,0,0,0,0,0,0,0,0,0,88,0,0,0,0,0,0,0,0,0,0,0,0,40,52,0,0,28,0,0,0,0,0,0,0,48,48,0,0,32,0,0,0,0,0,0,0,48,52,0,0,32,0,0,0,0,0,0,0,52,44,0,0,28,0,0,0,0,0,0,0,52,48,0,0,32,0,0,0,0,0,0,0.0,0.0,5.2,-1.4,0.0,0.0,0.9,2.3,9.6,31.6,0.1,0.0,1.6,-0.5,0.0,0.0,1.9,1.7,2.6,18.9,0.2,-3.8,1.2,0.0,0.0,0.0,1.0,-0.6,-7.7,-13.4,-0.1,-3.4,0.8,0.0,0.0,0.0,-1.4,-1.5,-7.0,-17.8,-0.1,0.0,4.4,-1.3,0.0,0.0,-0.1,1.1,8.2,16.5,0.6,-1.6,0.0,0.0,0.0,0.0,1.4,0.3,-3.5,-1.9,0.0,-5.7,0.0,0.0,0.0,0.0,-0.4,-0.5,-25.0,-30.0,-0.2,0.0,1.6,-6.0,0.0,0.0,-0.7,2.1,-12.4,8.6,-0.5,0.0,8.5,-10.2,0.0,0.0,-1.0,4.7,-4.0,43.0,-0.2,0.0,15.2,-7.8,0.0,0.0,-0.1,4.9,16.2,63.2,-0.2,0.0,9.1,-0.9,0.0,0.0,-0.2,2.9,21.7,48.9,-0.4,0.0,13.1,-3.6,0.0,0.0,-0.1,3.9,25.4,62.8,0


In [18]:
# Separate the arrhythmia features from the class label and confirm their shapes
X, y = build_data(dataframe=arrh)
(X.shape, y.shape)

((452, 279), (452,))

In [19]:
# Names of the numeric feature columns whose minimum is below zero.
# The per-column minimum is computed once with min() instead of building the
# full describe() summary table twice just to read its "min" row.
column_minimums = X.min(numeric_only=True)
columns_with_negative_values = column_minimums[column_minimums < 0].index.tolist()
len(columns_with_negative_values)

93

In [20]:
# Replace negative numbers by zero (presumably because the chi2 feature
# selection applied later requires non-negative inputs).
# clip() returns a new frame, so this no longer assigns through a boolean
# mask into a slice taken from `arrh`, and re-running the cell is harmless.
X = X.clip(lower=0)

In [21]:
# Split the data into train and test, with 25% as test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

X_train.shape, y_train.shape

((339, 279), (339,))

In [22]:
# Short alias for the random forest class, matching the name used in the exercise text
rfc = RandomForestClassifier

In [23]:
# Find the best value for max_depth among values 2, 5 and 10.
# cv is set explicitly, like the other grid searches in this notebook:
# scikit-learn 0.22 changed the default from 3 to 5 folds, so relying on the
# default makes the selected depth depend on the installed version.
grid_search = GridSearchCV(rfc(random_state=1), param_grid={'max_depth': [2, 5, 10]}, cv=3)
best_value = grid_search.fit(X_train, y_train).best_params_['max_depth']

best_value

10

In [24]:
# Train a forest on the training split at the depth chosen by the grid search
clf = rfc(max_depth=best_value, random_state=1)
clf = clf.fit(X_train, y_train)

In [25]:
# Score every training feature against the label with chi2; keep the 100 best
vt = SelectKBest(score_func=chi2, k=100)
vt.fit(X_train, y_train)

# Keep only the selected columns of the training features
X_train_reduced = vt.transform(X_train)

X_train_reduced.shape

(339, 100)

In [26]:
# Inspect the reduced training matrix (NumPy abbreviates the printed rows and columns)
X_train_reduced

array([[160. ,  84. , 186. , ...,   3. ,  24. ,  52.8],
       [175. ,  96. , 141. , ...,   3.4,  24.8,  57.4],
       [170. , 113. , 216. , ...,   3.2,  33. ,  61.8],
       ...,
       [158. ,  82. , 122. , ...,   2.5,  17.6,  40.6],
       [173. , 103. , 155. , ...,   0. ,  26.1,   8.5],
       [164. ,  85. , 200. , ...,   4.3,  26.5,  67.7]])

## Chapter 2

### Is the source or the destination bad?
In the previous lesson, you used the destination computer as your entity of interest. However, your cybersecurity analyst just told you that it is the infected machines that generate the bad traffic, and will therefore appear as a source, not a destination, in the flows dataset.

The data `flows` has been preloaded, as well as the set `bads` of infected IDs and the feature extractor `featurize()` from the previous lesson. You also have numpy available as np, `AdaBoostClassifier()`, and `cross_val_score()`.

In [27]:
# Read the network flow records and print each column's dtype and non-null count
flows = pd.read_csv(filepath_or_buffer="data/lanl_flows.csv")
flows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
time                    10000 non-null int64
duration                10000 non-null int64
source_computer         10000 non-null object
source_port             10000 non-null object
destination_computer    10000 non-null object
destination_port        10000 non-null object
protocol                10000 non-null int64
packet_count            10000 non-null int64
byte_count              10000 non-null int64
dtypes: int64(5), object(4)
memory usage: 703.2+ KB


In [28]:
# Preview the first rows of the flow records
flows.head()

Unnamed: 0,time,duration,source_computer,source_port,destination_computer,destination_port,protocol,packet_count,byte_count
0,471692,0,C5808,N24128,C26871,N17023,6,1,60
1,471692,0,C5808,N2414,C26871,N19148,6,1,60
2,471692,0,C5808,N24156,C26871,N8001,6,1,60
3,471692,0,C5808,N24161,C26871,N18502,6,1,60
4,471692,0,C5808,N24162,C26871,N11309,6,1,60


In [29]:
def featurize(df):
    """Summarise a group of flow records as a dict of three features.

    Parameters
    ----------
    df : pandas.DataFrame
        Flow rows with ``destination_port``, ``packet_count`` and
        ``duration`` columns.

    Returns
    -------
    dict
        ``unique_ports`` (number of distinct destination ports),
        ``average_packet`` (mean packet count) and ``average_duration``
        (mean duration), in that order.
    """
    distinct_ports = set(df['destination_port'])
    mean_packets = np.mean(df['packet_count'])
    mean_duration = np.mean(df['duration'])
    return dict(
        unique_ports=len(distinct_ports),
        average_packet=mean_packets,
        average_duration=mean_duration,
    )

In [30]:
# IDs of the computers known to be infected; membership in this set is what
# later labels a source computer as bad
bads = {'C1', 'C10', 'C10005', 'C1003', 'C1006', 'C1014', 'C1015', 'C102', 'C1022', 'C1028', 'C10405', 'C1042', 'C1046', 'C10577', 'C1065', 'C108', 'C10817', 'C1085', 'C1089', 'C1096', 'C11039', 'C11178', 'C1119', 'C11194', 'C1124', 'C1125', 'C113', 'C115', 'C11727', 'C1173', 'C1183', 'C1191', 'C12116', 'C1215', 'C1222', 'C1224', 'C12320', 'C12448', 'C12512', 'C126', 'C1268', 'C12682', 'C1269', 'C1275', 'C1302', 'C1319', 'C13713', 'C1382', 'C1415', 'C143', 'C1432', 'C1438', 'C1448', 'C1461', 'C1477', 'C1479', 'C148', 'C1482', 'C1484', 'C1493', 'C15', 'C1500', 'C1503', 'C1506', 'C1509', 'C15197', 'C152', 'C15232', 'C1549', 'C155', 'C1555', 'C1567', 'C1570', 'C1581', 'C16088', 'C1610', 'C1611', 'C1616', 'C1626', 'C1632', 'C16401', 'C16467', 'C16563', 'C1710', 'C1732', 'C1737', 'C17425', 'C17600', 'C17636', 'C17640', 'C17693', 'C177', 'C1776', 'C17776', 'C17806', 'C1784', 'C17860', 'C1797', 'C18025', 'C1810', 'C18113', 'C18190', 'C1823', 'C18464', 'C18626', 'C1887', 'C18872', 'C19038', 'C1906', 'C19156', 'C19356', 'C1936', 'C1944', 'C19444', 'C1952', 'C1961', 'C1964', 'C1966', 'C1980', 'C19803', 'C19932', 'C2012', 'C2013', 'C20203', 'C20455', 'C2057', 'C2058', 'C20677', 'C2079', 'C20819', 'C2085', 'C2091', 'C20966', 'C21349', 'C21664', 'C21814', 'C21919', 'C21946', 'C2196', 'C21963', 'C22174', 'C22176', 'C22275', 'C22409', 'C2254', 'C22766', 'C231', 'C2341', 'C2378', 'C2388', 'C243', 'C246', 'C2519', 'C2578', 'C2597', 'C2604', 'C2609', 'C2648', 'C2669', 'C2725', 'C2816', 'C2844', 'C2846', 'C2849', 'C2877', 'C2914', 'C294', 'C2944', 'C3019', 'C302', 'C3037', 'C305', 'C306', 'C307', 'C313', 'C3153', 'C3170', 'C3173', 'C3199', 'C3249', 'C3288', 'C3292', 'C3303', 'C3305', 'C332', 'C338', 'C3380', 'C3388', 'C3422', 'C3435', 'C3437', 'C3455', 'C346', 'C3491', 'C3521', 'C353', 'C3586', 'C359', 'C3597', 'C3601', 'C3610', 'C3629', 'C3635', 'C366', 'C368', 'C3699', 'C370', 'C3755', 'C3758', 'C3813', 'C385', 'C3888', 'C395', 'C398', 'C400', 'C4106', 'C4159', 'C4161', 'C42', 
'C423', 'C4280', 'C429', 'C430', 'C4403', 'C452', 'C4554', 'C457', 'C458', 'C46', 'C4610', 'C464', 'C467', 'C477', 'C4773', 'C4845', 'C486', 'C492', 'C4934', 'C5030', 'C504', 'C506', 'C5111', 'C513', 'C52', 'C528', 'C529', 'C5343', 'C5439', 'C5453', 'C553', 'C5618', 'C5653', 'C5693', 'C583', 'C586', 'C61', 'C612', 'C625', 'C626', 'C633', 'C636', 'C6487', 'C6513', 'C685', 'C687', 'C706', 'C7131', 'C721', 'C728', 'C742', 'C7464', 'C7503', 'C754', 'C7597', 'C765', 'C7782', 'C779', 'C78', 'C791', 'C798', 'C801', 'C8172', 'C8209', 'C828', 'C849', 'C8490', 'C853', 'C8585', 'C8751', 'C881', 'C882', 'C883', 'C886', 'C89', 'C90', 'C9006', 'C917', 'C92', 'C923', 'C96', 'C965', 'C9692', 'C9723', 'C977', 'C9945'}
len(bads)

305

In [31]:
# Build one feature dict per source computer by featurizing its group of flows
out = flows.groupby(by='source_computer').apply(featurize)

out.head(10)

source_computer
C10                                   {'unique_ports': 4, 'average_packet': 222.0, 'average_duration': 5.0}
C10026                                {'unique_ports': 2, 'average_packet': 21.0, 'average_duration': 39.0}
C10047     {'unique_ports': 5, 'average_packet': 21.076923076923077, 'average_duration': 7.538461538461538}
C1015     {'unique_ports': 35, 'average_packet': 5.371428571428571, 'average_duration': 27.571428571428573}
C10235                                 {'unique_ports': 1, 'average_packet': 11.0, 'average_duration': 0.0}
C10297                                 {'unique_ports': 1, 'average_packet': 9.0, 'average_duration': 11.0}
C10326                                  {'unique_ports': 1, 'average_packet': 4.8, 'average_duration': 0.2}
C10328                                  {'unique_ports': 1, 'average_packet': 2.0, 'average_duration': 0.0}
C10366                                 {'unique_ports': 1, 'average_packet': 23.0, 'average_duration': 1.0}
C10380      

In [32]:
# Expand the per-source feature dicts into a frame with one column per feature
X = pd.DataFrame(list(out), index=out.index)

# Check which sources in X.index are bad to create labels
y = [x in bads for x in X.index]

# Report the average accuracy of Adaboost over 3-fold CV.
# cv=3 is passed explicitly so the fold count matches this comment on every
# scikit-learn version (the default changed from 3 to 5 folds in 0.22), and
# the estimator is seeded so the reported score is repeatable.
print(np.mean(cross_val_score(AdaBoostClassifier(random_state=1), X, y, cv=3)))

0.9361199939089387


### Feature engineering on grouped data
You will now build on the previous exercise, by considering one additional feature: the number of unique protocols used by each source computer. Note that with grouped data, it is always possible to construct features in this manner: you can take the number of unique elements of all categorical columns, and the mean of all numeric columns as your starting point. As before, you have flows preloaded, `cross_val_score()` for measuring accuracy, `AdaBoostClassifier()`, pandas as pd and numpy as np.

In [33]:
# Number of distinct protocols seen in each source computer's flows
protocols = flows.groupby('source_computer').apply(
    lambda source_flows: len(set(source_flows['protocol']))
)

# Turn the per-source counts into a one-column frame named 'protocol'
protocols_DF = protocols.to_frame(name='protocol')

protocols_DF.head()

Unnamed: 0_level_0,protocol
source_computer,Unnamed: 1_level_1
C10,1
C10026,1
C10047,2
C1015,1
C10235,1


In [34]:
# Now concatenate this feature with the previous dataset, X
X_more = pd.concat([X, protocols_DF], axis=1)

# Refit the classifier and report its accuracy.
# cv=3 and a fixed seed keep this score comparable with the earlier 3-fold
# AdaBoost score and independent of scikit-learn's version-dependent default
# fold count (3 before 0.22, 5 afterwards).
print(np.mean(cross_val_score(AdaBoostClassifier(random_state=1), X_more, y, cv=3)))

0.9377950357849856
