# Microsoft Malware detection

In this kernel, I build a LGBM model using only a subset of the training data, in order to fit in memory.

## Notebook  Content
1. [Utility functions](#0)
1. [Loading the data](#1) <br>
    2.1 [Get the files and select the variables](#2.1) <br>
    2.2 [Define the type of each variable](#2.2)
1. [Feature engineering](#2) <br>
    3.1 [Frequency encoding](#3.1) <br>
    3.2 [Label encoding](#3.2)
1. [Training the model](#3)
1. [Feature importance](#4)
1. [Getting the predictions to a csv file](#5)

<a id="0"></a> <br>
## 1. Utility functions
Before starting, we define a utility function that helps manage memory.

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Each int16/int32/int64/float16/float32/float64 column is converted to the
    narrowest dtype of the same kind whose representable range contains the
    column's minimum and maximum. Other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: the float width is chosen from the value range only, so
            # float16/float32 may round values that need more precision.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out-of-range, infinite or all-NaN columns stay at full width.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

<a id="1"></a> <br>
## 2. Loading the data

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
import warnings
import gc
import time
import sys
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn import metrics
# Plotly library
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected=True)
pd.set_option('display.max_columns', 500)

<a id="2.1"></a> <br>
### 2.1 Get the files and select the variables
We set the type of each field in the train set in order to reduce memory usage:

In [None]:
# Column name -> dtype string handed to pd.read_csv(dtype=...) when the train
# and test files are loaded. Columns typed 'category' are treated as
# non-numeric by the census below; everything else is an int/float width.
# NOTE(review): the integer/float widths are fixed by hand — verify each one
# covers the real value range of its column (e.g. 'int8' only holds -128..127).
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

First, we make a census of the variables, by type, and define the set we want to keep before reading the data:

In [None]:
# Split the declared columns into numeric and non-numeric groups in a single
# pass over ``dtypes``, keeping the declaration order within each group.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns, categorical_columns = [], []
for column_name, declared_type in dtypes.items():
    if declared_type in numerics:
        numerical_columns.append(column_name)
    else:
        categorical_columns.append(column_name)

We read the data, limiting both the training and the test set to the first `nrows` rows (1,000,000 here):

In [None]:
# Number of leading rows read from each CSV; kept small so that both frames
# fit in memory.
#nrows = 9000000
nrows = 1000000
# Register the tqdm-powered ``progress_apply`` methods on pandas objects.
# The former identity ``progress_apply(lambda x:x)`` calls were dropped: they
# only copied each whole frame (and rendered the test frame as cell output).
tqdm.pandas()
#_______________________________________________________________________________
# ``dtypes`` already declares both 'MachineIdentifier' and 'HasDetections',
# so this list covers every training column exactly once.
retained_columns = numerical_columns + categorical_columns
train = pd.read_csv('../input/microsoft-malware-prediction/train.csv',
                    nrows = nrows,
                    usecols = retained_columns,
                    dtype = dtypes)
#_______________________________________________________________
# The target column is not requested from the test file.
retained_columns = [c for c in retained_columns if c != 'HasDetections']
test = pd.read_csv('../input/microsoft-malware-prediction/test.csv',
                   usecols = retained_columns,
                   dtype = dtypes,nrows=nrows)

In [None]:
# Show the first rows of the training frame as a sanity check of the load.
train.head()

In [None]:
# Print the per-column dtypes and the memory footprint of the training frame.
train.info()

To visualise the amount of missing data in the train and test sets, we plot, for each feature, the fraction of rows that are not NaN. LightGBM can treat missing values as missing, so this is not required for training; we do it only to get a sense of how much data is missing.

In [None]:
def avail(a,b):
    """Plot the fraction of non-missing values per feature, train next to test.

    Features are ordered by increasing availability in ``train`` (the target
    'HasDetections' is excluded) and the half-open slice ``[a:b]`` of that
    ordering is drawn.

    Parameters
    ----------
    a, b : int
        Start (inclusive) and stop (exclusive) positions in the sorted list of
        features.
    """
    train_avail = (train.count() / len(train)).drop('HasDetections').sort_values()
    train_data = train_avail.iloc[a:b]
    # Look the test availability up by feature name so that each pair of bars
    # describes the same column (sorting test on its own would misalign them).
    test_data = (test.count() / len(test)).reindex(train_data.index)
    ind = np.arange(len(train_data))

    width = 0.35
    fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
    tr = axes.bar(ind, train_data.values, width, color='royalblue')
    tt = axes.bar(ind+width, test_data.values, width, color='seagreen')

    axes.set_ylabel('Amount of data available')
    axes.set_xticks(ind + width / 2)
    axes.tick_params(axis='x')
    axes.tick_params(axis='y')
    # Label every tick with the feature actually plotted at that position.
    axes.set_xticklabels(train_data.index, rotation=40)
    axes.legend([tr, tt], ['Train', 'Test'])

In [None]:
# Sorted-feature positions 0-9 (the stop index is exclusive, so use 10 to
# avoid skipping position 9).
avail(0,10)

In [None]:
# Sorted-feature positions 10-19 (the stop index is exclusive, so use 20 to
# avoid skipping position 19).
avail(10,20)

In [None]:
# Sorted-feature positions 20-29 (the stop index is exclusive, so use 30 to
# avoid skipping position 29).
avail(20,30)

In [None]:
# Sorted-feature positions 30-39 (the stop index is exclusive). The remaining
# windows are left disabled; enable them to plot the other features.
avail(30,40)
#avail(40,50)
#avail(50,60)
#avail(60,70)


<a id="2.2"></a> <br>
### 2.2 Define the type of each variable
In practice, many of the numerical variables are in fact identifiers. *In the current dataset, the truly numerical variables are rare*. Below, I list the variables that are truly numerical, according to the description of the data.

In [None]:
# Hand-picked columns treated as genuine quantities rather than identifiers.
# Every column that is neither listed here nor binary is handled as
# categorical further down.
true_numerical_columns = [
    'Census_ProcessorCoreCount',
    'Census_PrimaryDiskTotalCapacity',
    'Census_SystemVolumeTotalCapacity',
    'Census_TotalPhysicalRAM',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalPrimaryDisplayResolutionVertical',
    'Census_InternalBatteryNumberOfCharges'
]

We also list binary variables, since they can be treated as numericals by tree methods:

In [None]:
# Columns of ``train`` with exactly two distinct non-missing values, in
# column order.
distinct_counts = train.nunique()
binary_variables = distinct_counts.index[distinct_counts == 2].tolist()

to finally make a census of the categorical variables:

In [None]:
# Whatever is neither truly numerical nor binary is treated as categorical.
categorical_columns = []
for c in train.columns:
    if c in true_numerical_columns or c in binary_variables:
        continue
    categorical_columns.append(c)

In [None]:
# Pie chart of how many columns fall into each of the three variable groups.
variables = dict(
    categorical_columns=len(categorical_columns),
    binary_variables=len(binary_variables),
    true_numerical_columns=len(true_numerical_columns),
)
pie_trace = go.Pie(labels=list(variables), values=[variables[k] for k in variables])
layout = {'title': "Variable types", 'height': 400, 'width': 800}
fig = {'data': [pie_trace], 'layout': layout}
iplot(fig)

Most of the current variables are categories and we need to choose a method to treat them. **Depending on the cardinality of each variable**, we can opt for **one-hot-encoding, frequency or target encoding**. In the particular case of LightGBM, we can also use the **built-in LGBM treatment of categoricals**:

In [None]:
def card():
    """Draw a horizontal bar chart of the number of distinct values per categorical column.

    Reads the module-level ``train`` frame and ``categorical_columns`` list;
    'MachineIdentifier' is left out. Bars are ordered by increasing
    cardinality.
    """
    pairs = sorted(
        ([name, train[name].nunique()]
         for name in categorical_columns
         if name != 'MachineIdentifier'),
        key=lambda pair: pair[1],
    )
    names = [pair[0] for pair in pairs]
    counts = [pair[1] for pair in pairs]

    bars = go.Bar(y=names, x=counts, orientation='h',
                  marker=dict(color='rgb(49,130,189)'), name='train')

    x_axis = dict(
        title='Number of categories',
        titlefont=dict(size=16, color='rgb(107, 107, 107)'),
        domain=[0.25, 1],
    )
    chart_layout = go.Layout(
        title='Categorical cardinality', height=1600, width=800,
        xaxis=x_axis,
        barmode='group',
        bargap=0.1,
        bargroupgap=0.1,
    )

    iplot(go.Figure(data=[bars], layout=chart_layout))

In [None]:
# Cardinality of the categorical columns before any encoding.
card()

<a id="2"></a> <br>
## 3. Feature Engineering
<a id="3.1"></a> <br>
### 3.1 Frequency encoding
For variables with large cardinality, an efficient encoding consists in ranking the categories with respect to their frequencies. These variables are then treated as numerical.

In [None]:
def frequency_encoding(variable):
    """Build a value -> frequency-rank mapping for one column.

    The counts are taken over ``train[variable]`` and ``test[variable]``
    pooled together (missing values are not counted). The most frequent value
    gets rank 0, the next one rank 1, and so on; every value seen exactly once
    shares a single label equal to the highest remaining rank plus one (0 when
    every value is a singleton).

    Parameters
    ----------
    variable : str
        Name of a column present in both module-level frames ``train`` and
        ``test``.

    Returns
    -------
    dict
        Maps each observed value to its label (a float).
    """
    # Work directly on the value_counts Series instead of reset_index(): the
    # column names produced by reset_index() changed in pandas 2.0.
    counts = pd.concat([train[variable], test[variable]]).value_counts()
    ranks = np.arange(len(counts), dtype=np.float64)
    is_singleton = counts.values == 1
    frequent_ranks = ranks[~is_singleton]
    shared_label = frequent_ranks.max() + 1 if frequent_ranks.size else 0.0
    labels = np.where(is_singleton, shared_label, ranks)
    return dict(zip(counts.index, labels.tolist()))

In [None]:
# Columns that the loop in the next cell overwrites in place with their
# frequency rank (see frequency_encoding) and removes from
# categorical_columns.
frequency_encoded_variables = [
    'Census_OEMModelIdentifier',
    'CityIdentifier',
    'Census_FirmwareVersionIdentifier',
    'AvSigVersion',
    'Census_ProcessorModelIdentifier',
    'Census_OEMNameIdentifier',
    'DefaultBrowsersIdentifier'
]

In [None]:
# Replace each listed column by its frequency rank in both frames; values
# absent from the mapping (e.g. missing values) become NaN. An encoded column
# is no longer treated as categorical.
for variable in tqdm(frequency_encoded_variables):
    freq_enc_dict = frequency_encoding(variable)
    for frame in (train, test):
        frame[variable] = frame[variable].map(lambda value: freq_enc_dict.get(value, np.nan))
    categorical_columns.remove(variable)

In [None]:
# Cardinality of the columns that are still categorical after frequency encoding.
card()

We will try reducing the cardinality further by defining following function and calling it for the feature which we want to reduce the cardinality for

In [None]:
# REDUCE CATEGORY CARDINALITY
def relax_data(df_train, df_test, col):
    """Merge rare or train/test-unbalanced categories of ``col`` into one code.

    A category is collapsed into the shared bucket when any of these holds:

    * it occurs in fewer than ``len(df_train) / 10000`` training rows;
    * its training count, rescaled by ``len(df_test) / len(df_train)``, is
      less than a third of its test count;
    * that rescaled training count is more than three times its test count.

    Collapsed categories take the value 0 (so an original category equal to 0
    shares the bucket); the surviving values are then replaced by their
    position in sorted order. Both frames are modified in place; values that
    are missing stay missing.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that both contain ``col``.
    col : str
        Column to recode.
    """
    # Align the two count Series on the category value. Building the table
    # this way does not depend on the column names value_counts().reset_index()
    # produces, which changed in pandas 2.0.
    counts = pd.DataFrame({
        'train': df_train[col].value_counts(),
        'test': df_test[col].value_counts(),
    }).fillna(0)
    factor = len(df_test)/len(df_train)
    remove = (
        (counts['train'] < len(df_train)/10000)
        | (factor*counts['train'] < counts['test']/3)
        | (factor*counts['train'] > 3*counts['test'])
    )
    kept_values = np.where(remove.values, 0, counts.index)
    codes, _ = pd.factorize(kept_values, sort=True)
    cc = dict(zip(counts.index, codes.tolist()))
    df_train[col] = df_train[col].map(cc)
    df_test[col] = df_test[col].map(cc)

In [None]:
# Collapse the rare / unbalanced categories of AVProductStatesIdentifier in both frames.
relax_data(train,test,'AVProductStatesIdentifier')  

Plot cardinality once again

In [None]:
# Cardinality after reducing AVProductStatesIdentifier.
card()

<a id="3.2"></a> <br>
### 3.1 Feature Engineering

AppVersion2 indicates whether your Windows Defender is up to date. This is the second number from AppVersion

Downloaded timestamps from Microsoft and created this python dictionary that maps Microsoft's AvSigVersion (str) to a timestamp (datetime.datetime)

In [None]:
#AS timestamp
# The lookup table is a dict stored inside a .npy file, i.e. a pickled object
# array; NumPy refuses to load those unless allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
datedictAS = np.load('../input/avsigversiontimestamps/AvSigVersionTimestamps.npy',
                     allow_pickle=True)[()]
# 'AvSigVersion' has already been overwritten with frequency ranks in
# section 3.1, while datedictAS is keyed by the raw version strings. Re-read
# the raw column for the same leading rows so the lookup can succeed.
raw_av_train = pd.read_csv('../input/microsoft-malware-prediction/train.csv',
                           usecols=['AvSigVersion'], dtype=str,
                           nrows=len(train))['AvSigVersion']
raw_av_test = pd.read_csv('../input/microsoft-malware-prediction/test.csv',
                          usecols=['AvSigVersion'], dtype=str,
                          nrows=len(test))['AvSigVersion']
# .values: assign by position rather than by index label.
train['DateAS'] = raw_av_train.map(datedictAS).values
test['DateAS'] = raw_av_test.map(datedictAS).values
del raw_av_train, raw_av_test


In [None]:
#Engineered feature 'AppVersion2'
# Second dot-separated component of AppVersion, as an integer. The builtin
# int is used because the np.int alias no longer exists in current NumPy.
train['AppVersion2'] = train['AppVersion'].map(lambda x: int(x.split('.')[1]))
test['AppVersion2'] = test['AppVersion'].map(lambda x: int(x.split('.')[1]))


Lag1 is the difference between AvSigVersion_Date and Census_OSVersion_Date. Since AvSigVersion is the virus definitions for Windows Defender, this variable indicates whether Windows Defender is out-of-date by comparing its last install with the date of the operating system. Out-of-date antivirus indicates that a user either has better antivirus or doesn't use their computer often. In either case, they have fewer HasDetections.

In [None]:
# OS timestamp
# The lookup table is a dict stored inside a .npy file, i.e. a pickled object
# array; NumPy refuses to load those unless allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
datedictOS = np.load('../input/osversiontimestamps/OSVersionTimestamps.npy',
                     allow_pickle=True)[()]
train['DateOS'] = train['Census_OSVersion'].map(datedictOS)
test['DateOS'] =test['Census_OSVersion'].map(datedictOS)


In [None]:
def _weeks_between(later, earlier):
    """Return ``later - earlier`` element-wise, expressed in whole weeks (days floor-divided by 7)."""
    return (later - earlier).map(lambda delta: delta.days // 7)


# Lag1: weeks between the antivirus-signature date and the OS-version date.
train['Lag1'] = _weeks_between(train['DateAS'], train['DateOS'])
test['Lag1'] = _weeks_between(test['DateAS'], test['DateOS'])

driveA is the ratio of the hard drive partition used for the operating system to the total hard drive capacity. Savvy users install multiple operating systems and have a lower ratio. Savvy users have reduced HasDetections.



In [None]:
# ENGINEERED FEATURE #4
# driveA = system volume capacity / primary disk capacity, computed in
# double precision and stored as float32.
for frame in (train, test):
    system_volume = frame['Census_SystemVolumeTotalCapacity'].astype('float')
    primary_disk = frame['Census_PrimaryDiskTotalCapacity'].astype('float')
    frame['driveA'] = (system_volume / primary_disk).astype('float32')



driveB is the difference between the total hard drive capacity and the partition used for the operating system. Responsible users manage their hard drives well. Responsible users have reduced HasDetections.

In [None]:
# ENGINEERED FEATURE #5
def _capacity_outside_system_volume(frame):
    """Primary disk capacity minus system volume capacity, as float32."""
    primary_disk = frame['Census_PrimaryDiskTotalCapacity'].astype('float')
    system_volume = frame['Census_SystemVolumeTotalCapacity'].astype('float')
    return (primary_disk - system_volume).astype('float32')


train['driveB'] = _capacity_outside_system_volume(train)
test['driveB'] = _capacity_outside_system_volume(test)

In [None]:
# Name lists for the engineered features; neither list is referenced again in
# the visible part of the notebook.
# NOTE(review): 'Lag5' is never created in the visible code — confirm whether
# it is a leftover.
cols6=['Lag1']
cols8=['Lag5','driveB','driveA']

In [None]:

# The intermediate timestamp columns and lookup tables are no longer needed
# once Lag1 exists; drop them and reclaim the memory.
for frame in (train, test):
    frame.drop(columns=['DateAS', 'DateOS'], inplace=True)  # , 'DateBL'
del datedictAS, datedictOS
x = gc.collect()

There is a correlation between the frequency of time variables (EngineVersion, AvSigVersion, AppVersion, Census_OSVersion, Census_OSBuildRevision) and HasDetections.
This occurs for two reasons explained here. One reason is a leak and will lead to overfitting train and public test, because I believe that private test is a random sample and therefore does not benefit from the leak. The second reason is that out-of-date antivirus produces a front tail with both reduced frequency and reduced HasDetections. To take advantage of these two correlations, you need to frequency encode the time variables separately for train and test. We have already done that for AvSigVersion. We will do it for the rest now.

In [None]:
# Remaining version-like columns to frequency encode.
FE = [
    'EngineVersion',
    'AppVersion',
    'Census_OSVersion',
    'Census_OSBuildRevision',
]

# Overwrite each column with its frequency rank in both frames (values absent
# from the mapping become NaN) and stop treating it as categorical.
for variable in tqdm(FE):
    freq_enc_dict = frequency_encoding(variable)
    for frame in (train, test):
        frame[variable] = frame[variable].map(lambda value: freq_enc_dict.get(value, np.nan))
    categorical_columns.remove(variable)

After every variable was changed to dtype integer, each variable was tested one by one to see if making it categorical increases the LGBM validation score. It was determined that the following 6 variables increased the validation score. By making them categorical you prioritize them, so presumably these are important variables. Notice that they cover a variety of flavors of features: two geographical variables, one hardware variable, two software/virus variables, and one miscellaneous.



Every variable was removed one at a time and the validation score was recorded. Validation verified that we can remove the following 18 variables without decreasing model accuracy: MachineIdentifier, ProductName, IsBeta, IsSxsPassiveMode, HasTpm, AutoSampleOptIn, PuaMode, UacLuaenable, Census_DeviceFamily, Census_ProcessorClass, Census_IsPortableOperatingSystem, Census_IsFlightsDisabled, Census_IsVirtualDevice, Census_OSSkuName, OsVer, Census_OSArchitecture, Census_OSInstallLanguageIdentifier, SMode. The first one is obvious. The next 12 have more than 98% of their data in one category value. The next 4 correlate with another variable with r greater than 0.99. And the last variable, SMode, changes its distribution radically over time and leads to decreased CV and LB. The other variables don't decrease validation per se, but they don't increase it either, so we can try running with or without removing them.

<a id="3.2"></a> <br>
### 3.2 Label encoding

In [None]:
# Label-encode every remaining categorical column. The codes follow the order
# of first appearance in train; missing values, and test values never seen in
# train, are encoded as -1.
indexer = {}
for col in tqdm(categorical_columns):
    if col == 'MachineIdentifier': continue
    # factorize already yields the train codes, so train does not need a
    # second lookup pass through the index of unique values.
    train_codes, indexer[col] = pd.factorize(train[col])
    train[col] = train_codes
    test[col] = indexer[col].get_indexer(test[col])

In [None]:
# Downcast the numeric columns of both frames now that all encodings are done.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
# Detach the label column from the feature frame.
target = train.pop('HasDetections')

<a id="3"></a> <br>
## 3. Training the model

In [None]:
# Parameter dict passed unchanged to lgb.train for every fold: a binary
# objective evaluated with AUC, with the bagging seed and random state fixed.
param = {'num_leaves': 60,
         'min_data_in_leaf': 60, 
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.1,
         "boosting": "gbdt",
         "feature_fraction": 0.8,
         "bagging_freq": 1,
         "bagging_fraction": 0.8 ,
         "bagging_seed": 11,
         "metric": 'auc',
         "lambda_l1": 0.1,
         "random_state": 133,
         "verbosity": -1}

We set the maximum number of folds that are actually trained:

In [None]:
# The fold loop below stops after this many folds, even if KFold defines more.
max_iter = 5

In [None]:
# Force a garbage-collection pass before training starts.
gc.collect()

In [None]:
# K-fold training: fit one LightGBM model per fold, keep out-of-fold
# predictions for the CV score, and average the test predictions of the folds
# that were actually run.
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train))
categorical_columns = [c for c in categorical_columns if c not in ['MachineIdentifier']]
features = [c for c in train.columns if c not in ['MachineIdentifier']]
predictions = np.zeros(len(test))
start = time.time()
feature_importance_df = pd.DataFrame()
start_time= time.time()
score = [0 for _ in range(folds.n_splits)]
# Number of folds that will really be trained; used for every average below.
n_folds_run = min(folds.n_splits, max_iter)

# KFold only needs the number of rows, so pass the frame itself instead of
# materialising ``train.values`` (a full copy of the data).
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                           label=target.iloc[trn_idx],
                           categorical_feature = categorical_columns
                          )
    val_data = lgb.Dataset(train.iloc[val_idx][features],
                           label=target.iloc[val_idx],
                           categorical_feature = categorical_columns
                          )

    num_round = 10000
    # NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments
    # were removed from lgb.train in LightGBM 4.0 in favour of callbacks;
    # adapt this call when upgrading the library.
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds = 200)

    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance(importance_type='gain')
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # we perform predictions by chunks
    initial_idx = 0
    chunk_size = 1000000
    current_pred = np.zeros(len(test))
    while initial_idx < test.shape[0]:
        final_idx = min(initial_idx + chunk_size, test.shape[0])
        current_pred[initial_idx:final_idx] = clf.predict(
            test.iloc[initial_idx:final_idx][features],
            num_iteration=clf.best_iteration)
        initial_idx = final_idx
    predictions += current_pred / n_folds_run

    # The elapsed time is converted to hours, so label it accordingly.
    print("time elapsed: {:<5.2}h".format((time.time() - start_time) / 3600))
    score[fold_] = metrics.roc_auc_score(target.iloc[val_idx], oof[val_idx])
    if fold_ == max_iter - 1: break

if folds.n_splits <= max_iter:
    # Every fold ran, so ``oof`` holds a prediction for each training row.
    print("CV score: {:<8.5f}".format(metrics.roc_auc_score(target, oof)))
else:
    # Only the first folds ran: average their individual validation scores.
    print("CV score: {:<8.5f}".format(sum(score) / n_folds_run))

In [None]:
 # Peek at the first five fold-averaged test predictions.
 predictions[0:5]

In [None]:
#convert into binary values
# A prediction of at least .5 becomes 1, anything else 0 (threshold .5).
y_pred = np.where(predictions[:len(test)] >= .5, 1, 0).tolist()

<a id="4"></a> <br>
## 4. Feature importance

In [None]:
# Rank the features by their mean gain over the folds (keeping at most 1000)
# and plot the per-fold importances of the retained ones.
mean_importance = feature_importance_df[["feature", "importance"]].groupby("feature").mean()
cols = mean_importance.sort_values(by="importance", ascending=False)[:1000].index

is_retained = feature_importance_df.feature.isin(cols)
best_features = feature_importance_df.loc[is_retained]
ranked_rows = best_features.sort_values(by="importance", ascending=False)

plt.figure(figsize=(14,25))
sns.barplot(x="importance", y="feature", data=ranked_rows)
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

<a id="5"></a> <br>
## 5. Getting the predictions to a csv file

In [None]:
# Submission frame: one row per test machine with its thresholded prediction.
sub_df = pd.DataFrame({
    "MachineIdentifier": test["MachineIdentifier"].values,
    "HasDetections": y_pred,
})
sub_df.head(10)

In [None]:
# Write the submission file without the row index.
sub_df.to_csv("Predictions.csv", index=False)