In [1]:
import pandas as pd
import numpy as np
import feather

import seaborn as sns
import matplotlib.pyplot as plt
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold,train_test_split

from lightgbm import LGBMClassifier
import xgboost as xgb
from catboost import CatBoostClassifier

In [2]:
# Raise pandas' display limits so wide and long frames are shown without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
# Read the raw train/test frames from their feather files and report their shapes.
train, test = (feather.read_dataframe(path) for path in ('train_raw', 'test_raw'))
print(train.shape, test.shape)

(8921483, 83) (7853253, 82)


In [4]:
# Column names that are removed from both the train and the test frame
# before any encoding or feature engineering takes place.
droppable_features = ['Census_ProcessorClass',
 'Census_IsWIMBootEnabled',
 'IsBeta',
 'Census_IsFlightsDisabled',
 'Census_IsFlightingInternal',
 'AutoSampleOptIn',
 'Census_ThresholdOptIn',
 'SMode',
 'Census_IsPortableOperatingSystem',
 'PuaMode',
 'Census_DeviceFamily',
 'UacLuaenable',
 'Census_IsVirtualDevice',
 'Platform',
 'Census_OSSkuName',
 'Census_OSInstallLanguageIdentifier',
 'Processor']

In [5]:
# Remove the listed columns from both frames, then show the resulting shapes.
train = train.drop(columns=droppable_features)
test = test.drop(columns=droppable_features)
print(train.shape, test.shape)

(8921483, 66) (7853253, 65)


In [6]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,OsVer,OsBuild,OsSuite,OsPlatformSubRelease,OsBuildLab,SkuEdition,IsProtected,IeVerIdentifier,SmartScreen,Firewall,Census_MDC2FormFactor,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorManufacturerIdentifier,Census_ProcessorModelIdentifier,Census_PrimaryDiskTotalCapacity,Census_PrimaryDiskTypeName,Census_SystemVolumeTotalCapacity,Census_HasOpticalDiskDrive,Census_TotalPhysicalRAM,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_PowerPlatformRoleName,Census_InternalBatteryType,Census_InternalBatteryNumberOfCharges,Census_OSVersion,Census_OSArchitecture,Census_OSBranch,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSEdition,Census_OSInstallTypeName,Census_OSUILocaleIdentifier,Census_OSWUAutoUpdateOptionsName,Census_GenuineStateName,Census_ActivationChannel,Census_FlightRing,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1735.0,7.0,0,,53447.0,1.0,1.0,1,29,128035.0,18.0,35.0,-85,10.0.0.0,17134,256,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,1.0,137.0,,1.0,Desktop,2668.0,9124.0,4.0,5.0,2341.0,476940.0,HDD,299451.0,0,4096.0,Desktop,18.9,1440.0,900.0,Desktop,,4294967000.0,10.0.17134.165,amd64,rs4_release,17134,165,Professional,UUPUpgrade,119,UNKNOWN,IS_GENUINE,Retail,Retail,628.0,36144.0,0,0,0,0.0,0.0,10.0,0
1,1,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,7.0,0,,53447.0,1.0,1.0,1,93,1482.0,18.0,119.0,64,10.0.0.0,17134,256,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,1.0,137.0,,1.0,Notebook,2668.0,91656.0,4.0,5.0,2405.0,476940.0,HDD,102385.0,0,4096.0,Notebook,13.9,1366.0,768.0,Mobile,,1.0,10.0.17134.1,amd64,rs4_release,17134,1,Professional,IBSClean,31,UNKNOWN,OFFLINE,Retail,NOT_SET,628.0,57858.0,0,0,0,0.0,0.0,8.0,0
2,2,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1341.0,7.0,0,,53447.0,1.0,1.0,1,86,153579.0,18.0,64.0,49,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1.0,137.0,RequireAdmin,1.0,Desktop,4909.0,317701.0,4.0,5.0,1972.0,114473.0,SSD,113907.0,0,4096.0,Desktop,21.5,1920.0,1080.0,Desktop,,4294967000.0,10.0.17134.165,amd64,rs4_release,17134,165,Core,UUPUpgrade,30,FullAuto,IS_GENUINE,OEM:NONSLP,Retail,142.0,52682.0,0,0,0,0.0,0.0,3.0,0
3,3,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1527.0,7.0,0,,53447.0,1.0,1.0,1,88,20710.0,,117.0,115,10.0.0.0,17134,256,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,1.0,137.0,ExistsNotSet,1.0,Desktop,1443.0,275890.0,4.0,5.0,2273.0,238475.0,UNKNOWN,227116.0,0,4096.0,MiniTower,18.5,1366.0,768.0,Desktop,,4294967000.0,10.0.17134.228,amd64,rs4_release,17134,228,Professional,UUPUpgrade,64,FullAuto,IS_GENUINE,OEM:NONSLP,Retail,355.0,20050.0,0,0,0,0.0,0.0,3.0,1
4,4,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1379.0,7.0,0,,53447.0,1.0,1.0,1,18,37376.0,,277.0,75,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1.0,137.0,RequireAdmin,1.0,Notebook,1443.0,331929.0,4.0,5.0,2500.0,476940.0,HDD,101900.0,0,6144.0,Portable,14.0,1366.0,768.0,Mobile,lion,0.0,10.0.17134.191,amd64,rs4_release,17134,191,Core,Update,31,FullAuto,IS_GENUINE,Retail,Retail,355.0,19844.0,0,0,0,0.0,0.0,1.0,1


In [7]:
# Columns treated as categorical: they are label-encoded below and their positions
# are later passed to CatBoost as `cat_features`.
cat_cols = ['ProductName', 'EngineVersion', 'AppVersion', 'AvSigVersion', 'RtpStateBitfield', 'IsSxsPassiveMode',
            'DefaultBrowsersIdentifier','AVProductStatesIdentifier', 'AVProductsInstalled', 'AVProductsEnabled',
            'HasTpm', 'CountryIdentifier', 'CityIdentifier', 'OrganizationIdentifier', 'GeoNameIdentifier', 
            'LocaleEnglishNameIdentifier', 'OsVer', 'OsBuild', 'OsSuite', 'OsPlatformSubRelease', 'OsBuildLab',
            'SkuEdition', 'IsProtected', 'IeVerIdentifier', 'SmartScreen', 'Firewall', 'Census_MDC2FormFactor',
            'Census_OEMNameIdentifier', 'Census_OEMModelIdentifier', 'Census_ProcessorCoreCount',
            'Census_ProcessorManufacturerIdentifier','Census_ProcessorModelIdentifier','Census_PrimaryDiskTotalCapacity',
            'Census_PrimaryDiskTypeName', 'Census_HasOpticalDiskDrive', 'Census_ChassisTypeName',
            'Census_PowerPlatformRoleName', 'Census_InternalBatteryType', 'Census_OSVersion', 'Census_OSArchitecture',
            'Census_OSBranch', 'Census_OSBuildNumber', 'Census_OSBuildRevision', 'Census_OSEdition', 'Census_OSInstallTypeName',
            'Census_OSUILocaleIdentifier', 'Census_OSWUAutoUpdateOptionsName', 'Census_GenuineStateName', 'Census_ActivationChannel',
            'Census_FlightRing', 'Census_FirmwareManufacturerIdentifier', 'Census_FirmwareVersionIdentifier',
            'Census_IsSecureBootEnabled', 'Census_IsTouchEnabled', 'Census_IsPenCapable', 'Census_IsAlwaysOnAlwaysConnectedCapable',
            'Wdft_IsGamer', 'Wdft_RegionIdentifier']
# Sanity check: number of categorical columns.
print(len(cat_cols))

58


In [8]:
# Integer-encode every categorical column using one vocabulary shared by train and
# test, so a given raw value receives the same code in both frames.
for col in cat_cols:
    # Cast to str first, then to category, for both frames.
    train[col] = train[col].astype('str').astype('category')
    test[col] = test[col].astype('str').astype('category')

    # Vocabulary = sorted union of the distinct labels seen in either frame.
    vocabulary = np.unique(train[col].unique().tolist() + test[col].unique().tolist())
    le = LabelEncoder().fit(vocabulary)

    # Shift the encoder's codes by one so they start at 1 instead of 0.
    train[col] = le.transform(train[col]) + 1
    test[col] = le.transform(test[col]) + 1

In [9]:
# Replace the two display-resolution columns with a single width/height ratio.
res_horizontal = 'Census_InternalPrimaryDisplayResolutionHorizontal'
res_vertical = 'Census_InternalPrimaryDisplayResolutionVertical'
for frame in (train, test):
    frame['display_ratio'] = frame[res_horizontal] / frame[res_vertical]
    # The raw resolution columns are dropped once the ratio has been derived.
    del frame[res_horizontal], frame[res_vertical]
gc.collect()
print(train.shape, test.shape)

(8921483, 65) (7853253, 64)


In [10]:
# Separate the target and the machine identifiers from the feature columns.
y_train = np.array(train['HasDetections'])
train_ids = train['MachineIdentifier']
test_ids  = test['MachineIdentifier']
del train['HasDetections'], train['MachineIdentifier'], test['MachineIdentifier']

# Keep the feature matrices as DataFrames rather than `np.array(df.values)`:
#  * `.values` on a frame that mixes integer (label-encoded) and float columns
#    upcasts everything to a single float64 array, and CatBoost refuses
#    `cat_features` on a floating-point numpy array;
#  * `np.array(df.values)` additionally made a second full copy of the data.
# train_test_split and CatBoost both accept DataFrames, and `len()` still gives the row count.
X = train
X_test = test
print(train.shape, test.shape)
print(len(X), len(X_test))
print(len(y_train), len(train_ids), len(test_ids))

(8921483, 63) (7853253, 63)
8921483 7853253
8921483 8921483 7853253


In [31]:
def column_index(df, query_cols):
    """Return the positional index of each requested column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are searched.
    query_cols : str or sequence of str
        Column label(s) to locate.

    Returns
    -------
    numpy.ndarray of int (a single integer when ``query_cols`` is a scalar)
        Positions within ``df.columns``, in the same order as ``query_cols``.

    Raises
    ------
    KeyError
        If any requested label is not a column of ``df``. ``searchsorted`` alone
        would return an insertion point, i.e. silently the index of a different
        column, or an out-of-range position.
    """
    cols_val = np.asarray(df.columns, dtype=object)
    scalar_query = np.ndim(query_cols) == 0
    query = np.atleast_1d(np.asarray(query_cols, dtype=object))

    # Binary-search the labels through a sort permutation of the column names.
    sidx = np.argsort(cols_val)
    pos = np.searchsorted(cols_val, query, sorter=sidx)

    # An insertion point is only a real hit if the label stored there equals the query.
    in_range = pos < cols_val.size
    found = np.zeros(query.shape, dtype=bool)
    found[in_range] = cols_val[sidx[pos[in_range]]] == query[in_range]
    if not found.all():
        missing = query[~found].tolist()
        raise KeyError("column(s) not found in DataFrame: {}".format(missing))

    result = sidx[pos]
    return result[0] if scalar_query else result

In [33]:
# Positional indices of the categorical columns within `train`, printed for inspection.
categorical_features_pos = column_index(train,cat_cols)
print(categorical_features_pos)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 35 37 39 40 42 43 44 45 46 47 48 49 50 51
 52 53 54 55 56 57 58 59 60 61]


In [34]:
# One-column lookup table whose row number is the column position in `train`.
cols_index = pd.DataFrame({'Column_Name': train.columns})
train.head()

Unnamed: 0,ProductName,EngineVersion,AppVersion,AvSigVersion,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,OsVer,OsBuild,OsSuite,OsPlatformSubRelease,OsBuildLab,SkuEdition,IsProtected,IeVerIdentifier,SmartScreen,Firewall,Census_MDC2FormFactor,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorManufacturerIdentifier,Census_ProcessorModelIdentifier,Census_PrimaryDiskTotalCapacity,Census_PrimaryDiskTypeName,Census_SystemVolumeTotalCapacity,Census_HasOpticalDiskDrive,Census_TotalPhysicalRAM,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_PowerPlatformRoleName,Census_InternalBatteryType,Census_InternalBatteryNumberOfCharges,Census_OSVersion,Census_OSArchitecture,Census_OSBranch,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSEdition,Census_OSInstallTypeName,Census_OSUILocaleIdentifier,Census_OSWUAutoUpdateOptionsName,Census_GenuineStateName,Census_ActivationChannel,Census_FlightRing,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,display_ratio
0,5,67,65,7963,7,1,2610,26978,2,2,2,145,24112,10,226,112,1,7,4,5,307,7,2,28,21,2,3,1445,227007,31,6,1312,6049,1,299451.0,1,4096.0,29,18.9,2,72,4294967000.0,416,1,13,122,83,22,7,20,6,2,3,8,544,23599,1,1,1,1,1,2,1.6
1,5,57,30,6466,7,1,2610,26978,2,2,2,216,41795,10,23,216,1,7,4,5,307,7,2,28,21,2,9,1445,227346,31,6,1370,6049,1,102385.0,1,4096.0,43,13.9,4,72,1.0,412,1,13,122,2,22,3,82,6,3,3,5,544,43830,1,1,1,1,1,14,1.778646
2,5,67,65,7834,7,1,2610,26978,2,2,2,208,46444,10,257,199,1,7,13,5,307,5,2,28,18,2,3,3308,162982,31,6,955,597,2,113907.0,1,4096.0,29,21.5,2,72,4294967000.0,416,1,13,122,83,5,7,81,3,2,2,8,110,39165,1,1,1,1,1,9,1.777778
3,5,67,65,7885,7,1,2610,26978,2,2,2,210,62100,52,21,147,1,7,4,5,307,7,2,28,10,2,3,397,131543,31,6,1243,3173,3,227116.0,1,4096.0,41,18.5,2,72,4294967000.0,420,1,13,122,181,22,7,115,3,2,2,8,294,9298,1,1,1,1,1,9,1.778646
4,5,67,65,7844,7,1,2610,26978,2,2,2,90,76743,52,198,228,1,7,13,5,307,5,2,28,18,2,9,397,175475,31,6,1461,6049,1,101900.0,1,6144.0,47,14.0,4,62,0.0,418,1,13,122,152,5,8,82,3,2,3,8,294,9115,1,1,1,1,1,1,1.778646


In [35]:
# Show positions 32-41 to check which of them are not categorical columns.
cols_index.iloc[32:42]

Unnamed: 0,Column_Name
32,Census_PrimaryDiskTotalCapacity
33,Census_PrimaryDiskTypeName
34,Census_SystemVolumeTotalCapacity
35,Census_HasOpticalDiskDrive
36,Census_TotalPhysicalRAM
37,Census_ChassisTypeName
38,Census_InternalPrimaryDiagonalDisplaySizeInInches
39,Census_PowerPlatformRoleName
40,Census_InternalBatteryType
41,Census_InternalBatteryNumberOfCharges


In [36]:
# Drop the DataFrame names and run a garbage-collection pass.
# gc.collect() is the last expression, so its return value is shown as the cell output.
del train, test
gc.collect()

814

In [37]:
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# skf.get_n_splits(train_ids, y_train)
# Single hold-out split: 13% of the rows are set aside for validation.
# Note that `y_train` is rebound to the training part of the split and `X` is
# deleted afterwards, so this cell cannot simply be re-run on its own.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y_train,
    test_size=0.13,
    random_state=23,
)
print(len(X_train), len(X_valid), len(y_train), len(y_valid))
del X

7761690 1159793 7761690 1159793


In [38]:
print("Train CatBoost Decision Tree")
# Hyper-parameters collected in one mapping so they can be read and tuned at a glance.
cb_params = dict(
    iterations=1200,
    learning_rate=0.02,
    depth=10,
    bagging_temperature=5,
    eval_metric='AUC',
    random_seed=23,
    thread_count=8,
    # Logging cadence / metric evaluation period.
    verbose=60,
    metric_period=30,
    # Iteration-based overfitting detector; the best iteration on the eval set is kept.
    use_best_model=True,
    early_stopping_rounds=50,
    od_type='Iter',
)
cb_model = CatBoostClassifier(**cb_params)

Train CatBoost Decision Tree


In [None]:
# Fit on the training split; the hold-out split is passed as eval_set.
# `categorical_features_pos` gives the column positions CatBoost should treat as categorical.
# NOTE(review): CatBoost is expected to reject `cat_features` when the data is a
# floating-point numpy array — confirm X_train holds integer/string values in those columns.
cb_model.fit(X_train, y_train,
             eval_set=(X_valid,y_valid),
             cat_features=categorical_features_pos)

In [1]:
import numpy as np
from catboost import CatBoostClassifier, Pool

In [17]:
# initialize data
train_data = np.random.randint(0, 
                               100, 
                               size=(10000000, 10))

train_labels = np.random.randint(0, 
                                2, 
                                size=(10000000))

test_data = catboost_pool = Pool(train_data, 
                                 train_labels)

In [37]:
# CatBoost classifier for the synthetic data: 30 iterations, depth 6, 10 threads,
# log-loss objective, verbose per-iteration logging.
model = CatBoostClassifier(
    iterations=30,
    thread_count=10,
    depth=6,
    learning_rate=0.5,
    loss_function='Logloss',
    logging_level='Verbose',
)

In [38]:
%%time
# Train the model on the raw arrays; %%time reports CPU and wall time for the whole cell.
model.fit(train_data, train_labels)
# Predict classes and class probabilities for `test_data`.
preds_class = model.predict(test_data)
preds_proba = model.predict_proba(test_data)
print("class = ", preds_class)

0:	learn: 0.6931431	total: 749ms	remaining: 21.7s
1:	learn: 0.6931409	total: 1.45s	remaining: 20.3s
2:	learn: 0.6931377	total: 2.19s	remaining: 19.7s
3:	learn: 0.6931347	total: 2.94s	remaining: 19.1s
4:	learn: 0.6931313	total: 3.67s	remaining: 18.3s
5:	learn: 0.6931291	total: 4.42s	remaining: 17.7s
6:	learn: 0.6931265	total: 5.18s	remaining: 17s
7:	learn: 0.6931248	total: 5.93s	remaining: 16.3s
8:	learn: 0.6931220	total: 6.64s	remaining: 15.5s
9:	learn: 0.6931208	total: 7.42s	remaining: 14.8s
10:	learn: 0.6931190	total: 8.15s	remaining: 14.1s
11:	learn: 0.6931164	total: 8.88s	remaining: 13.3s
12:	learn: 0.6931143	total: 9.63s	remaining: 12.6s
13:	learn: 0.6931116	total: 10.3s	remaining: 11.8s
14:	learn: 0.6931081	total: 11.1s	remaining: 11.1s
15:	learn: 0.6931047	total: 11.8s	remaining: 10.3s
16:	learn: 0.6931029	total: 12.5s	remaining: 9.6s
17:	learn: 0.6930998	total: 13.3s	remaining: 8.85s
18:	learn: 0.6930968	total: 14s	remaining: 8.11s
19:	learn: 0.6930949	total: 14.7s	remaining: 7

01 thread count - CPU times: user 1min 31s, sys: 3 s, total: 1min 34s & Wall time: 1min 32s
04 thread count - CPU times: user 2min 10s, sys: 4.52 s, total: 2min 15s & Wall time: 51.3 s
10 thread count - CPU times: user 3min 53s, sys: 7.2 s, total: 4min & Wall time: 45.2 s

In [16]:
# Show the predicted class probabilities (one row per sample, one column per class).
print("proba = ", preds_proba)

proba =  [[0.49216617 0.50783383]
 [0.46239692 0.53760308]
 [0.4992662  0.5007338 ]
 ...
 [0.48532568 0.51467432]
 [0.50411434 0.49588566]
 [0.48047022 0.51952978]]
