In [1]:
import pandas as pd
import numpy as np
import feather

import seaborn as sns
import matplotlib.pyplot as plt
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold,train_test_split

from lightgbm import LGBMClassifier
import xgboost as xgb
from catboost import CatBoostClassifier

In [2]:
# Raise pandas' display limits so wide/long frames are shown without truncation.
_DISPLAY_LIMITS = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for _option, _limit in _DISPLAY_LIMITS.items():
    pd.set_option(_option, _limit)

In [3]:
# Read the raw train/test tables from their feather files (paths are relative
# to the working directory) and print both shapes as a quick sanity check.
train, test = (feather.read_dataframe(path) for path in ('train_raw', 'test_raw'))
print(train.shape, test.shape)

(8921483, 83) (7853253, 82)


In [4]:
# Columns removed from both train and test before any encoding or modelling.
droppable_features = [
    'Census_ProcessorClass',
    'Census_IsWIMBootEnabled',
    'IsBeta',
    'Census_IsFlightsDisabled',
    'Census_IsFlightingInternal',
    'AutoSampleOptIn',
    'Census_ThresholdOptIn',
    'SMode',
    'Census_IsPortableOperatingSystem',
    'PuaMode',
    'Census_DeviceFamily',
    'UacLuaenable',
    'Census_IsVirtualDevice',
    'Platform',
    'Census_OSSkuName',
    'Census_OSInstallLanguageIdentifier',
    'Processor',
]

In [5]:
# Strip the droppable columns from both frames in place; the printed shapes
# should each lose len(droppable_features) columns.
train.drop(columns=droppable_features, inplace=True)
test.drop(columns=droppable_features, inplace=True)
print(*(frame.shape for frame in (train, test)))

(8921483, 66) (7853253, 65)


In [6]:
# Preview the first rows of the training frame after the column drop.
train.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,OsVer,OsBuild,OsSuite,OsPlatformSubRelease,OsBuildLab,SkuEdition,IsProtected,IeVerIdentifier,SmartScreen,Firewall,Census_MDC2FormFactor,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorManufacturerIdentifier,Census_ProcessorModelIdentifier,Census_PrimaryDiskTotalCapacity,Census_PrimaryDiskTypeName,Census_SystemVolumeTotalCapacity,Census_HasOpticalDiskDrive,Census_TotalPhysicalRAM,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_PowerPlatformRoleName,Census_InternalBatteryType,Census_InternalBatteryNumberOfCharges,Census_OSVersion,Census_OSArchitecture,Census_OSBranch,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSEdition,Census_OSInstallTypeName,Census_OSUILocaleIdentifier,Census_OSWUAutoUpdateOptionsName,Census_GenuineStateName,Census_ActivationChannel,Census_FlightRing,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1735.0,7.0,0,,53447.0,1.0,1.0,1,29,128035.0,18.0,35.0,-85,10.0.0.0,17134,256,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,1.0,137.0,,1.0,Desktop,2668.0,9124.0,4.0,5.0,2341.0,476940.0,HDD,299451.0,0,4096.0,Desktop,18.9,1440.0,900.0,Desktop,,4294967000.0,10.0.17134.165,amd64,rs4_release,17134,165,Professional,UUPUpgrade,119,UNKNOWN,IS_GENUINE,Retail,Retail,628.0,36144.0,0,0,0,0.0,0.0,10.0,0
1,1,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,7.0,0,,53447.0,1.0,1.0,1,93,1482.0,18.0,119.0,64,10.0.0.0,17134,256,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,1.0,137.0,,1.0,Notebook,2668.0,91656.0,4.0,5.0,2405.0,476940.0,HDD,102385.0,0,4096.0,Notebook,13.9,1366.0,768.0,Mobile,,1.0,10.0.17134.1,amd64,rs4_release,17134,1,Professional,IBSClean,31,UNKNOWN,OFFLINE,Retail,NOT_SET,628.0,57858.0,0,0,0,0.0,0.0,8.0,0
2,2,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1341.0,7.0,0,,53447.0,1.0,1.0,1,86,153579.0,18.0,64.0,49,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1.0,137.0,RequireAdmin,1.0,Desktop,4909.0,317701.0,4.0,5.0,1972.0,114473.0,SSD,113907.0,0,4096.0,Desktop,21.5,1920.0,1080.0,Desktop,,4294967000.0,10.0.17134.165,amd64,rs4_release,17134,165,Core,UUPUpgrade,30,FullAuto,IS_GENUINE,OEM:NONSLP,Retail,142.0,52682.0,0,0,0,0.0,0.0,3.0,0
3,3,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1527.0,7.0,0,,53447.0,1.0,1.0,1,88,20710.0,,117.0,115,10.0.0.0,17134,256,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,1.0,137.0,ExistsNotSet,1.0,Desktop,1443.0,275890.0,4.0,5.0,2273.0,238475.0,UNKNOWN,227116.0,0,4096.0,MiniTower,18.5,1366.0,768.0,Desktop,,4294967000.0,10.0.17134.228,amd64,rs4_release,17134,228,Professional,UUPUpgrade,64,FullAuto,IS_GENUINE,OEM:NONSLP,Retail,355.0,20050.0,0,0,0,0.0,0.0,3.0,1
4,4,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1379.0,7.0,0,,53447.0,1.0,1.0,1,18,37376.0,,277.0,75,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1.0,137.0,RequireAdmin,1.0,Notebook,1443.0,331929.0,4.0,5.0,2500.0,476940.0,HDD,101900.0,0,6144.0,Portable,14.0,1366.0,768.0,Mobile,lion,0.0,10.0.17134.191,amd64,rs4_release,17134,191,Core,Update,31,FullAuto,IS_GENUINE,Retail,Retail,355.0,19844.0,0,0,0,0.0,0.0,1.0,1


In [7]:
# Columns that are label-encoded and later passed to the model as categorical
# features. Order matters only for readability; lookups are done by name.
cat_cols = [
    'ProductName',
    'EngineVersion',
    'AppVersion',
    'AvSigVersion',
    'RtpStateBitfield',
    'IsSxsPassiveMode',
    'DefaultBrowsersIdentifier',
    'AVProductStatesIdentifier',
    'AVProductsInstalled',
    'AVProductsEnabled',
    'HasTpm',
    'CountryIdentifier',
    'CityIdentifier',
    'OrganizationIdentifier',
    'GeoNameIdentifier',
    'LocaleEnglishNameIdentifier',
    'OsVer',
    'OsBuild',
    'OsSuite',
    'OsPlatformSubRelease',
    'OsBuildLab',
    'SkuEdition',
    'IsProtected',
    'IeVerIdentifier',
    'SmartScreen',
    'Firewall',
    'Census_MDC2FormFactor',
    'Census_OEMNameIdentifier',
    'Census_OEMModelIdentifier',
    'Census_ProcessorCoreCount',
    'Census_ProcessorManufacturerIdentifier',
    'Census_ProcessorModelIdentifier',
    'Census_PrimaryDiskTotalCapacity',
    'Census_PrimaryDiskTypeName',
    'Census_HasOpticalDiskDrive',
    'Census_ChassisTypeName',
    'Census_PowerPlatformRoleName',
    'Census_InternalBatteryType',
    'Census_OSVersion',
    'Census_OSArchitecture',
    'Census_OSBranch',
    'Census_OSBuildNumber',
    'Census_OSBuildRevision',
    'Census_OSEdition',
    'Census_OSInstallTypeName',
    'Census_OSUILocaleIdentifier',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_GenuineStateName',
    'Census_ActivationChannel',
    'Census_FlightRing',
    'Census_FirmwareManufacturerIdentifier',
    'Census_FirmwareVersionIdentifier',
    'Census_IsSecureBootEnabled',
    'Census_IsTouchEnabled',
    'Census_IsPenCapable',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'Wdft_IsGamer',
    'Wdft_RegionIdentifier',
]
print(len(cat_cols))

58


In [8]:
def _encode_with_shared_vocabulary(train_col, test_col):
    """Label-encode one column of train and test against a common vocabulary.

    Both inputs are cast to ``str`` first, so missing values become the
    literal level ``'nan'`` and numeric identifiers are compared as text.
    The encoder is fitted on the union of the levels seen in either column,
    which guarantees that no test-only level is unknown at transform time.

    Returns a ``(train_codes, test_codes)`` pair of integer arrays. Codes are
    the LabelEncoder codes shifted by one, i.e. they start at 1.
    """
    train_str = train_col.astype('str')
    test_str = test_col.astype('str')
    vocabulary = np.unique(train_str.unique().tolist() + test_str.unique().tolist())
    encoder = LabelEncoder().fit(vocabulary)
    return encoder.transform(train_str) + 1, encoder.transform(test_str) + 1


# Replace every categorical column by its integer code. The string columns
# are encoded directly: an intermediate cast to the pandas 'category' dtype
# would be discarded straight away and only costs time on frames this large.
# Running the work inside a helper keeps the large string temporaries local,
# so they are released after each column instead of lingering in the kernel.
for col in cat_cols:
    train[col], test[col] = _encode_with_shared_vocabulary(train[col], test[col])

In [9]:
# Collapse the two display-resolution columns into a single aspect-ratio
# feature (horizontal / vertical) and drop the originals from both frames.
_H_RES = 'Census_InternalPrimaryDisplayResolutionHorizontal'
_V_RES = 'Census_InternalPrimaryDisplayResolutionVertical'

train['display_ratio'] = train[_H_RES].div(train[_V_RES])
test['display_ratio'] = test[_H_RES].div(test[_V_RES])

del train[_H_RES]
del train[_V_RES]
del test[_H_RES]
del test[_V_RES]
print(*(frame.shape for frame in (train, test)))

(8921483, 65) (7853253, 64)


In [10]:
# Separate the target and the machine identifiers from the feature columns,
# then materialise the remaining columns as NumPy matrices.
y_train = np.array(train['HasDetections'])
train_ids = train['MachineIdentifier']
test_ids  = test['MachineIdentifier']
del train['HasDetections'], train['MachineIdentifier'], test['MachineIdentifier']

# `.values` already returns the matrix; wrapping it in np.array() would copy
# it a second time, which is several GB of redundant allocation at this size.
# NOTE(review): the frames mix integer-coded categoricals with float columns
# (e.g. display_ratio), so the matrices are presumably upcast to a float
# dtype — confirm that downstream consumers accept float-coded categoricals.
X = train.values
X_test = test.values
print(train.shape, test.shape)
print(len(X), len(X_test))
print(len(y_train), len(train_ids), len(test_ids))

(8921483, 63) (7853253, 63)
8921483 7853253
8921483 8921483 7853253


In [11]:
def column_index(df, query_cols):
    """Return the positional index of each requested column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are searched.
    query_cols : str or sequence of str
        Column label(s) to look up.

    Returns
    -------
    numpy.ndarray of int (a scalar int for a scalar query)
        Positions within ``df.columns``, in the same order as ``query_cols``.

    Raises
    ------
    KeyError
        If any requested label is not a column of ``df``. Without this check
        a missing label would silently resolve to the position of a different
        column, or fail with an unrelated IndexError.
    """
    cols_val = df.columns.values
    query = np.asarray(query_cols, dtype=object)

    # Binary-search the labels through an argsort permutation instead of
    # scanning the column list once per query.
    sidx = np.argsort(cols_val)
    pos = np.searchsorted(cols_val, query, sorter=sidx)

    if cols_val.size == 0:
        if query.size:
            raise KeyError(
                "columns not found in DataFrame: {}".format(np.atleast_1d(query).tolist())
            )
        return sidx[pos]

    # searchsorted returns an insertion point even for labels that are absent
    # (possibly one past the end), so clip it and verify that the label found
    # at that position really is the one that was asked for.
    hit = sidx[np.minimum(pos, cols_val.size - 1)]
    absent = np.atleast_1d(cols_val[hit] != query)
    if absent.any():
        missing = np.atleast_1d(query)[absent].tolist()
        raise KeyError("columns not found in DataFrame: {}".format(missing))
    return hit

In [12]:
# Positions of the categorical columns within the feature frame; these are
# later passed to CatBoost as `cat_features`.
categorical_features_pos = column_index(train,cat_cols)

In [13]:
# Display the resolved positions as a sanity check.
categorical_features_pos

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       35, 37, 39, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
       55, 56, 57, 58, 59, 60, 61])

In [14]:
# The features were already extracted into X / X_test, so drop the DataFrame
# names and force a collection pass. The value returned by gc.collect() is
# what the cell displays.
del train, test
gc.collect()

342

In [15]:
# Single random hold-out split: 13% of the rows go to validation (seed 23).
# A stratified 5-fold scheme (StratifiedKFold, shuffle, seed 42) was drafted
# here as an alternative but is not used.
# NOTE: `y_train` is rebound from the full label vector to the training part,
# so this cell cannot be re-run without first rebuilding X / y_train.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y_train,
    test_size=0.13,
    random_state=23,
)
print(*map(len, (X_train, X_valid, y_train, y_valid)))

7761690 1159793 7761690 1159793


In [20]:
print("Train CatBoost Decision Tree")

# Hyper-parameters for the CatBoost classifier, gathered in one place.
_cb_params = dict(
    # boosting schedule
    iterations=1200,
    learning_rate=0.02,
    depth=10,
    bagging_temperature=5,
    random_seed=23,
    # evaluation / early stopping against the eval set given to fit()
    eval_metric='AUC',
    use_best_model=True,
    od_type='Iter',
    early_stopping_rounds=50,
    # logging cadence
    metric_period=30,
    verbose=60,
    # hardware
    task_type='GPU',
)
cb_model = CatBoostClassifier(**_cb_params)

Train CatBoost Decision Tree


In [None]:
def _as_catboost_frame(features, cat_positions):
    """Wrap a feature matrix in a DataFrame with integer categorical columns.

    CatBoost requires categorical feature values to be integers or strings
    and rejects real-valued ones. The matrices built earlier come from
    ``DataFrame.values`` on a frame that mixes integer codes with float
    columns, so the codes arrive here as floats. They are whole numbers,
    which makes casting them back to ``int64`` lossless; a NaN in any
    categorical column makes the cast raise instead of being hidden.

    Parameters
    ----------
    features : numpy.ndarray or pandas.DataFrame
        Two-dimensional feature matrix.
    cat_positions : sequence of int
        Positional indices of the categorical columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``features`` whose categorical columns have dtype ``int64``.
    """
    frame = pd.DataFrame(features)
    cat_labels = frame.columns[np.asarray(cat_positions, dtype=int)]
    return frame.astype({label: 'int64' for label in cat_labels})


# Train with the hold-out set as eval set (it drives AUC reporting, early
# stopping and best-model selection configured on cb_model).
cb_model.fit(_as_catboost_frame(X_train, categorical_features_pos), y_train,
             eval_set=(_as_catboost_frame(X_valid, categorical_features_pos), y_valid),
             cat_features=categorical_features_pos)