In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### XGBoost trained on the entire dataset incrementally, one batch (file) at a time.

In [None]:
import xgboost as xgb

##### Specify the column datatypes (obtained in previous notebooks) so that the data loads quickly.

In [None]:
# Explicit column -> dtype mapping handed to pd.read_csv so pandas does not
# have to infer a type for every column while parsing the CSV files.
dtypes = {
    # Row identifier (dropped from the test features before prediction).
    "MachineIdentifier": "category",
    "EngineVersion": "int64",
    "AppVersion": "int64",
    "AVProductsInstalled": "float16",
    "AVProductsEnabled": "float16",
    "CountryIdentifier": "int64",
    "GeoNameIdentifier": "int64",
    "LocaleEnglishNameIdentifier": "int64",
    "OsBuild": "int64",
    "OsPlatformSubRelease": "int64",
    "OsBuildLab": "int64",
    "SkuEdition": "int64",
    "IsProtected": "float16",
    "IeVerIdentifier": "int64",
    "Firewall": "float16",
    "Census_MDC2FormFactor": "int64",
    "Census_ProcessorCoreCount": "float16",
    "Census_ProcessorManufacturerIdentifier": "int64",
    "Census_PrimaryDiskTotalCapacity": "float32",
    "Census_SystemVolumeTotalCapacity": "float32",
    "Census_HasOpticalDiskDrive": "int8",
    "Census_TotalPhysicalRAM": "float32",
    "Census_ChassisTypeName": "int64",
    "Census_InternalPrimaryDiagonalDisplaySizeInInches": "float16",
    "Census_InternalPrimaryDisplayResolutionHorizontal": "float16",
    "Census_InternalPrimaryDisplayResolutionVertical": "float16",
    "Census_PowerPlatformRoleName": "int64",
    "Census_InternalBatteryNumberOfCharges": "float32",
    "Census_OSVersion": "int64",
    "Census_OSBranch": "int64",
    "Census_OSBuildNumber": "int64",
    "Census_OSBuildRevision": "int32",
    "Census_OSEdition": "int64",
    "Census_OSSkuName": "int64",
    "Census_OSInstallTypeName": "int64",
    "Census_OSInstallLanguageIdentifier": "int64",
    "Census_OSUILocaleIdentifier": "int64",
    "Census_IsFlightsDisabled": "float16",
    "Census_FlightRing": "int64",
    "Census_FirmwareManufacturerIdentifier": "int64",
    "Census_IsSecureBootEnabled": "int8",
    "Census_IsTouchEnabled": "int8",
    "Census_IsAlwaysOnAlwaysConnectedCapable": "float16",
    "Wdft_IsGamer": "float16",
    "Wdft_RegionIdentifier": "int64",
    # Target column used as the training label.
    "HasDetections": "int8",
    # The remaining columns are all stored as uint8
    # (presumably one-hot indicators produced in an earlier notebook -- TODO confirm).
    "Processor_arm64": "uint8",
    "Processor_x64": "uint8",
    "Processor_x86": "uint8",
    "OsSuite_256": "uint8",
    "OsSuite_768": "uint8",
    "OsSuite_784": "uint8",
    "Census_PrimaryDiskTypeName_HDD": "uint8",
    "Census_PrimaryDiskTypeName_SSD": "uint8",
    "Census_PrimaryDiskTypeName_UNKNOWN": "uint8",
    "Census_PrimaryDiskTypeName_Unspecified": "uint8",
    "Census_OSArchitecture_amd64": "uint8",
    "Census_OSArchitecture_arm64": "uint8",
    "Census_OSArchitecture_x86": "uint8",
    "Census_OSWUAutoUpdateOptionsName_AutoInstallAndRebootAtMaintenanceTime": "uint8",
    "Census_OSWUAutoUpdateOptionsName_DownloadNotify": "uint8",
    "Census_OSWUAutoUpdateOptionsName_FullAuto": "uint8",
    "Census_OSWUAutoUpdateOptionsName_Notify": "uint8",
    "Census_OSWUAutoUpdateOptionsName_Off": "uint8",
    "Census_OSWUAutoUpdateOptionsName_UNKNOWN": "uint8",
    "Census_GenuineStateName_INVALID_LICENSE": "uint8",
    "Census_GenuineStateName_IS_GENUINE": "uint8",
    "Census_GenuineStateName_OFFLINE": "uint8",
    "Census_GenuineStateName_UNKNOWN": "uint8",
    "Census_ActivationChannel_OEM:DM": "uint8",
    "Census_ActivationChannel_OEM:NONSLP": "uint8",
    "Census_ActivationChannel_Retail": "uint8",
    "Census_ActivationChannel_Retail:TB:Eval": "uint8",
    "Census_ActivationChannel_Volume:GVLK": "uint8",
    "Census_ActivationChannel_Volume:MAK": "uint8",
}

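##### We have split the data into three files and train on them in sequence: each stage continues boosting from the model saved by the previous stage, so the final model (model 3) has been trained on all three files.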
### MODEL 1
##### Read the first training file (train1) and train the model on it. Training on one file at a time is the incremental-learning part; it keeps the full dataset from exhausting memory.

In [None]:
# Load the first training split, applying the explicit column dtypes.
train1_path = '../input/trainandtestnew/train1.csv'
df_subset = pd.read_csv(train1_path, dtype=dtypes)

In [None]:
# Separate the target column from the feature columns.
main_y = df_subset['HasDetections']
main_X = df_subset.drop(columns=['HasDetections'])

##### Using the best hyperparameters obtained from hyperparameter tuning.

In [None]:
# Wrap the features and labels in XGBoost's native DMatrix container.
data_matrix = xgb.DMatrix(data=main_X, label=main_y)

# Parameters for the native `xgb.train` API.
# The previous version of this dict also carried 'n_iter', 'n_estimators' and
# 'penalty'. Those are scikit-learn-style names that the native booster does
# not recognise (XGBoost only warns that they "might not be used"), so they
# are omitted; the number of boosting rounds is set by the `xgb.train` call.
params = {
    # 'alpha' is the L1 regularisation weight (alias of 'reg_alpha'), NOT the
    # learning rate. The learning rate is 'eta' and is left at its default.
    'alpha': 10,
    'objective': 'binary:logistic',  # binary classification, outputs probabilities
    'n_jobs': -1,  # thread-count setting; value kept unchanged
}

In [None]:
# Fit the first-stage booster: 350 boosting rounds on the first split.
model_1 = xgb.train(
    params,
    data_matrix,
    num_boost_round=350,
)

In [None]:
  # Persist the first-stage booster so the next stage can continue from it.
  model_1.save_model(fname='model_1.model')

### MODEL 2
##### Reading and using the second file. 

In [None]:
# Load the second training split, applying the explicit column dtypes.
train2_path = '../input/trainandtestnew/train2.csv'
df_subset = pd.read_csv(train2_path, dtype=dtypes)

In [None]:
# Separate the target column from the feature columns.
main_y = df_subset['HasDetections']
main_X = df_subset.drop(columns=['HasDetections'])

In [None]:
# Wrap the features and labels in XGBoost's native DMatrix container.
data_matrix = xgb.DMatrix(data=main_X, label=main_y)

# Parameters for the native `xgb.train` API.
# The previous version of this dict also carried 'n_iter', 'n_estimators' and
# 'penalty'. Those are scikit-learn-style names that the native booster does
# not recognise (XGBoost only warns that they "might not be used"), so they
# are omitted; the number of boosting rounds is set by the `xgb.train` call.
params = {
    # 'alpha' is the L1 regularisation weight (alias of 'reg_alpha'), NOT the
    # learning rate. The learning rate is 'eta' and is left at its default.
    'alpha': 10,
    'objective': 'binary:logistic',  # binary classification, outputs probabilities
    'n_jobs': -1,  # thread-count setting; value kept unchanged
}

In [None]:
# Continue boosting from the saved first-stage model: 350 more rounds,
# this time on the second split.
model_2 = xgb.train(
    params,
    data_matrix,
    num_boost_round=350,
    xgb_model='model_1.model',
)

In [None]:
# Persist the second-stage booster so the next stage can continue from it.
model_2.save_model(fname='model_2.model')

### MODEL 3
##### Reading and using the third file. 

In [None]:
# Load the third training split, applying the explicit column dtypes.
train3_path = '../input/trainandtestnew/train3.csv'
df_subset = pd.read_csv(train3_path, dtype=dtypes)

In [None]:
# Separate the target column from the feature columns.
main_y = df_subset['HasDetections']
main_X = df_subset.drop(columns=['HasDetections'])

In [None]:
# Wrap the features and labels in XGBoost's native DMatrix container.
data_matrix = xgb.DMatrix(data=main_X, label=main_y)

# Parameters for the native `xgb.train` API.
# The previous version of this dict also carried 'n_iter', 'n_estimators' and
# 'penalty'. Those are scikit-learn-style names that the native booster does
# not recognise (XGBoost only warns that they "might not be used"), so they
# are omitted; the number of boosting rounds is set by the `xgb.train` call.
params = {
    # 'alpha' is the L1 regularisation weight (alias of 'reg_alpha'), NOT the
    # learning rate. The learning rate is 'eta' and is left at its default.
    'alpha': 10,
    'objective': 'binary:logistic',  # binary classification, outputs probabilities
    'n_jobs': -1,  # thread-count setting; value kept unchanged
}

##### For each new model, we pass the previously saved model via `xgb_model` so that training continues from it rather than starting from scratch.

In [None]:
# Continue boosting from the saved second-stage model: 350 more rounds,
# this time on the third split.
model_3 = xgb.train(
    params,
    data_matrix,
    num_boost_round=350,
    xgb_model='model_2.model',
)

In [None]:
# Load the test set with the same explicit dtypes used for the training splits.
test_path = '../input/trainandtestnew/test_df.csv'
test_df = pd.read_csv(test_path, dtype=dtypes)

##### Finally, using the third model to predict results on the test data. 

In [None]:
# Score the test set with the final booster and write the submission file.
#
# The predictions are attached to a fresh two-column frame instead of being
# written back into `test_df`. Mutating `test_df` made this cell non-idempotent:
# on a second run the added 'HasDetections' column would be passed to the
# model as an extra feature.
test_features = test_df.drop(columns="MachineIdentifier")  # identifier is not a feature
kaggle_test_X = xgb.DMatrix(test_features)

# With the 'binary:logistic' objective, predict() yields one probability per row.
kaggle_test_yhat = model_3.predict(kaggle_test_X)

to_submit = test_df[["MachineIdentifier"]].assign(HasDetections=kaggle_test_yhat)
to_submit.to_csv("class-xgb.csv", index=False)