In [1]:
# notebook to implement models
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection as model_selection
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
import sklearn.preprocessing as preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from tqdm.notebook import tqdm
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier


### Implementation of the Gradient Boosting classifier using sklearn. 

##### Specifying the column dtypes (as determined in the previous notebooks) so that the data loads quickly and uses less memory.

In [2]:
# Column -> dtype mapping handed to pd.read_csv.
# The identifier, the raw numeric Census/Wdft columns and the target each get
# an explicit dtype; every remaining column is stored as 'uint8'.
dtypes = {
    'MachineIdentifier': 'category',
    'Census_ProcessorCoreCount': 'float16',
    'Census_PrimaryDiskTotalCapacity': 'float32',
    'Census_SystemVolumeTotalCapacity': 'float32',
    'Census_TotalPhysicalRAM': 'float32',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
    'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
    'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
    'Census_InternalBatteryNumberOfCharges': 'float32',
    'Census_OSBuildRevision': 'int32',
    'Census_IsFlightsDisabled': 'float16',
    'Census_IsSecureBootEnabled': 'int8',
    'Census_IsTouchEnabled': 'int8',
    'Wdft_IsGamer': 'float16',
    'HasDetections': 'int8',
}

# All remaining columns share the 'uint8' dtype; they are appended in the
# same order as before so the mapping's key order is unchanged.
dtypes.update(dict.fromkeys([
    # EngineVersion
    'EngineVersion_1.1.14800.3',
    'EngineVersion_1.1.14901.4',
    'EngineVersion_1.1.15000.2',
    'EngineVersion_1.1.15100.1',
    'EngineVersion_1.1.15200.1',
    'EngineVersion_MinorVersions',
    # AppVersion
    'AppVersion_4.12.16299.15',
    'AppVersion_4.13.17134.1',
    'AppVersion_4.13.17134.228',
    'AppVersion_4.14.17639.18041',
    'AppVersion_4.16.17656.18052',
    'AppVersion_4.18.1806.18062',
    'AppVersion_4.18.1807.18075',
    'AppVersion_4.8.10240.17443',
    'AppVersion_4.9.10586.1106',
    'AppVersion_MinorVersions',
    # AVProductsInstalled
    'AVProductsInstalled_1.0',
    'AVProductsInstalled_2.0',
    'AVProductsInstalled_3.0',
    'AVProductsInstalled_4.0',
    'AVProductsInstalled_5.0',
    'AVProductsInstalled_6.0',
    # Processor
    'Processor_arm64',
    'Processor_x64',
    'Processor_x86',
    # OsBuild
    'OsBuild_10240',
    'OsBuild_10586',
    'OsBuild_14393',
    'OsBuild_15063',
    'OsBuild_16299',
    'OsBuild_17134',
    'OsBuild_MinorVersions',
    # OsSuite
    'OsSuite_256',
    'OsSuite_768',
    'OsSuite_784',
    # OsPlatformSubRelease
    'OsPlatformSubRelease_prers5',
    'OsPlatformSubRelease_rs1',
    'OsPlatformSubRelease_rs2',
    'OsPlatformSubRelease_rs3',
    'OsPlatformSubRelease_rs4',
    'OsPlatformSubRelease_th1',
    'OsPlatformSubRelease_th2',
    'OsPlatformSubRelease_windows7',
    'OsPlatformSubRelease_windows8.1',
    # OsBuildLab
    'OsBuildLab_10240.17443.amd64fre.th1.170602-2340',
    'OsBuildLab_14393.2189.amd64fre.rs1_release.180329-1711',
    'OsBuildLab_15063.0.amd64fre.rs2_release.170317-1834',
    'OsBuildLab_16299.15.amd64fre.rs3_release.170928-1534',
    'OsBuildLab_16299.15.x86fre.rs3_release.170928-1534',
    'OsBuildLab_16299.431.amd64fre.rs3_release_svc_escrow.180502-1908',
    'OsBuildLab_17134.1.amd64fre.rs4_release.180410-1804',
    'OsBuildLab_17134.1.x86fre.rs4_release.180410-1804',
    'OsBuildLab_MinorVersions',
    # SkuEdition
    'SkuEdition_Cloud',
    'SkuEdition_Education',
    'SkuEdition_Enterprise',
    'SkuEdition_Home',
    'SkuEdition_Invalid',
    'SkuEdition_Pro',
    'SkuEdition_Server',
    # IeVerIdentifier
    'IeVerIdentifier_105.0',
    'IeVerIdentifier_108.0',
    'IeVerIdentifier_111.0',
    'IeVerIdentifier_117.0',
    'IeVerIdentifier_135.0',
    'IeVerIdentifier_137.0',
    'IeVerIdentifier_53.0',
    'IeVerIdentifier_74.0',
    'IeVerIdentifier_98.0',
    'IeVerIdentifier_MinorVersions',
    # Census_MDC2FormFactor
    'Census_MDC2FormFactor_AllInOne',
    'Census_MDC2FormFactor_Convertible',
    'Census_MDC2FormFactor_Desktop',
    'Census_MDC2FormFactor_Detachable',
    'Census_MDC2FormFactor_Census_MDC2FormFactor',
    'Census_MDC2FormFactor_Notebook',
    'Census_MDC2FormFactor_PCOther',
    # Census_ProcessorManufacturerIdentifier
    'Census_ProcessorManufacturerIdentifier_1.0',
    'Census_ProcessorManufacturerIdentifier_5.0',
    'Census_ProcessorManufacturerIdentifier_Minor',
    # Census_PrimaryDiskTypeName
    'Census_PrimaryDiskTypeName_HDD',
    'Census_PrimaryDiskTypeName_SSD',
    'Census_PrimaryDiskTypeName_UNKNOWN',
    'Census_PrimaryDiskTypeName_Unspecified',
    # Census_HasOpticalDiskDrive
    'Census_HasOpticalDiskDrive_0',
    'Census_HasOpticalDiskDrive_1',
    # Census_ChassisTypeName
    'Census_ChassisTypeName_AllinOne',
    'Census_ChassisTypeName_Desktop',
    'Census_ChassisTypeName_Laptop',
    'Census_ChassisTypeName_MinorVersions',
    'Census_ChassisTypeName_Notebook',
    'Census_ChassisTypeName_Portable',
    # Census_PowerPlatformRoleName
    'Census_PowerPlatformRoleName_Desktop',
    'Census_PowerPlatformRoleName_MinorVersions',
    'Census_PowerPlatformRoleName_Mobile',
    'Census_PowerPlatformRoleName_Slate',
    # Census_OSArchitecture
    'Census_OSArchitecture_amd64',
    'Census_OSArchitecture_arm64',
    'Census_OSArchitecture_x86',
    # Census_OSBranch
    'Census_OSBranch_MinorVersions',
    'Census_OSBranch_rs1_release',
    'Census_OSBranch_rs2_release',
    'Census_OSBranch_rs3_release',
    'Census_OSBranch_rs3_release_svc_escrow',
    'Census_OSBranch_rs4_release',
    'Census_OSBranch_th1_st1',
    'Census_OSBranch_th2_release',
    'Census_OSBranch_th2_release_sec',
    # Census_OSBuildNumber
    'Census_OSBuildNumber_10586',
    'Census_OSBuildNumber_14393',
    'Census_OSBuildNumber_15063',
    'Census_OSBuildNumber_16299',
    'Census_OSBuildNumber_17134',
    'Census_OSBuildNumber_MinorVersions',
    # Census_OSEdition
    'Census_OSEdition_Core',
    'Census_OSEdition_CoreCountrySpecific',
    'Census_OSEdition_CoreSingleLanguage',
    'Census_OSEdition_MinorVersions',
    'Census_OSEdition_Professional',
    # Census_OSSkuName
    'Census_OSSkuName_CORE',
    'Census_OSSkuName_CORE_COUNTRYSPECIFIC',
    'Census_OSSkuName_CORE_SINGLELANGUAGE',
    'Census_OSSkuName_MinorVersions',
    'Census_OSSkuName_PROFESSIONAL',
    # Census_OSInstallTypeName
    'Census_OSInstallTypeName_Clean',
    'Census_OSInstallTypeName_CleanPCRefresh',
    'Census_OSInstallTypeName_IBSClean',
    'Census_OSInstallTypeName_Other',
    'Census_OSInstallTypeName_Refresh',
    'Census_OSInstallTypeName_Reset',
    'Census_OSInstallTypeName_UUPUpgrade',
    'Census_OSInstallTypeName_Update',
    'Census_OSInstallTypeName_Upgrade',
    # Census_OSInstallLanguageIdentifier
    'Census_OSInstallLanguageIdentifier_10.0',
    'Census_OSInstallLanguageIdentifier_14.0',
    'Census_OSInstallLanguageIdentifier_18.0',
    'Census_OSInstallLanguageIdentifier_20.0',
    'Census_OSInstallLanguageIdentifier_24.0',
    'Census_OSInstallLanguageIdentifier_25.0',
    'Census_OSInstallLanguageIdentifier_26.0',
    'Census_OSInstallLanguageIdentifier_27.0',
    'Census_OSInstallLanguageIdentifier_29.0',
    'Census_OSInstallLanguageIdentifier_35.0',
    'Census_OSInstallLanguageIdentifier_37.0',
    'Census_OSInstallLanguageIdentifier_39.0',
    'Census_OSInstallLanguageIdentifier_5.0',
    'Census_OSInstallLanguageIdentifier_7.0',
    'Census_OSInstallLanguageIdentifier_8.0',
    'Census_OSInstallLanguageIdentifier_9.0',
    'Census_OSInstallLanguageIdentifier_MinorVersions',
    # Census_OSUILocaleIdentifier
    'Census_OSUILocaleIdentifier_109',
    'Census_OSUILocaleIdentifier_115',
    'Census_OSUILocaleIdentifier_119',
    'Census_OSUILocaleIdentifier_120',
    'Census_OSUILocaleIdentifier_125',
    'Census_OSUILocaleIdentifier_148',
    'Census_OSUILocaleIdentifier_158',
    'Census_OSUILocaleIdentifier_160',
    'Census_OSUILocaleIdentifier_26',
    'Census_OSUILocaleIdentifier_30',
    'Census_OSUILocaleIdentifier_31',
    'Census_OSUILocaleIdentifier_34',
    'Census_OSUILocaleIdentifier_35',
    'Census_OSUILocaleIdentifier_49',
    'Census_OSUILocaleIdentifier_72',
    'Census_OSUILocaleIdentifier_83',
    'Census_OSUILocaleIdentifier_MinorVersions',
    # Census_OSWUAutoUpdateOptionsName
    'Census_OSWUAutoUpdateOptionsName_AutoInstallAndRebootAtMaintenanceTime',
    'Census_OSWUAutoUpdateOptionsName_DownloadNotify',
    'Census_OSWUAutoUpdateOptionsName_FullAuto',
    'Census_OSWUAutoUpdateOptionsName_Notify',
    'Census_OSWUAutoUpdateOptionsName_Off',
    'Census_OSWUAutoUpdateOptionsName_UNKNOWN',
    # Census_FlightRing
    'Census_FlightRing_MinorVersions',
    'Census_FlightRing_NOT_SET',
    'Census_FlightRing_Retail',
    'Census_FlightRing_Unknown',
    # Census_FirmwareManufacturerIdentifier
    'Census_FirmwareManufacturerIdentifier_142.0',
    'Census_FirmwareManufacturerIdentifier_355.0',
    'Census_FirmwareManufacturerIdentifier_500.0',
    'Census_FirmwareManufacturerIdentifier_513.0',
    'Census_FirmwareManufacturerIdentifier_554.0',
    'Census_FirmwareManufacturerIdentifier_556.0',
    'Census_FirmwareManufacturerIdentifier_628.0',
    'Census_FirmwareManufacturerIdentifier_807.0',
    'Census_FirmwareManufacturerIdentifier_93.0',
    'Census_FirmwareManufacturerIdentifier_MinorVersions',
    # Wdft_RegionIdentifier
    'Wdft_RegionIdentifier_1.0',
    'Wdft_RegionIdentifier_2.0',
    'Wdft_RegionIdentifier_3.0',
    'Wdft_RegionIdentifier_4.0',
    'Wdft_RegionIdentifier_5.0',
    'Wdft_RegionIdentifier_6.0',
    'Wdft_RegionIdentifier_7.0',
    'Wdft_RegionIdentifier_8.0',
    'Wdft_RegionIdentifier_9.0',
    'Wdft_RegionIdentifier_10.0',
    'Wdft_RegionIdentifier_11.0',
    'Wdft_RegionIdentifier_12.0',
    'Wdft_RegionIdentifier_13.0',
    'Wdft_RegionIdentifier_14.0',
    'Wdft_RegionIdentifier_15.0',
    # Census_GenuineStateName
    'Census_GenuineStateName_INVALID_LICENSE',
    'Census_GenuineStateName_IS_GENUINE',
    'Census_GenuineStateName_OFFLINE',
    'Census_GenuineStateName_UNKNOWN',
    # Census_ActivationChannel
    'Census_ActivationChannel_OEM:DM',
    'Census_ActivationChannel_OEM:NONSLP',
    'Census_ActivationChannel_Retail',
    'Census_ActivationChannel_Retail:TB:Eval',
    'Census_ActivationChannel_Volume:GVLK',
    'Census_ActivationChannel_Volume:MAK',
], 'uint8'))

In [6]:
# Read the pre-processed Kaggle test set, applying the `dtypes` mapping so
# each listed column is parsed straight into its compact storage type.
test_df = pd.read_csv(
    '../input/test-data-modified/test_df.csv',
    dtype=dtypes,
)

In [7]:
# Preview the first rows of the test set to check the columns that were loaded.
test_df.head()

Unnamed: 0,MachineIdentifier,Census_ProcessorCoreCount,Census_PrimaryDiskTotalCapacity,Census_SystemVolumeTotalCapacity,Census_TotalPhysicalRAM,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_InternalBatteryNumberOfCharges,Census_OSBuildRevision,...,Census_GenuineStateName_INVALID_LICENSE,Census_GenuineStateName_IS_GENUINE,Census_GenuineStateName_OFFLINE,Census_GenuineStateName_UNKNOWN,Census_ActivationChannel_OEM:DM,Census_ActivationChannel_OEM:NONSLP,Census_ActivationChannel_Retail,Census_ActivationChannel_Retail:TB:Eval,Census_ActivationChannel_Volume:GVLK,Census_ActivationChannel_Volume:MAK
0,000007535c3f730efa9ea0b7ef1bd645,4.0,13.075148,11.536506,8.31801,2.630859,7.222656,6.648438,0.693147,1,...,0,0,1,0,0,0,1,0,0,0
1,000007905a28d863f6d0d597892cd692,4.0,11.648103,11.643147,8.31801,3.068359,7.5625,6.988281,22.18071,165,...,0,1,0,0,0,1,0,0,0,0
2,00001a18d69bb60bda9779408dcf02ac,4.0,13.768283,12.222207,9.011035,2.740234,7.222656,6.648438,0.0,431,...,0,1,0,0,1,0,0,0,0,0
3,00001b924fcc6922321cfadbafd8a91a,2.0,11.712637,11.702124,8.31801,2.630859,7.222656,6.648438,2.079442,228,...,0,1,0,0,1,0,0,0,0,0
4,000028150912f45b1dc667164de489ef,2.0,12.628873,12.628536,8.31801,2.837891,7.273438,6.808594,0.0,164,...,0,1,0,0,0,0,1,0,0,0


In [8]:
# Read only the first three million rows of the pre-processed training data,
# parsed with the same `dtypes` mapping as the test set.
df_subset = pd.read_csv(
    '../input/train-data-modified/train_df.csv',
    dtype=dtypes,
    nrows=3_000_000,
)

In [9]:
# Target vector: the HasDetections label.
main_y = df_subset['HasDetections']

# Feature matrix: every column except the label and the machine identifier.
main_X = df_subset.drop(columns=['HasDetections', 'MachineIdentifier'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    main_X,
    main_y,
    test_size=0.3,
    random_state=42,
)

##### Scaling the data:

In [10]:
# Standardise the features. The scaler is fitted on the training split only and
# then re-used, unchanged, on the hold-out split so no information from the
# evaluation rows leaks into the scaling statistics.
#
# The transformed arrays are wrapped back into DataFrames that keep BOTH the
# column names and the original row index. Without `index=` the frames would get
# a fresh 0..n-1 index while y_train / y_test keep their shuffled labels, so any
# index-aligned pandas operation between features and target would silently
# pair up the wrong rows.
#
# NOTE: this cell overwrites X_train / X_test with their scaled versions, so
# re-running it without re-running the split cell first would re-fit the scaler
# on already-scaled data.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=main_X.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=main_X.columns,
    index=X_test.index,
)

##### Creating, fitting and evaluating the model.

In [None]:
# Train a gradient-boosting classifier on the scaled training split.
# A fixed random_state makes the fitted model reproducible across runs.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)

# Hard class labels are what accuracy needs.
y_pred = gbc.predict(X_test)

# ROC-AUC must be computed from a continuous score, not from thresholded
# labels: feeding 0/1 predictions collapses the ROC curve to a single point and
# under-reports the model's ranking quality. predict_proba orders its columns
# like gbc.classes_ (sorted), so column 1 is the larger label
# (HasDetections == 1, assuming the target only takes the values 0 and 1).
y_score = gbc.predict_proba(X_test)[:, 1]

print('Accuracy Score: ', (accuracy_score(y_test, y_pred)))
print('ROC-AUC score: ', metrics.roc_auc_score(y_test, y_score))

In [None]:
# Build the Kaggle submission file.
#
# The test features must go through exactly the same pipeline as the training
# features:
#   * select the training columns, in the training order (main_X.columns),
#     rather than "everything except MachineIdentifier" -- this ignores any
#     extra column in test_df and fails loudly with a KeyError if a training
#     column is missing;
#   * apply the scaler that was fitted on the training split, because the
#     model only ever saw standardised values.
kaggle_test_X = pd.DataFrame(
    scaler.transform(test_df[main_X.columns]),
    columns=main_X.columns,
)

# Probability of the positive class (second column of predict_proba for a
# binary target).
kaggle_test_yhat = gbc.predict_proba(kaggle_test_X)[:, 1]

# Assemble the submission in a new frame instead of adding a column to test_df,
# so the loaded test data stays untouched and this cell can be re-run safely.
to_submit = pd.DataFrame({
    "MachineIdentifier": test_df["MachineIdentifier"],
    "HasDetections": kaggle_test_yhat,
})
to_submit.to_csv("class-gbc.csv", index=False)


In [None]:
# Disabled experiment, kept as a bare string literal so none of it executes:
# an alternative 50/50 split that drops only the target column.
# NOTE(review): it refers to `df`, which is not assigned anywhere in the visible
# notebook (the loaded frame is `df_subset`) -- confirm before re-enabling.
'''main_X = df.drop(['HasDetections'], axis=1)
main_y = df['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(main_X, main_y, test_size = 0.5, random_state = 42)'''

In [None]:
# Disabled experiment, kept as a bare string literal so none of it executes:
# out-of-core training of an SGDClassifier, reading the training CSV in chunks
# of 20000 rows and calling partial_fit on each chunk.
# NOTE(review): recent scikit-learn releases spell this loss 'log_loss' rather
# than 'log' -- check against the installed version before re-enabling.
'''chunksize = 20000
clf = SGDClassifier(alpha=.0001, loss='log', penalty='l2', n_jobs=-1, shuffle=True)
for train_df in tqdm(pd.read_csv("/kaggle/working/train_df.csv", dtype = dtypes, chunksize=chunksize, iterator=True)):
    X = train_df.drop(['HasDetections', 'MachineIdentifier'], axis=1)
    Y = train_df['HasDetections']
    clf.partial_fit(X, Y, classes=[0,1])'''

In [None]:
#pca = PCA(n_components = 0.97) # hyperparameter
#principalComponents = pca.fit_transform(X_train)

In [None]:
#lr = LogisticRegression()
#lr.fit(principalComponents, y_train)

In [None]:
#test_principalComponents = pca.transform(X_test)

In [None]:
#yhat_test = lr.predict(test_principalComponents)
#print("Accuracy of PCA: ", 100 * accuracy_score(y_test, yhat_test))
#print('ROC-AUC score: ', (100 * metrics.roc_auc_score(y_test, yhat_test)))

In [None]:
# Disabled experiment, kept as a bare string literal so none of it executes:
# a plain LogisticRegression baseline fitted on X_train and scored on X_test.
# NOTE(review): the ROC-AUC line scores hard predictions (y_pred); if this is
# re-enabled, consider scoring predict_proba output instead.
'''logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Accuracy Score: ', (accuracy_score(y_test, y_pred)))
print('ROC-AUC score: ', metrics.roc_auc_score(y_test, y_pred))'''