In [None]:
# notebook to implement models
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection as model_selection
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
import sklearn.preprocessing as preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from tqdm.notebook import tqdm
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb


In [None]:
!pip install parfit
import parfit.parfit as pf

##### Finding the best hyperparameters for XGBoost: manual fine-tuning first, followed by a grid search.

In [None]:
from sklearn.model_selection import ParameterGrid

##### Specifying the datatypes (as obtained from previous notebooks) for loading the data quickly.

In [None]:
# Column -> dtype mapping handed to pd.read_csv. The columns are listed per
# dtype and the grouping is flattened into a plain {column: dtype} dict.
dtypes = {
    column: dtype
    for dtype, columns in {
        'category': [
            'MachineIdentifier',
        ],
        'int64': [
            'EngineVersion',
            'AppVersion',
            'CountryIdentifier',
            'GeoNameIdentifier',
            'LocaleEnglishNameIdentifier',
            'OsBuild',
            'OsPlatformSubRelease',
            'OsBuildLab',
            'SkuEdition',
            'IeVerIdentifier',
            'Census_MDC2FormFactor',
            'Census_ProcessorManufacturerIdentifier',
            'Census_ChassisTypeName',
            'Census_PowerPlatformRoleName',
            'Census_OSVersion',
            'Census_OSBranch',
            'Census_OSBuildNumber',
            'Census_OSEdition',
            'Census_OSSkuName',
            'Census_OSInstallTypeName',
            'Census_OSInstallLanguageIdentifier',
            'Census_OSUILocaleIdentifier',
            'Census_FlightRing',
            'Census_FirmwareManufacturerIdentifier',
            'Wdft_RegionIdentifier',
        ],
        'int32': [
            'Census_OSBuildRevision',
        ],
        'int8': [
            'Census_HasOpticalDiskDrive',
            'Census_IsSecureBootEnabled',
            'Census_IsTouchEnabled',
            'HasDetections',
        ],
        'float32': [
            'Census_PrimaryDiskTotalCapacity',
            'Census_SystemVolumeTotalCapacity',
            'Census_TotalPhysicalRAM',
            'Census_InternalBatteryNumberOfCharges',
        ],
        'float16': [
            'AVProductsInstalled',
            'AVProductsEnabled',
            'IsProtected',
            'Firewall',
            'Census_ProcessorCoreCount',
            'Census_InternalPrimaryDiagonalDisplaySizeInInches',
            'Census_InternalPrimaryDisplayResolutionHorizontal',
            'Census_InternalPrimaryDisplayResolutionVertical',
            'Census_IsFlightsDisabled',
            'Census_IsAlwaysOnAlwaysConnectedCapable',
            'Wdft_IsGamer',
        ],
        'uint8': [
            'Processor_arm64',
            'Processor_x64',
            'Processor_x86',
            'OsSuite_256',
            'OsSuite_768',
            'OsSuite_784',
            'Census_PrimaryDiskTypeName_HDD',
            'Census_PrimaryDiskTypeName_SSD',
            'Census_PrimaryDiskTypeName_UNKNOWN',
            'Census_PrimaryDiskTypeName_Unspecified',
            'Census_OSArchitecture_amd64',
            'Census_OSArchitecture_arm64',
            'Census_OSArchitecture_x86',
            'Census_OSWUAutoUpdateOptionsName_AutoInstallAndRebootAtMaintenanceTime',
            'Census_OSWUAutoUpdateOptionsName_DownloadNotify',
            'Census_OSWUAutoUpdateOptionsName_FullAuto',
            'Census_OSWUAutoUpdateOptionsName_Notify',
            'Census_OSWUAutoUpdateOptionsName_Off',
            'Census_OSWUAutoUpdateOptionsName_UNKNOWN',
            'Census_GenuineStateName_INVALID_LICENSE',
            'Census_GenuineStateName_IS_GENUINE',
            'Census_GenuineStateName_OFFLINE',
            'Census_GenuineStateName_UNKNOWN',
            'Census_ActivationChannel_OEM:DM',
            'Census_ActivationChannel_OEM:NONSLP',
            'Census_ActivationChannel_Retail',
            'Census_ActivationChannel_Retail:TB:Eval',
            'Census_ActivationChannel_Volume:GVLK',
            'Census_ActivationChannel_Volume:MAK',
        ],
    }.items()
    for column in columns
}


In [None]:
# Read the test CSV, applying the column dtype mapping while parsing.
test_df = pd.read_csv(
    '../input/rizwanv2/test_df.csv',
    dtype=dtypes,
)

In [None]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

In [None]:
# Read only the first two million rows of the training CSV, with the same
# dtype mapping applied while parsing.
df_subset = pd.read_csv(
    '../input/rizwanv2/train_df.csv',
    dtype=dtypes,
    nrows=2_000_000,
)

In [None]:
# Target column first, then the feature frame: everything except the target
# and the MachineIdentifier column.
main_y = df_subset['HasDetections']
main_X = df_subset.drop(columns=['HasDetections', 'MachineIdentifier'])

# 80/20 split with a fixed seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    main_X,
    main_y,
    test_size=0.2,
    random_state=42,
)


##### Defining the grid of candidate hyperparameters and running the grid search over it.

In [None]:
# Candidate hyperparameters for xgb.XGBClassifier.
# The keys use XGBoost's own parameter names. The previous grid used
# SGDClassifier names: `loss` and `penalty` are not XGBoost parameters, and
# `alpha` is XGBoost's L1 regularisation term rather than the learning rate.
grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3],     # step-size shrinkage; starting range, widen or narrow as needed
    'n_estimators': [300, 350, 400, 450, 500],   # number of boosting rounds
    'objective': ['binary:logistic'],            # binary classification with probability output
    'reg_lambda': [1.0],                         # L2 regularisation on the weights
    'n_jobs': [-1],                              # use all available cores
}
paramGrid = ParameterGrid(grid)

# Fit one model per grid point on the training split and score each on the
# held-out split with ROC AUC (higher is better); bestFit returns the winning
# model and score along with every model and score it tried.
bestModel, bestScore, allModels, allScores = pf.bestFit(
    xgb.XGBClassifier,
    paramGrid,
    X_train,
    y_train,
    X_test,
    y_test,
    metric=metrics.roc_auc_score,
    greater_is_better=True,
    scoreLabel="AUC",
)

In [1]:
# Display the model selected by the grid search. `bestModel` only exists after
# the pf.bestFit cell has run in this kernel; running this cell on its own
# raises NameError.
bestModel

NameError: name 'bestModel' is not defined