In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import plot_roc_curve, roc_curve

In [2]:
dtypes = {'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float32',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float32',
        'AVProductsEnabled':                                    'float32',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float32',
        'GeoNameIdentifier':                                    'float32',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float32',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float32',
        'IeVerIdentifier':                                      'float32',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float32',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float32',
        'Census_ProcessorManufacturerIdentifier':               'float32',
        'Census_ProcessorModelIdentifier':                      'float32',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float32',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float32',
        'Census_IsFlightsDisabled':                             'float32',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float32',
        'Census_FirmwareManufacturerIdentifier':                'float32',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float32',
        'Census_IsVirtualDevice':                               'float32',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float32',
        'Wdft_IsGamer':                                         'float32',
        'Wdft_RegionIdentifier':                                'float32',
        'HasDetections':                                        'int8'}

In [2]:
df = pd.read_csv('train.csv', low_memory=True)
#df = pd.read_csv('train.csv',dtype=dtypes, low_memory=True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
columncounts = df.describe(include='all').loc[['count']]/df.shape[0]
missingcolumns = columncounts.columns[(columncounts < .75).any(axis=0)]
df_reduce = df.drop(columns=missingcolumns)
df_nonan = df_reduce.dropna()
df = df_nonan

In [4]:
df_y = df['HasDetections'].copy()
df=df.drop(columns='HasDetections')
df=df.drop(columns='MachineIdentifier')
df_y.value_counts(normalize=True)

1    0.50692
0    0.49308
Name: HasDetections, dtype: float64

# AntiVirus Only Aggregate

In [5]:
AVVar_Agg = ['EngineVersion', 'AppVersion', 'AvSigVersion','AVProductStatesIdentifier']
df_AV_agg =df[AVVar_Agg].copy()

In [6]:
#df_AV_agg['AvSigVersion']=df_AV_agg['AvSigVersion'].str.replace('.','')
for col in AVVar_Agg[0:3] :
    df_AV_agg[col]=df_AV_agg[col].str.replace('[.,&,#,x,;]','')
    df_AV_agg[col]=df_AV_agg[col].astype(float)

  df_AV_agg[col]=df_AV_agg[col].str.replace('[.,&,#,x,;]','')


In [7]:
oldlength = np.zeros(len(AVVar_Agg))
newlength = np.zeros(len(AVVar_Agg))
keptvals = []
threshold = .99
count = 0
df_len = len(df_AV_agg)
ones= np.ones(df_len)

for col in AVVar_Agg :
    temp = df_AV_agg[col]
    temp_unique = df_AV_agg[col].unique()
    oldlength[count] = len(temp_unique)
    print(oldlength[count])
    
    if len(temp_unique) > 10 :
        cumcounts = 0
        temp_counts = temp.value_counts(normalize=True)
        temp_column = np.zeros(len(temp))
        
        for i in range(0,len(temp_counts)):
            cumcounts += temp_counts.values[i]
            #print(cumcounts)
            index = float(temp_counts.index[i])
            temp_column += ones*(temp==index)*index

            if cumcounts > threshold :
                break
        
        temp_column += ones*(temp_column == 0)*-1
        df_AV_agg[col] = temp_column
        
    newlength[count] = len(df_AV_agg[col].unique())
    print(newlength[count])
    count += 1    

60.0
9.0
104.0
28.0
7346.0
1772.0
23107.0
1320.0


# AntiVirus Variables Only

In [None]:
df_AV_AVPSOnly = df['AVProductStatesIdentifier'].copy()
df_AV_AVPSOnly['AVProductStatesIdentifier'] = df_AV_agg['AVProductStatesIdentifier']


ohe_AVPSOnly = OneHotEncoder(categories='auto').fit(df_AV_AVPSOnly)
df_AV_AVPSOnly_ohe = ohe_AVPSOnly.transform(df_AV_AVPSOnly)
df_AV_AVPSOnly_ohe_columns = ohe_AVPSOnly.get_feature_names()
print(len(df_AV_AVPSOnly_ohe_columns))

In [30]:
cv_AVPSonly_accs = np.zeros((5,2))
cv_AVPSonly_roc = np.zeros((5,2))

kfold = StratifiedKFold(5, shuffle=True, random_state=869)

i = 0
for train_index,test_index in kfold.split(df_AV_AVPSonly_ohe, df_y):

#X_train_train, X_holdout, y_train_train, y_holdout = train_test_split(df_AV_ohe, df_y, test_size=0.2, random_state=842)
    X_train_train = df_AV_AVPSonly_ohe[train_index,:]
    X_holdout = df_AV_AVPSonly_ohe[test_index,:]
    y_train_train = df_y.iloc[train_index]
    y_holdout = df_y.iloc[test_index]
   
    lr_AVPSonly = LogisticRegression(max_iter=35000)
    lr_AVPSonly.fit(X_train_train, y_train_train)
        
    cv_AVPSonly_accs[i,1] = accuracy_score(y_holdout, lr_AVPSonly.predict(X_holdout))
    cv_AVPSonly_roc[i,1] = roc_auc_score(y_holdout, lr_AVPSonly.predict(X_holdout))
            
    i  = i + 1

In [31]:
print(np.mean(cv_AVPSonly_accs,axis=0))
print(np.mean(cv_AVPSonly_roc,axis=0))

[0.         0.58349517]
[0.         0.58058422]


In [32]:
score_AVPSonly_df = pd.DataFrame({'feature':df_AV_AVPSonly_ohe_columns,
                            'coefficient': lr_AVPSonly.coef_[0]})

pd.set_option('display.max_rows', None)
score_AVPSonly_df.sort_values('coefficient',ascending=False)

Unnamed: 0,feature,coefficient
1214,x7_63554.0,2.462449
1211,x7_63295.0,2.198366
1212,x7_63390.0,2.146335
1215,x7_63682.0,2.145014
302,x7_17386.0,1.94087
510,x7_35087.0,1.937663
1210,x7_63249.0,1.87402
320,x7_18303.0,1.810661
660,x7_43667.0,1.801632
286,x7_16655.0,1.788279


In [None]:
score_AVPSonly_df = pd.DataFrame({'feature':df_AV_AVPSonly_ohe_columns,
                            'coefficient': lr_AVPSonly.coef_[0]})

pd.set_option('display.max_rows', None)
score_AVPSonly_df.sort_values('coefficient',ascending=False)

In [112]:
cv_AVPSonly_tree_accs = np.zeros((5,2))
cv_AVPSonly_tree_roc = np.zeros((5,2))

kfold = StratifiedKFold(5, shuffle=True, random_state=869)

i = 0
for train_index,test_index in kfold.split(df_AV_AVPSonly_ohe, df_y):

#X_train_train, X_holdout, y_train_train, y_holdout = train_test_split(df_AV_ohe, df_y, test_size=0.2, random_state=842)
    X_train_train = df_AV_AVPSonly_ohe[train_index,:]
    X_holdout = df_AV_AVPSonly_ohe[test_index,:]
    y_train_train = df_y.iloc[train_index]
    y_holdout = df_y.iloc[test_index]
    
    y_baseline = np.ones(len(y_holdout))
    #y_baselinepreds = np.zeros((len(df_y),2))
    #y_baselinepreds[:,1] = np.ones((len(df_y)))
    
    cv_AVPSonly_tree_accs[i,0] = accuracy_score(y_holdout, y_baseline)
    cv_AVPSonly_tree_roc[i,0] = roc_auc_score(y_holdout, y_baseline)
    
    tree_AVPSonly = DecisionTreeClassifier(max_depth = 10)
    tree_AVPSonly.fit(X_train_train, y_train_train)
        
    cv_AVPSonly_tree_accs[i,1] = accuracy_score(y_holdout, tree_AVPSonly.predict(X_holdout))
    cv_AVPSonly_tree_roc[i,1] = roc_auc_score(y_holdout, tree_AVPSonly.predict(X_holdout))
            
    i  = i + 1

In [115]:
print(np.mean(cv_AVPSonly_tree_accs,axis=0))
print(np.mean(cv_AVPSonly_tree_roc,axis=0))

[0.50691956 0.58163892]
[0.5        0.57889574]


In [113]:
score_AVPSonly_df = pd.DataFrame({'feature':df_AV_AVPSonly_ohe_columns,
                            'importance_score': tree_AVPSonly.feature_importances_})

score_AVPSonly_df.sort_values('importance_score',ascending=False).head(5)

Unnamed: 0,feature,importance_score
982,x7_53447.0,0.763964
1215,x7_63682.0,0.043063
805,x7_47238.0,0.037063
674,x7_43856.0,0.032218
761,x7_46413.0,0.028339


In [11]:
AVVar_AVPSExclude = ['ProductName', 'EngineVersion', 'AppVersion', 'AvSigVersion', 'IsBeta', 'RtpStateBitfield', 'IsSxsPassiveMode', 
               'AVProductsInstalled', 'AVProductsEnabled', 'HasTpm', 'IsProtected', 'SMode', 
               'Firewall', 'UacLuaenable', 'Census_IsFlightsDisabled', 'Census_FlightRing', 'Census_IsSecureBootEnabled']
AVVar_Agg_AVPSExclude = ['EngineVersion', 'AppVersion', 'AvSigVersion']

df_AV_AVPSExclude = df[AVVar_AVPSExclude].copy()
df_AV_AVPSExclude[AVVar_Agg_AVPSExclude] = df_AV_agg[AVVar_Agg_AVPSExclude]


ohe_AVPSExclude = OneHotEncoder(categories='auto').fit(df_AV_AVPSExclude)
df_AV_AVPSExclude_ohe = ohe_AVPSExclude.transform(df_AV_AVPSExclude)
df_AV_AVPSExclude_ohe_columns = ohe_AVPSExclude.get_feature_names()
print(len(df_AV_AVPSExclude_ohe_columns))

1866


In [20]:
cv_AVPSExclude_accs = np.zeros((5,2))
cv_AVPSExclude_roc = np.zeros((5,2))

kfold = StratifiedKFold(5, shuffle=True, random_state=869)

i = 0
for train_index,test_index in kfold.split(df_AV_AVPSExclude_ohe, df_y):

#X_train_train, X_holdout, y_train_train, y_holdout = train_test_split(df_AV_ohe, df_y, test_size=0.2, random_state=842)
    X_train_train = df_AV_AVPSExclude_ohe[train_index,:]
    X_holdout = df_AV_AVPSExclude_ohe[test_index,:]
    y_train_train = df_y.iloc[train_index]
    y_holdout = df_y.iloc[test_index]
   
    lr_AVPSExclude = LogisticRegression(max_iter=35000)
    lr_AVPSExclude.fit(X_train_train, y_train_train)
        
    cv_AVPSExclude_accs[i,1] = accuracy_score(y_holdout, lr_AVPSExclude.predict(X_holdout))
    cv_AVPSExclude_roc[i,1] = roc_auc_score(y_holdout, lr_AVPSExclude.predict(X_holdout))
            
    i  = i + 1

In [21]:
print(np.mean(cv_AVPSExclude_accs,axis=0))
print(np.mean(cv_AVPSExclude_roc,axis=0))

[0.         0.59306138]
[0.         0.59176928]


In [26]:
score_AVPSExclude_df = pd.DataFrame({'feature':df_AV_AVPSExclude_ohe_columns,
                            'coefficient': lr_AVPSExclude.coef_[0]})

pd.set_option('display.max_rows', None)
score_AVPSExclude_df.sort_values('coefficient',ascending=False)

Unnamed: 0,feature,coefficient
1853,x14_0.0,1.658841
1820,x5_8.0,1.207392
1821,x5_35.0,1.10417
1557,x3_127315270.0,0.982001
1549,x3_127315040.0,0.93514
1548,x3_127315010.0,0.923158
1546,x3_127314940.0,0.89557
1550,x3_127315050.0,0.8919
1547,x3_127314980.0,0.883472
1554,x3_127315190.0,0.85642


In [29]:
print(AVVar_AVPSExclude[14])
print(AVVar_AVPSExclude[5])
print(AVVar_AVPSExclude[3])
print(AVVar_AVPSExclude[11])

Census_IsFlightsDisabled
RtpStateBitfield
AvSigVersion
SMode


In [13]:
cv_AVPSExclude_tree_accs = np.zeros((5,2))
cv_AVPSExclude_tree_roc = np.zeros((5,2))

kfold = StratifiedKFold(5, shuffle=True, random_state=869)

i = 0
for train_index,test_index in kfold.split(df_AV_AVPSExclude_ohe, df_y):

#X_train_train, X_holdout, y_train_train, y_holdout = train_test_split(df_AV_ohe, df_y, test_size=0.2, random_state=842)
    X_train_train = df_AV_AVPSExclude_ohe[train_index,:]
    X_holdout = df_AV_AVPSExclude_ohe[test_index,:]
    y_train_train = df_y.iloc[train_index]
    y_holdout = df_y.iloc[test_index]
    
    y_baseline = np.ones(len(y_holdout))
    #y_baselinepreds = np.zeros((len(df_y),2))
    #y_baselinepreds[:,1] = np.ones((len(df_y)))
    
    cv_AVPSExclude_tree_accs[i,0] = accuracy_score(y_holdout, y_baseline)
    cv_AVPSExclude_tree_roc[i,0] = roc_auc_score(y_holdout, y_baseline)
    
    tree_AVPSExclude = DecisionTreeClassifier(max_depth = 10)
    tree_AVPSExclude.fit(X_train_train, y_train_train)
        
    cv_AVPSExclude_tree_accs[i,1] = accuracy_score(y_holdout, tree_AVPSExclude.predict(X_holdout))
    cv_AVPSExclude_tree_roc[i,1] = roc_auc_score(y_holdout, tree_AVPSExclude.predict(X_holdout))
            
    i  = i + 1

In [16]:
print(np.mean(cv_AVPSExclude_tree_accs,axis=0))
print(np.mean(cv_AVPSExclude_tree_roc,axis=0))

[0.50691956 0.59411728]
[0.5       0.5920764]


In [18]:
score_AVPSExclude_df = pd.DataFrame({'feature':df_AV_AVPSExclude_ohe_columns,
                            'importance_score': tree_AVPSExclude.feature_importances_})

score_AVPSExclude_df.sort_values('importance_score',ascending=False).head(5)

Unnamed: 0,feature,importance_score
1824,x7_1.0,0.436608
10,x1_11151001.0,0.146487
11,x1_11152001.0,0.117418
32,x2_418180718075.0,0.049253
1864,x16_0,0.022555


In [28]:
print(AVVar_AVPSExclude[7])
print(AVVar_AVPSExclude[1])
print(AVVar_AVPSExclude[2])
print(AVVar_AVPSExclude[16])

AVProductsInstalled
EngineVersion
AppVersion
Census_IsSecureBootEnabled


In [10]:
df_AV_NoAgg_AVPSOnly = df['AVProductStatesIdentifier'].copy()

In [30]:
test = df_AV_NoAgg_AVPSOnly_save.to_numpy()
df_AV_NoAgg_AVPSOnly = test.reshape(-1,1)

In [33]:
ohe_AVPSOnly_NoAgg = OneHotEncoder(categories='auto').fit(df_AV_NoAgg_AVPSOnly)
df_AV_NoAgg_AVPSOnly_ohe = ohe_AVPSOnly_NoAgg.transform(df_AV_NoAgg_AVPSOnly)
df_AV_NoAgg_AVPSOnly_ohe_columns = ohe_AVPSOnly_NoAgg.get_feature_names()
print(len(df_AV_NoAgg_AVPSOnly_ohe_columns))

23107


In [35]:
cv_AVPSOnly_NoAgg_tree_accs = np.zeros((5,2))
cv_AVPSOnly_NoAgg_tree_roc = np.zeros((5,2))

kfold = StratifiedKFold(5, shuffle=True, random_state=869)

i = 0
for train_index,test_index in kfold.split(df_AV_NoAgg_AVPSOnly_ohe, df_y):

#X_train_train, X_holdout, y_train_train, y_holdout = train_test_split(df_AV_ohe, df_y, test_size=0.2, random_state=842)
    X_train_train = df_AV_NoAgg_AVPSOnly_ohe[train_index,:]
    X_holdout = df_AV_NoAgg_AVPSOnly_ohe[test_index,:]
    y_train_train = df_y.iloc[train_index]
    y_holdout = df_y.iloc[test_index]
    
    y_baseline = np.ones(len(y_holdout))
    #y_baselinepreds = np.zeros((len(df_y),2))
    #y_baselinepreds[:,1] = np.ones((len(df_y)))
    
    cv_AVPSOnly_NoAgg_tree_accs[i,0] = accuracy_score(y_holdout, y_baseline)
    cv_AVPSOnly_NoAgg_tree_roc[i,0] = roc_auc_score(y_holdout, y_baseline)
    
    tree_AVPSOnly_NoAgg = DecisionTreeClassifier(max_depth = 10)
    tree_AVPSOnly_NoAgg.fit(X_train_train, y_train_train)
        
    cv_AVPSOnly_NoAgg_tree_accs[i,1] = accuracy_score(y_holdout, tree_AVPSOnly_NoAgg.predict(X_holdout))
    cv_AVPSOnly_NoAgg_tree_roc[i,1] = roc_auc_score(y_holdout, tree_AVPSOnly_NoAgg.predict(X_holdout))
            
    i  = i + 1

In [36]:
print(np.mean(cv_AVPSOnly_NoAgg_tree_accs,axis=0))
print(np.mean(cv_AVPSOnly_NoAgg_tree_roc,axis=0))

[0.50691956 0.58163892]
[0.5        0.57889574]
