In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import random
import pandas_profiling as pdp
import scipy.stats as stats


%matplotlib inline


In [None]:
# Stretch the notebook container to the full browser width by injecting a CSS
# override into the page.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Apply one matplotlib style sheet to every figure drawn after this cell.
PLOT_STYLE = 'dark_background'
plt.style.use(PLOT_STYLE)

In [None]:
# Column name -> dtype mapping handed to pd.read_csv when loading train.csv.
# Entries are kept in their original order. Columns marked "widened" were
# previously declared with the narrower type named in the comment.
dtypes = {
    "MachineIdentifier": "category",
    "ProductName": "category",
    "EngineVersion": "category",
    "AppVersion": "category",
    "AvSigVersion": "category",
    "IsBeta": "int8",
    "RtpStateBitfield": "float16",
    "IsSxsPassiveMode": "int8",
    "DefaultBrowsersIdentifier": "float32",
    "AVProductStatesIdentifier": "float32",
    "AVProductsInstalled": "float16",
    "AVProductsEnabled": "float16",
    "HasTpm": "int8",
    "CountryIdentifier": "int16",
    "CityIdentifier": "float32",
    "OrganizationIdentifier": "float16",
    "GeoNameIdentifier": "float16",
    "LocaleEnglishNameIdentifier": "int16",
    "Platform": "category",
    "Processor": "category",
    "OsVer": "category",
    "OsBuild": "int16",
    "OsSuite": "int16",
    "OsPlatformSubRelease": "category",
    "OsBuildLab": "category",
    "SkuEdition": "category",
    "IsProtected": "float16",
    "AutoSampleOptIn": "int8",
    "PuaMode": "category",
    "SMode": "float16",
    "IeVerIdentifier": "float16",
    "SmartScreen": "category",
    "Firewall": "float16",
    "UacLuaenable": "float64",  # widened from float32
    "Census_MDC2FormFactor": "category",
    "Census_DeviceFamily": "category",
    "Census_OEMNameIdentifier": "float32",  # widened from float16
    "Census_OEMModelIdentifier": "float32",
    "Census_ProcessorCoreCount": "float16",
    "Census_ProcessorManufacturerIdentifier": "float16",
    "Census_ProcessorModelIdentifier": "float32",  # widened from float16
    "Census_ProcessorClass": "category",
    "Census_PrimaryDiskTotalCapacity": "float64",  # widened from float32
    "Census_PrimaryDiskTypeName": "category",
    "Census_SystemVolumeTotalCapacity": "float64",  # widened from float32
    "Census_HasOpticalDiskDrive": "int8",
    "Census_TotalPhysicalRAM": "float32",
    "Census_ChassisTypeName": "category",
    "Census_InternalPrimaryDiagonalDisplaySizeInInches": "float32",  # widened from float16
    "Census_InternalPrimaryDisplayResolutionHorizontal": "float32",  # widened from float16
    "Census_InternalPrimaryDisplayResolutionVertical": "float32",  # widened from float16
    "Census_PowerPlatformRoleName": "category",
    "Census_InternalBatteryType": "category",
    "Census_InternalBatteryNumberOfCharges": "float64",  # widened from float32
    "Census_OSVersion": "category",
    "Census_OSArchitecture": "category",
    "Census_OSBranch": "category",
    "Census_OSBuildNumber": "int16",
    "Census_OSBuildRevision": "int32",
    "Census_OSEdition": "category",
    "Census_OSSkuName": "category",
    "Census_OSInstallTypeName": "category",
    "Census_OSInstallLanguageIdentifier": "float16",
    "Census_OSUILocaleIdentifier": "int16",
    "Census_OSWUAutoUpdateOptionsName": "category",
    "Census_IsPortableOperatingSystem": "int8",
    "Census_GenuineStateName": "category",
    "Census_ActivationChannel": "category",
    "Census_IsFlightingInternal": "float16",
    "Census_IsFlightsDisabled": "float16",
    "Census_FlightRing": "category",
    "Census_ThresholdOptIn": "float16",
    "Census_FirmwareManufacturerIdentifier": "float16",
    "Census_FirmwareVersionIdentifier": "float32",
    "Census_IsSecureBootEnabled": "int8",
    "Census_IsWIMBootEnabled": "float16",
    "Census_IsVirtualDevice": "float16",
    "Census_IsTouchEnabled": "int8",
    "Census_IsPenCapable": "int8",
    "Census_IsAlwaysOnAlwaysConnectedCapable": "float16",
    "Wdft_IsGamer": "float16",
    "Wdft_RegionIdentifier": "float16",
    "HasDetections": "int8",
}

In [None]:
# Reading the whole file is extremely slow, so only a random subset of the data
# rows is loaded: each row is kept with probability SAMPLE_FRACTION.
TRAIN_CSV_PATH = r'/home/raz1/DS Projects/MicrosoftChallenge/data/train.csv'
SAMPLE_FRACTION = 0.005
SAMPLE_SEED = 42

# A dedicated, seeded generator makes the sample reproducible: re-running this
# cell (or restarting the kernel) selects the same rows, and the draw does not
# depend on how much of the global `random` state other cells have consumed.
_sample_rng = random.Random(SAMPLE_SEED)

traindf = pd.read_csv(
    TRAIN_CSV_PATH,
    dtype=dtypes,
    header=0,
    # Line 0 is the header and is never skipped; every later line is skipped
    # unless its random draw falls at or below SAMPLE_FRACTION.
    skiprows=lambda i: i > 0 and _sample_rng.random() > SAMPLE_FRACTION,
)


In [None]:
# profile = pdp.ProfileReport(traindf)
# 
# profile
# 
# profile_detections = pdp.ProfileReport(traindf[traindf.HasDetections == 1][['SmartScreen']])
# profile_nodetections = pdp.ProfileReport(traindf[traindf.HasDetections == 0][['SmartScreen']])

In [None]:
# Columns singled out for closer inspection in the cells that follow.
# NOTE(review): the meaning of the "yellow" label is not shown in this notebook.
yellow_features = [
    "OsVer",
    "OsBuild",
    "OsSuite",
    "OsPlatformSubRelease",
    "OsBuildLab",
    "SkuEdition",
    "Census_OSVersion",
    "Census_OSBuildNumber",
    "Census_OSBuildRevision",
    "Census_OSEdition",
    "Census_OSSkuName",
]

In [None]:
# Show the distinct SkuEdition values present in the sampled frame.
traindf.SkuEdition.unique()

In [None]:
# Number of sampled machines per SkuEdition.
# SkuEdition is loaded as a categorical column, so each category is counted
# explicitly and drawn as one bar; a histogram (Series.hist) is designed for
# binning numeric values and does not guarantee one bar per category.
sku_ax = traindf['SkuEdition'].value_counts().plot.barh()
sku_ax.set(title='Machines per SkuEdition', xlabel='machines in sample', ylabel='SkuEdition');

In [None]:
# Side-by-side horizontal bar charts of machine counts per SkuEdition:
# left panel = machines with HasDetections == 0, right panel = HasDetections == 1.
#
# A single grouped count feeds both panels, so each panel's labels and widths
# are guaranteed to come from the same rows. (Previously the left panel took its
# labels from the HasDetections == 1 group and its widths from the == 0 group,
# and the same groupby was recomputed four times over every column.)
sku_counts = (
    traindf
    .groupby(['SkuEdition', 'HasDetections'], observed=False)['MachineIdentifier']
    .count()
    .unstack(fill_value=0)
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
fig.subplots_adjust(wspace=0.5)

# Ticks every 1,000 machines, labelled in thousands; identical on both panels
# so the two charts can be compared by eye.
tick_positions = [1000 * i for i in range(20)]
tick_labels = list(range(20))

for panel, outcome in ((ax1, 0), (ax2, 1)):
    panel.barh(y=sku_counts.index, width=sku_counts[outcome])
    panel.set_xticks(tick_positions)
    panel.set_xticklabels(tick_labels)
    panel.grid(which='major', axis='x')
    panel.set_title('HasDetections == {}'.format(outcome))
    panel.set_xlabel('machines (thousands)')

ax1.set_ylabel('SkuEdition');

In [None]:
# Binary indicator column: 1 where SkuEdition equals 'Home', 0 otherwise.
is_home_sku = traindf['SkuEdition'] == 'Home'
traindf['SkuHome'] = is_home_sku.astype(int)

In [None]:
# Two-sample t-test comparing the SkuHome indicator between machines without
# detections and machines with detections.
home_without_detections = traindf.loc[traindf['HasDetections'] == 0, 'SkuHome']
home_with_detections = traindf.loc[traindf['HasDetections'] == 1, 'SkuHome']
stats.ttest_ind(home_without_detections, home_with_detections)

In [None]:
# Replace the raw 'SkuEdition' column with its binary 'SkuHome' indicator in the
# feature list.
# The list is rebuilt rather than mutated with remove()/append() so the cell is
# idempotent: list.remove raises ValueError once 'SkuEdition' is already gone,
# and a repeated append would add 'SkuHome' twice.
yellow_features = [name for name in yellow_features if name not in ('SkuEdition', 'SkuHome')]
yellow_features.append('SkuHome')

In [None]:
# Pairwise correlation matrix of the numeric yellow features.
# Several yellow features (e.g. 'OsVer', 'OsBuildLab') are loaded with the
# 'category' dtype, which has no numeric correlation. Selecting the numeric
# columns explicitly gives the same result on every pandas version: older
# releases silently dropped non-numeric columns in .corr(), newer ones raise.
corrs = traindf[yellow_features].select_dtypes(include='number').corr()
