<a href="https://colab.research.google.com/github/Rjlee22/REUproject/blob/main/REU_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import sys
import os

from sklearn.model_selection import train_test_split
# NOTE(review): this reads the entire CSV into `df`, but no later cell visible in this
# notebook uses `df`; the filtered, typed read into `train_df` further down is what the
# pipeline consumes. Confirm nothing else depends on `df` before dropping this load.
df = pd.read_csv('Microsoft_condensed.csv')

In [None]:
# Explicit dtypes handed to pd.read_csv: pandas nullable integers for the flag and
# identifier columns, plain int for the label. Groups are listed in the order the
# columns should appear in the mapping.
column_data_types = {
    column: dtype
    for dtype, columns in (
        ('Int8', ('IsBeta',)),
        ('Int16', ('RtpStateBitfield',)),
        ('Int8', ('IsSxsPassiveMode', 'HasTpm')),
        ('Int64', ('CountryIdentifier', 'CityIdentifier', 'OrganizationIdentifier')),
        ('Int8', (
            'IsProtected',
            'AutoSampleOptIn',
            'SMode',
            'Firewall',
            'Census_HasOpticalDiskDrive',
            'Census_IsPortableOperatingSystem',
            'Census_IsFlightsDisabled',
            'Census_IsSecureBootEnabled',
            'Census_IsWIMBootEnabled',
            'Census_IsVirtualDevice',
            'Census_IsTouchEnabled',
            'Census_IsPenCapable',
            'Census_IsAlwaysOnAlwaysConnectedCapable',
            'Wdft_IsGamer',
        )),
        ('int', ('HasDetections',)),
    )
    for column in columns
}

In [None]:
# Columns skipped when the CSV is loaded into train_df. The tuple is assembled from
# smaller groups so each exclusion keeps its recorded reason next to it.
columns_to_ignore = (
    # Mostly-missing columns (share of NA values noted per column).
    (
        'DefaultBrowsersIdentifier',   # 95.14% NA values
        'PuaMode',                     # 99.97% NA values
        'Census_ProcessorClass',       # 99.59% NA values
        'Census_InternalBatteryType',  # 71.05% NA values
        'Census_IsFlightingInternal',  # 83.04% NA values
        'Census_ThresholdOptIn',       # 63.52% NA values
    )
    # Numerical features.
    + (
        'Census_PrimaryDiskTotalCapacity',
        'Census_SystemVolumeTotalCapacity',
        'Census_TotalPhysicalRAM',
        'Census_InternalPrimaryDisplayResolutionHorizontal',
        'Census_InternalPrimaryDisplayResolutionVertical',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
        'Census_InternalBatteryNumberOfCharges',
    )
    # Further exclusions (no reason recorded alongside them).
    + (
        'IsBeta',
        'AutoSampleOptIn',
        'UacLuaenable',
        'Census_IsWIMBootEnabled',
    )
    + (
        'Census_FlightRing_not',
        'Census_IsAlwaysOnAlwaysConnectedCapable',
        'Census_IsSecureBootEnabled',
        'Census_IsTouchEnabled',
        'Census_IsVirtualDevice',
        'SMode',
    )
)

In [None]:
# Load the modelling frame: every CSV column except those named in columns_to_ignore,
# parsed with the explicit dtypes from column_data_types.
def _keep_column(column_name):
    """Column filter for read_csv: True for columns that are not on the ignore list."""
    return column_name not in columns_to_ignore


train_df = pd.read_csv(
    'Microsoft_condensed.csv',
    usecols=_keep_column,
    dtype=column_data_types,
)
train_df.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,...,Census_GenuineStateName,Census_ActivationChannel,Census_IsFlightsDisabled,Census_FlightRing,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsPenCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,f1cd864e97bae82bdf96523e1a539121,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1234.0,7,0,53447.0,1.0,1.0,...,IS_GENUINE,OEM:DM,0,Retail,355.0,19951.0,0,0,11.0,1
1,fd5ba6f5b75325ec0423a6c67cc75942,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1282.0,7,0,53447.0,1.0,1.0,...,IS_GENUINE,OEM:DM,0,Retail,486.0,48753.0,0,1,3.0,0
2,4e628391e7cc7c482fb3286f486dbd25,win8defender,1.1.15100.1,4.9.10586.1106,1.273.781.0,7,0,46781.0,2.0,1.0,...,IS_GENUINE,Retail,0,Retail,554.0,33111.0,1,0,15.0,0
3,270a2e9d028144a4df12a9e3da79fba5,win8defender,1.1.15200.1,4.18.1807.18075,1.275.1639.0,7,0,53447.0,1.0,1.0,...,IS_GENUINE,Retail,0,Retail,142.0,48473.0,0,0,15.0,1
4,06ca8fa8d32c2abdc5b3577d676b3269,win8defender,1.1.15200.1,4.18.1807.18075,1.275.511.0,7,0,47238.0,2.0,1.0,...,IS_GENUINE,OEM:DM,0,Retail,355.0,4343.0,0,1,13.0,1


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Feature frame: everything except the label and the per-machine identifier.
X = train_df.drop(['HasDetections', 'MachineIdentifier'], axis=1)
y = train_df['HasDetections']

# Route each column to a transformer by dtype.
categorical_cols = X.select_dtypes(include=['object']).columns
# 'number' also matches the nullable Int8/Int16 columns declared in column_data_types.
# Selecting only 'int64'/'float64' left those columns out of both lists, and because
# ColumnTransformer drops unlisted columns by default they never reached the model.
numerical_cols = X.select_dtypes(include='number').columns

# Numerical columns: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros instead of
# raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle both pipelines; columns are picked by name, so extra columns in the frame
# passed to fit/transform are ignored.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [None]:
# Find the columns currently typed as Int16 and cast each of them to float, then show
# the resulting dtypes.
int16_columns = train_df.select_dtypes(include=['Int16']).columns
train_df = train_df.astype({column: float for column in int16_columns})
train_df.dtypes

MachineIdentifier                    object
ProductName                          object
EngineVersion                        object
AppVersion                           object
AvSigVersion                         object
                                     ...   
Census_FirmwareVersionIdentifier    float64
Census_IsPenCapable                    Int8
Wdft_IsGamer                           Int8
Wdft_RegionIdentifier               float64
HasDetections                         int64
Length: 61, dtype: object

In [None]:
#train_df.drop(columns = ['MachineIdentifier'], inplace = True)

In [None]:
# Disabled cell: everything below is one triple-quoted string evaluated as a plain
# expression, so categorical_preprocessing is never actually defined.
# NOTE(review) -- issues to fix before re-enabling:
#   * the two `temp.loc[...] == 'na'` lines compare instead of assigning, so they
#     leave the frame unchanged;
#   * all object columns are lower-cased first, so if Census_PrimaryDiskTypeName is
#     one of them the later check against ['HDD', 'SSD'] can no longer match;
#   * astype(str) runs before the np.nan entry of na_types is checked -- presumably
#     missing values are already the string 'nan' by then; verify.
'''def categorical_preprocessing(df):
    temp = df.copy()

    cols = temp.select_dtypes(include = [object]).columns.tolist()
    temp[cols] = temp[cols].astype(str).apply(lambda x: x.str.lower().apply(sys.intern))

    os_build_lab_cat = 'OsBuildLab'
    if os_build_lab_cat in temp.columns:
        os_build_lab_df = temp[os_build_lab_cat].str.split(pat = '.', n = 5, expand = True)
        os_build_lab_df = os_build_lab_df.astype(str).apply(lambda x: x.str.lower().apply(sys.intern))
        os_build_lab_df = os_build_lab_df.add_prefix(os_build_lab_cat + '_')

        temp = pd.concat([temp, os_build_lab_df], axis = 1)
        temp = temp.drop(columns = os_build_lab_cat)

    smart_screen_cat = 'SmartScreen'
    if smart_screen_cat in temp.columns:
        temp.loc[temp[smart_screen_cat] == 'promt', smart_screen_cat] = 'prompt'
        temp.loc[temp[smart_screen_cat] == '00000000', smart_screen_cat] = '0'
        temp[smart_screen_cat] = temp[smart_screen_cat].astype(str).apply(sys.intern)

    disk_type_cat = 'Census_PrimaryDiskTypeName'
    if disk_type_cat in temp.columns:
        disk_types = ['HDD', 'SSD']
        temp.loc[~temp[disk_type_cat].isin(disk_types), disk_type_cat] == 'na'
        temp[disk_type_cat] = temp[disk_type_cat].astype(str).apply(sys.intern)

    role_name_cat = 'Census_PowerPlatformRoleName'
    if role_name_cat in temp.columns:
        na_types = ['unspecified', 'unknown', np.nan]
        temp.loc[temp[role_name_cat].isin(na_types), role_name_cat] == 'na'
        temp[role_name_cat] = temp[role_name_cat].astype(str).apply(sys.intern)

    return temp'''

"def categorical_preprocessing(df):\n    temp = df.copy()\n\n    cols = temp.select_dtypes(include = [object]).columns.tolist()\n    temp[cols] = temp[cols].astype(str).apply(lambda x: x.str.lower().apply(sys.intern))\n\n    os_build_lab_cat = 'OsBuildLab'\n    if os_build_lab_cat in temp.columns:\n        os_build_lab_df = temp[os_build_lab_cat].str.split(pat = '.', n = 5, expand = True)\n        os_build_lab_df = os_build_lab_df.astype(str).apply(lambda x: x.str.lower().apply(sys.intern))\n        os_build_lab_df = os_build_lab_df.add_prefix(os_build_lab_cat + '_')\n\n        temp = pd.concat([temp, os_build_lab_df], axis = 1)\n        temp = temp.drop(columns = os_build_lab_cat)\n\n    smart_screen_cat = 'SmartScreen'\n    if smart_screen_cat in temp.columns:\n        temp.loc[temp[smart_screen_cat] == 'promt', smart_screen_cat] = 'prompt'\n        temp.loc[temp[smart_screen_cat] == '00000000', smart_screen_cat] = '0'\n        temp[smart_screen_cat] = temp[smart_screen_cat].astype(s

In [None]:
# Disabled cell: the call is wrapped in a string literal, so train_df is not modified here.
'''train_df = categorical_preprocessing(train_df)'''


'train_df = categorical_preprocessing(train_df)'

In [None]:
# Disabled cell (string literal, never executed): would drop duplicate rows from
# train_df in place and pop the 'HasDetections' column out of it into y.
'''label_column = 'HasDetections'
train_df.drop_duplicates(inplace = True)
y = train_df.pop(label_column)'''

"label_column = 'HasDetections'\ntrain_df.drop_duplicates(inplace = True)\ny = train_df.pop(label_column)"

In [None]:
# Split the prepared feature frame X (label and MachineIdentifier already removed)
# instead of train_df, so the 'HasDetections' column never travels inside
# X_train / X_test. Row order, test_size and random_state are unchanged, so the same
# rows land in each split as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.neural_network import MLPClassifier
# Fit the preprocessing on the training split only, then apply it unchanged to the test split.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Hyper-parameters for the scikit-learn multi-layer perceptron.
mlp_settings = dict(
    hidden_layer_sizes=(128, 64, 32),  # three hidden layers: 128, 64 and 32 units
    activation='relu',                 # hidden-layer activation
    solver='adam',                     # optimizer
    alpha=0.0001,                      # L2 regularization strength
    batch_size='auto',                 # minibatch size chosen by scikit-learn
    learning_rate='constant',          # learning-rate schedule
    learning_rate_init=0.001,          # initial learning rate
    max_iter=25,                       # upper bound on training iterations
    shuffle=True,                      # reshuffle samples every iteration
    random_state=42,                   # fixed seed for reproducibility
    tol=0.0001,                        # minimum improvement that counts as progress
    verbose=True,                      # print per-iteration progress
    early_stopping=True,               # stop once the validation score stops improving
    validation_fraction=0.2,           # share of the training data held out for that check
)
mlp = MLPClassifier(**mlp_settings)

# Simpler configuration tried earlier:
#mlp = MLPClassifier(hidden_layer_sizes=(128, 64, 32), max_iter=100, random_state=42)
mlp.fit(X_train_preprocessed, y_train)

Iteration 1, loss = 0.63607970
Validation score: 0.630563
Iteration 2, loss = 0.62276807
Validation score: 0.633387
Iteration 3, loss = 0.61340268
Validation score: 0.631725
Iteration 4, loss = 0.60150761
Validation score: 0.627413
Iteration 5, loss = 0.58655580
Validation score: 0.627062
Iteration 6, loss = 0.56843645
Validation score: 0.621363
Iteration 7, loss = 0.54904088
Validation score: 0.616513
Iteration 8, loss = 0.52969739
Validation score: 0.619812
Iteration 9, loss = 0.51102726
Validation score: 0.611875
Iteration 10, loss = 0.49408149
Validation score: 0.609137
Iteration 11, loss = 0.47876094
Validation score: 0.604213
Iteration 12, loss = 0.46408648
Validation score: 0.609313
Iteration 13, loss = 0.45227438
Validation score: 0.603500
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


In [None]:
from sklearn.metrics import accuracy_score
# Score the fitted MLP on the held-out split; the bare last line displays the value.
y_pred = mlp.predict(X_test_preprocessed)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.63488

In [None]:
#pip install torch

In [None]:
# Disabled cell (string literal, never executed): an earlier attempt at building the
# PyTorch tensors and loaders.
# NOTE(review): the trailing `.shape[0]` makes X_train_tensor / X_test_tensor row counts
# rather than tensors, and the snippet imports only `torch` while also using
# TensorDataset and DataLoader.
'''import torch
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_preprocessed, dtype=torch.float32).shape[0]
X_test_tensor = torch.tensor(X_test_preprocessed, dtype=torch.float32).shape[0]
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)'''

'import torch\nX_train_preprocessed = preprocessor.fit_transform(X_train)\nX_test_preprocessed = preprocessor.transform(X_test)\n# Convert to PyTorch tensors\nX_train_tensor = torch.tensor(X_train_preprocessed, dtype=torch.float32).shape[0]\nX_test_tensor = torch.tensor(X_test_preprocessed, dtype=torch.float32).shape[0]\ny_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)\ny_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)\n\n# Create DataLoader\ntrain_dataset = TensorDataset(X_train_tensor, y_train_tensor)\ntest_dataset = TensorDataset(X_test_tensor, y_test_tensor)\n\ntrain_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)\ntest_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)'

In [None]:
# Disabled cell (string literal, never executed): TensorFlow variant that densifies the
# preprocessed matrices with .toarray() and converts features and labels to float32 tensors.
'''import tensorflow as tf
X_train_preprocessed = preprocessor.fit_transform(X_train).toarray()
X_test_preprocessed = preprocessor.transform(X_test).toarray()

# Convert to tensors
X_train_tensor = tf.convert_to_tensor(X_train_preprocessed, dtype=tf.float32)
X_test_tensor = tf.convert_to_tensor(X_test_preprocessed, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train.values, dtype=tf.float32)
y_test_tensor = tf.convert_to_tensor(y_test.values, dtype=tf.float32)'''


'import tensorflow as tf\nX_train_preprocessed = preprocessor.fit_transform(X_train).toarray()\nX_test_preprocessed = preprocessor.transform(X_test).toarray()\n\n# Convert to tensors\nX_train_tensor = tf.convert_to_tensor(X_train_preprocessed, dtype=tf.float32)\nX_test_tensor = tf.convert_to_tensor(X_test_preprocessed, dtype=tf.float32)\ny_train_tensor = tf.convert_to_tensor(y_train.values, dtype=tf.float32)\ny_test_tensor = tf.convert_to_tensor(y_test.values, dtype=tf.float32)'

In [None]:
# Disabled cell (string literal, never executed): Keras Sequential model with three
# ReLU hidden layers (128/64/32), dropout 0.3 after each, and a sigmoid output.
# NOTE(review): the input layer is sized from, and the model is fitted on, X_train
# rather than the preprocessed matrices or tensors.
'''import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)'''

"import tensorflow as tf\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Dense, Dropout\n\nmodel = Sequential([\n    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),\n    Dropout(0.3),\n    Dense(64, activation='relu'),\n    Dropout(0.3),\n    Dense(32, activation='relu'),\n    Dropout(0.3),\n    Dense(1, activation='sigmoid')\n])\n\n# Compile the model\nmodel.compile(optimizer='adam',\n              loss='binary_crossentropy',\n              metrics=['accuracy'])\n\n# Train the model\nhistory = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)"

In [None]:
# Disabled cell (string literal, never executed): replaces NaN/inf in X_train and
# y_train with 0, casts X_train to float32, and calls model.fit again.
'''X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)  # Replace NaN and inf with 0
y_train = np.nan_to_num(y_train, nan=0.0, posinf=0.0, neginf=0.0)  # Replace NaN and inf with 0

# Ensure X_train is of numeric type
X_train = X_train.astype(np.float32)

history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)'''

'X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)  # Replace NaN and inf with 0\ny_train = np.nan_to_num(y_train, nan=0.0, posinf=0.0, neginf=0.0)  # Replace NaN and inf with 0\n\n# Ensure X_train is of numeric type\nX_train = X_train.astype(np.float32)\n\nhistory = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)'

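PyTorch neural network (the `SimpleCNN` class below is a fully connected network; it contains no convolutional layers)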

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

def _to_dense(matrix):
    """Return `matrix` as a dense array, whether it arrives sparse or already dense.

    ColumnTransformer hands back a scipy sparse matrix or a plain ndarray depending on
    the density of the stacked output, so calling .toarray() unconditionally can fail.
    """
    if hasattr(matrix, 'toarray'):
        return matrix.toarray()
    return np.asarray(matrix)


X_train_dense = _to_dense(X_train_preprocessed)
X_test_dense = _to_dense(X_test_preprocessed)

# Features and labels as float32 tensors (nn.BCELoss expects float targets).
X_train_tensor = torch.tensor(X_train_dense, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_dense, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Pair features with labels and batch them; only the training loader shuffles, so test
# predictions stay in the same order as y_test_tensor.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
class SimpleCNN(nn.Module):
    """Fully connected binary classifier (despite the name, it has no convolutional layers).

    Three hidden Linear layers (128 -> 64 -> 32), each followed by ReLU and dropout,
    then a single sigmoid unit. forward() therefore returns probabilities, which is the
    input nn.BCELoss expects.

    Args:
        input_dim: Number of input features. When omitted, the width of the
            module-level X_train_tensor is used, which matches the original
            zero-argument constructor.
        dropout_p: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_dim=None, dropout_p=0.3):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: size the first layer from the notebook's
            # training tensor.
            input_dim = X_train_tensor.shape[1]
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Return one probability per row, with a trailing axis of size 1."""
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(torch.relu(hidden(x)))
        return torch.sigmoid(self.fc4(x))

# Seed PyTorch's global generator before building the model so weight initialisation,
# dropout masks and the training loader's shuffling repeat on a fresh
# "Restart & Run All" (42 matches the random_state used for the scikit-learn steps).
torch.manual_seed(42)

# Initialize the model, loss function, and optimizer.
model = SimpleCNN()
criterion = nn.BCELoss()  # binary cross-entropy on the model's sigmoid outputs
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train for a fixed number of epochs and report the mean per-sample loss of each epoch.
num_epochs = 20
for epoch in range(num_epochs):
    model.train()  # enable dropout
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        # squeeze(1) removes only the trailing unit axis. A bare squeeze() would also
        # collapse the batch axis when the final batch holds a single sample, and
        # BCELoss would then fail on the shape mismatch with `labels`.
        outputs = model(inputs).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by batch size so the epoch
        # figure is a true per-sample average even when the last batch is smaller.
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

Epoch 1/20, Loss: 0.6409
Epoch 2/20, Loss: 0.6326
Epoch 3/20, Loss: 0.6279
Epoch 4/20, Loss: 0.6241
Epoch 5/20, Loss: 0.6206
Epoch 6/20, Loss: 0.6174
Epoch 7/20, Loss: 0.6142
Epoch 8/20, Loss: 0.6114
Epoch 9/20, Loss: 0.6081
Epoch 10/20, Loss: 0.6048
Epoch 11/20, Loss: 0.6020
Epoch 12/20, Loss: 0.5988
Epoch 13/20, Loss: 0.5964
Epoch 14/20, Loss: 0.5937
Epoch 15/20, Loss: 0.5909
Epoch 16/20, Loss: 0.5879
Epoch 17/20, Loss: 0.5855
Epoch 18/20, Loss: 0.5834
Epoch 19/20, Loss: 0.5809
Epoch 20/20, Loss: 0.5791


In [None]:
# Evaluate the PyTorch model on the held-out split.
model.eval()  # disable dropout
y_pred = []
with torch.no_grad():
    for inputs, _ in test_loader:
        # squeeze(1) keeps the batch axis, so tolist() always yields a list. With a
        # bare squeeze() a final batch of one sample becomes a 0-d tensor, tolist()
        # returns a float, and extend() raises TypeError.
        outputs = model(inputs).squeeze(1)
        y_pred.extend(outputs.tolist())

# Convert probabilities to 0/1 predictions and compare with the labels; test_loader
# does not shuffle, so predictions line up with y_test_tensor row by row.
y_pred = np.round(y_pred)
accuracy = (y_pred == y_test_tensor.numpy()).mean()
accuracy

0.62144