In [None]:
# Attach the user's Google Drive to the Colab runtime; later cells read and
# write files below this mount point.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import shutil

In [None]:
# Read the raw dataset from the mounted Drive into the `data` DataFrame.
raw_csv_path = '/content/drive/MyDrive/Threat_Prediction_Data.csv'
data = pd.read_csv(raw_csv_path)


**Data Cleaning**

Objective: Remove or handle missing values, outliers, and duplicates.

Cleaning steps:

* Remove Missing Values: Drop rows or columns with missing values.
* Remove Outliers: Identify and remove outliers that deviate significantly from the rest of the data.
* Remove Duplicates: Identify and remove duplicate rows to avoid redundant information.

In [None]:
# Quick overview of the raw data: schema and non-null counts, summary
# statistics of the numerical columns, and the first rows.
#
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly -- wrapping it in print() appends a stray "None" line.
data.info()
print(data.describe())
print(data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   MachineId                         50000 non-null  object 
 1   ProductName                       50000 non-null  object 
 2   HasTpm                            50000 non-null  int64  
 3   Platform                          50000 non-null  object 
 4   Processor                         50000 non-null  object 
 5   SkuEdition                        50000 non-null  object 
 6   IsProtected                       46951 non-null  float64
 7   Firewall                          49447 non-null  float64
 8   AdminApprovalMode                 49942 non-null  float64
 9   DeviceType                        50000 non-null  object 
 10  PrimaryDiskTotalCapacity          49686 non-null  float64
 11  PrimaryDiskTypeName               49917 non-null  object 
 12  Syst

In [None]:
# Keep only the first occurrence of every fully identical row.
is_repeated_row = data.duplicated()
data_cleaned = data[~is_repeated_row]

In [None]:
# All numeric columns; later cells (scaling) rely on this name.
numerical_cols = data_cleaned.select_dtypes(include=['float64', 'int64']).columns

# Drop rows with a missing numerical value explicitly. Previously these rows
# disappeared only as a side effect of the outlier mask (any comparison with
# NaN is False), which hid the step from the reader.
data_cleaned = data_cleaned.dropna(subset=list(numerical_cols))

# Number of standard deviations beyond which a value counts as an outlier.
OUTLIER_Z_THRESHOLD = 3

# Apply the rule to continuous columns only. On a 0/1 column the rare class
# sits more than 3 standard deviations from the mean, so filtering on it
# deletes every minority-class row and leaves the column constant (this is
# what turned HasTpm, Firewall, ... into all-zero columns after scaling).
# Heuristic: a column with more than two distinct values is continuous.
continuous_cols = [col for col in numerical_cols if data_cleaned[col].nunique() > 2]

# Mean and standard deviation are computed after the missing-value drop above.
deviation = (data_cleaned[continuous_cols] - data_cleaned[continuous_cols].mean()).abs()
limit = OUTLIER_Z_THRESHOLD * data_cleaned[continuous_cols].std()
within_limits = (deviation <= limit).all(axis=1)

# .copy() gives later cells an independent frame to assign into, avoiding
# chained-assignment warnings.
data_cleaned = data_cleaned[within_limits].copy()

# Print data info after cleaning (info() prints by itself and returns None).
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31256 entries, 0 to 44240
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   MachineId                         31256 non-null  object 
 1   ProductName                       31256 non-null  object 
 2   HasTpm                            31256 non-null  int64  
 3   Platform                          31256 non-null  object 
 4   Processor                         31256 non-null  object 
 5   SkuEdition                        31256 non-null  object 
 6   IsProtected                       31256 non-null  float64
 7   Firewall                          31256 non-null  float64
 8   AdminApprovalMode                 31256 non-null  float64
 9   DeviceType                        31256 non-null  object 
 10  PrimaryDiskTotalCapacity          31256 non-null  float64
 11  PrimaryDiskTypeName               31226 non-null  object 
 12  SystemVol


**Normalization**

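Objective: Put numerical features on a comparable scale, either by rescaling them to a fixed range such as [0, 1] (Min-Max scaling) or by standardizing them to zero mean and unit variance (Z-score scaling).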

In [None]:
# Scale the numerical features with exactly ONE scaler.
#
# The cell used to fit MinMaxScaler and then immediately fit StandardScaler on
# the already-scaled values. Standardization is unaffected by a prior linear
# rescaling, so the Min-Max pass was dead work and the result was never in
# [0, 1]. Pick the method here instead:
#   'standard' -> zero mean / unit variance (what the cell effectively produced)
#   'minmax'   -> values rescaled to [0, 1]
SCALING_METHOD = 'standard'

# Presumably the prediction label -- TODO confirm. A label must stay 0/1, so
# it is left out of scaling.
TARGET_COL = 'IsInfected'

cols_to_scale = [col for col in numerical_cols if col != TARGET_COL]

if SCALING_METHOD == 'minmax':
    scaler = MinMaxScaler()
elif SCALING_METHOD == 'standard':
    scaler = StandardScaler()
else:
    raise ValueError(f"Unknown SCALING_METHOD: {SCALING_METHOD!r}")

# Work on an independent copy so the assignment below never writes into a
# slice of another frame (avoids SettingWithCopyWarning).
data_cleaned = data_cleaned.copy()
if cols_to_scale:
    data_cleaned[cols_to_scale] = scaler.fit_transform(data_cleaned[cols_to_scale])

print(data_cleaned.head())

                          MachineId   ProductName  HasTpm   Platform  \
0  0000028988387b115f69f31a3bf04f09  win8defender     0.0  windows10   
1  000007535c3f730efa9ea0b7ef1bd645  win8defender     0.0  windows10   
2  000007905a28d863f6d0d597892cd692  win8defender     0.0  windows10   
3  00000b11598a75ea8ba1beea8459149f  win8defender     0.0  windows10   
4  000014a5f00daa18e76b81417eeb99fc  win8defender     0.0  windows10   

  Processor SkuEdition  IsProtected  Firewall  AdminApprovalMode DeviceType  \
0       x64        Pro          0.0       0.0                0.0    Desktop   
1       x64        Pro          0.0       0.0                0.0   Notebook   
2       x64       Home          0.0       0.0                0.0    Desktop   
3       x64        Pro          0.0       0.0                0.0    Desktop   
4       x64       Home          0.0       0.0                0.0   Notebook   

   ...  SystemVolumeTotalCapacity HasOpticalDiskDrive  TotalPhysicalRAM  \
0  ...           


**Encoding Categorical Data**

Objective: Convert categorical data into numerical format.

* One-Hot Encoding: Converts categorical variables into a series of binary columns.
* Label Encoding: Assigns each category a unique integer.




In [None]:
# Encoding categorical data using One-Hot Encoding.
#
# MachineId appears to be a per-row identifier (every value shown is a distinct
# hash). One-hot encoding it creates roughly one dummy column per row, which
# explodes memory and carries no predictive signal, so it is dropped from the
# encoded frame instead of being expanded.
ID_COL = 'MachineId'

# All object-dtype columns; kept under this name because the label-encoding
# cell iterates over it.
categorical_cols = data_cleaned.select_dtypes(include=['object']).columns
onehot_cols = [col for col in categorical_cols if col != ID_COL]

data_encoded = pd.get_dummies(
    data_cleaned.drop(columns=[ID_COL], errors='ignore'),
    columns=onehot_cols,
)

print(data_encoded.head())

   HasTpm  IsProtected  Firewall  AdminApprovalMode  PrimaryDiskTotalCapacity  \
0     0.0          0.0       0.0                0.0                 -0.082836   
1     0.0          0.0       0.0                0.0                 -0.082836   
2     0.0          0.0       0.0                0.0                 -1.240722   
3     0.0          0.0       0.0                0.0                 -0.844603   
4     0.0          0.0       0.0                0.0                 -0.082836   

   SystemVolumeTotalCapacity  HasOpticalDiskDrive  TotalPhysicalRAM  \
0                  -0.239952                  0.0         -0.465228   
1                  -0.909595                  0.0         -0.465228   
2                  -0.870443                  0.0         -0.465228   
3                  -0.485751                  0.0         -0.465228   
4                  -0.911243                  0.0          0.123819   

   IsSecureBootEnabled  IsPenCapable  ...  \
0             0.982493           0.0  ...

In [None]:
# Label Encoding: one fitted LabelEncoder per categorical column, kept in
# `label_encoders` so the integer codes can be mapped back to the original
# categories later. Note that data_cleaned is overwritten column by column.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    data_cleaned[col] = le.fit_transform(data_cleaned[col])

# Show the first rows of the label-encoded frame.
print(data_cleaned.head())

   MachineId  ProductName  HasTpm  Platform  Processor  SkuEdition  \
0          0            1     0.0         0          1           6   
1          1            1     0.0         0          1           6   
2          2            1     0.0         0          1           4   
3          3            1     0.0         0          1           6   
4          4            1     0.0         0          1           4   

   IsProtected  Firewall  AdminApprovalMode  DeviceType  ...  \
0          0.0       0.0                0.0           2  ...   
1          0.0       0.0                0.0           7  ...   
2          0.0       0.0                0.0           2  ...   
3          0.0       0.0                0.0           2  ...   
4          0.0       0.0                0.0           7  ...   

   SystemVolumeTotalCapacity  HasOpticalDiskDrive  TotalPhysicalRAM  \
0                  -0.239952                  0.0         -0.465228   
1                  -0.909595                  0.0   

**Feature Engineering**

Objective: Create new features that may enhance the predictive power of the model.

In [None]:
# Creating a new feature by combining existing features.
#
# The ratio is computed from the RAW capacities in `data`, not from the columns
# of data_encoded: those were scaled earlier, and a quotient of scaled values
# has no physical meaning (the sign flips around the mean and values near zero
# blow up). data_encoded keeps the original row labels of `data`, so the raw
# rows are looked up by index.
capacity_cols = ['PrimaryDiskTotalCapacity', 'SystemVolumeTotalCapacity']
raw_capacity = data.loc[data_encoded.index, capacity_cols]

# A zero-sized system volume would give +/-inf; use NaN for those rows instead.
system_volume = raw_capacity['SystemVolumeTotalCapacity'].replace(0, np.nan)
data_encoded['DiskCapacityRatio'] = raw_capacity['PrimaryDiskTotalCapacity'] / system_volume

# Extracting time-based features from timestamps (if you have any timestamp columns)
# data_encoded['Year'] = pd.to_datetime(data_encoded['TimestampColumn']).dt.year
# data_encoded['Month'] = pd.to_datetime(data_encoded['TimestampColumn']).dt.month
# data_encoded['Day'] = pd.to_datetime(data_encoded['TimestampColumn']).dt.day

# Display the first few rows of the final DataFrame
print(data_encoded.head())

   HasTpm  IsProtected  Firewall  AdminApprovalMode  PrimaryDiskTotalCapacity  \
0     0.0          0.0       0.0                0.0                 -0.082836   
1     0.0          0.0       0.0                0.0                 -0.082836   
2     0.0          0.0       0.0                0.0                 -1.240722   
3     0.0          0.0       0.0                0.0                 -0.844603   
4     0.0          0.0       0.0                0.0                 -0.082836   

   SystemVolumeTotalCapacity  HasOpticalDiskDrive  TotalPhysicalRAM  \
0                  -0.239952                  0.0         -0.465228   
1                  -0.909595                  0.0         -0.465228   
2                  -0.870443                  0.0         -0.465228   
3                  -0.485751                  0.0         -0.465228   
4                  -0.911243                  0.0          0.123819   

   IsSecureBootEnabled  IsPenCapable  ...  AutoUpdate_DownloadFullAuto  \
0           

In [None]:
# Step 3: Save the Preprocessed Data
#
# Write data_encoded -- the cleaned, scaled, one-hot-encoded frame with the
# engineered feature -- rather than `data`, which is still the untouched raw
# CSV content. The file goes straight to the mounted Drive, using the same
# 'MyDrive' root as the load cell.
preprocessed_file_path = '/content/drive/MyDrive/Threat_Prediction_Data_preprocessed.csv'
data_encoded.to_csv(preprocessed_file_path, index=False)

# The file is already at its final location: the former shutil.move() call had
# identical source and destination and did nothing. The name is kept in case
# later cells refer to it.
destination_path = preprocessed_file_path

# Display the first few rows of the frame that was just written, to verify it.
data_encoded.head()


Unnamed: 0,MachineId,ProductName,HasTpm,Platform,Processor,SkuEdition,IsProtected,Firewall,AdminApprovalMode,DeviceType,...,SystemVolumeTotalCapacity,HasOpticalDiskDrive,TotalPhysicalRAM,AutoUpdate,GenuineStateOS,IsSecureBootEnabled,IsPenCapable,IsAlwaysOnAlwaysConnectedCapable,IsGamer,IsInfected
0,0000028988387b115f69f31a3bf04f09,win8defender,1,windows10,x64,Pro,0.0,1.0,1.0,Desktop,...,299451.0,0,4096.0,UNKNOWN,Invalid,1.0,0,1.0,0.0,0
1,000007535c3f730efa9ea0b7ef1bd645,win8defender,1,windows10,x64,Pro,0.0,1.0,1.0,Notebook,...,102385.0,0,4096.0,UNKNOWN,OFFLINE,1.0,0,1.0,0.0,0
2,000007905a28d863f6d0d597892cd692,win8defender,1,windows10,x64,Home,0.0,1.0,1.0,Desktop,...,113907.0,0,4096.0,NOTIFY,Invalid,1.0,0,1.0,0.0,0
3,00000b11598a75ea8ba1beea8459149f,win8defender,1,windows10,x64,Pro,0.0,1.0,1.0,Desktop,...,227116.0,0,4096.0,NOTIFY,Invalid,1.0,0,1.0,0.0,1
4,000014a5f00daa18e76b81417eeb99fc,win8defender,1,windows10,x64,Home,0.0,1.0,1.0,Notebook,...,101900.0,0,6144.0,NOTIFY,Invalid,1.0,0,1.0,0.0,1


Data preprocessing steps are crucial for ensuring that your data is clean, well-structured, and suitable for analysis and modeling. Proper data cleaning, normalization, encoding, and feature engineering can significantly enhance the performance of machine learning models and lead to more accurate and insightful results.