# Machine Learning (Final Project)

# Import some required libraries

In [1]:
!pip install -U imbalanced-learn




In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from google.colab import drive
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import Lasso
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Load and preprocess the data

## Load data

In [3]:
# Mount Google Drive, then read the malware CSV stored on it into `df`.
MOUNT_POINT = '/content/drive'
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Data/Malware_subset.csv'

drive.mount(MOUNT_POINT)
df = pd.read_csv(DATA_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Box plot of the loaded frame, drawn on an explicit Axes object.
fig, ax = plt.subplots()
sns.boxplot(data=df, ax=ax)
ax.tick_params(axis="x", labelrotation=45)  # rotate x-axis labels for legibility
ax.set_title("Boxplot for All Features")
plt.show()

## Data preprocessing

In [None]:
# Preprocessing pipeline:
#   clean -> encode labels -> split -> (training split only) outlier removal,
#   low-variance filter, standardization, Lasso feature selection, SMOTE.
# Every transformer is fitted on the training split only and then applied to
# the test split, so no information about the test rows leaks into training.
# Produces: X_train, X_test (NumPy arrays), y_train, y_test, plus the fitted
# encoder / iso_forest / selector / scaler / lasso / smote objects.

SEED = 42  # single seed shared by every stochastic step in this cell

# --- Cleaning ---------------------------------------------------------------
# Replace np.inf and -np.inf with NaN (rebinding instead of inplace=True)
df = df.replace([np.inf, -np.inf], np.nan)
print(f"NAN number: {df.isnull().sum()}")

# Count duplicates BEFORE dropping them; counting afterwards always gives 0
n_duplicates = df.duplicated().sum()
df = df.drop_duplicates()
print(f"Duplicated data: {n_duplicates}")

# Actually drop rows with missing values: downstream estimators such as
# Lasso do not accept NaN input
n_missing_rows = df.isna().any(axis=1).sum()
df = df.dropna()
print(f"Missing data's number: {n_missing_rows}")

# Drop features whose every value is 0 or -1.
# .copy() gives an independent frame so the label assignment below does not
# write into a slice of the previous one.
uninformative = ((df == 0) | (df == -1)).all(axis=0)
df = df.loc[:, ~uninformative].copy()

# --- Label encoding ---------------------------------------------------------
# Fit once; `encoder.classes_` keeps the original class names for reporting
encoder = LabelEncoder()
df['Label'] = encoder.fit_transform(df['Label'])

# --- Split the features and labels ------------------------------------------
X = df.drop(columns='Label')
y = df['Label']

# Train/test split FIRST, so that everything fitted below only sees training
# rows. Stratify to keep the class ratios of this imbalanced data set in both
# splits; stratification needs at least two samples of every class.
stratify_on = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=stratify_on
)

# --- Remove anomalies (training rows only) ----------------------------------
# Fitted on the features only (the label is not a feature) and never used to
# filter the test split, which must stay representative of unseen data.
iso_forest = IsolationForest(contamination=0.05, random_state=SEED)
outliers = iso_forest.fit_predict(X_train)
inliers = outliers == 1  # fit_predict returns 1 for inliers, -1 for outliers
X_train, y_train = X_train.loc[inliers], y_train.loc[inliers]

# --- Remove low-variance features -------------------------------------------
# Must run BEFORE standardization: afterwards every non-constant column has
# unit variance and a 0.01 threshold could never remove anything.
selector = VarianceThreshold(threshold=0.01)
selector.fit(X_train)
kept_columns = X_train.columns[selector.get_support()]
X_train, X_test = X_train[kept_columns], X_test[kept_columns]

# --- Standardization --------------------------------------------------------
# Keep the original index so the frames stay aligned with y_train / y_test
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=kept_columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=kept_columns, index=X_test.index
)

# --- Choose important features ----------------------------------------------
# Keep the features to which Lasso assigns a non-zero coefficient
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
selected_features = kept_columns[lasso.coef_ != 0]
if len(selected_features) == 0:
    raise ValueError(
        "Lasso(alpha=0.01) assigned a zero coefficient to every feature; "
        "lower alpha or skip this selection step."
    )
X_train = X_train[selected_features].to_numpy()
X_test = X_test[selected_features].to_numpy()

# --- Oversampling -----------------------------------------------------------
# The data is imbalanced, so oversample the training split with SMOTE.
# Seeded for reproducibility; the test split is never resampled.
smote = SMOTE(random_state=SEED)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Choose the model

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier

# Note: a standalone RandomForestClassifier(n_estimators=20, random_state=42)
# variant was previously kept here as commented-out code.

# Bagging ensemble that uses a random forest as its base estimator.
bagging_params = dict(
    estimator=RandomForestClassifier(),
    sampling_strategy='not majority',
    replacement=False,
    random_state=42,
)
classifier = BalancedBaggingClassifier(**bagging_params)

# Fit on the training split, then predict the held-out split.
y_pred = classifier.fit(X_train, y_train).predict(X_test)


In [None]:

# Evaluate the predictions on the held-out split.

# Pin the label order to the encoder's classes: without `labels`, a class that
# is missing from both y_test and y_pred would silently shrink the matrix.
class_ids = np.arange(len(encoder.classes_))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Show the encoder's class values on the axes instead of 0..n-1 positions
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=encoder.classes_,
    yticklabels=encoder.classes_,
    ax=ax,
)
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
plt.show()

# Weighted F1 (kept as the headline number) is dominated by the most frequent
# classes; macro F1 weighs every class equally and is reported alongside it.
f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)
macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f"Macro F1: {macro_f1}")