<h1>Android Malware Detection</h1>
<p><b>Task : </b> Detect the presence of malware using attributes extracted from Android applications as features.</p>

<img src="https://securityintelligence.com/wp-content/uploads/2014/04/201310DIY-Android-Malware-Analysis-Taking-apart-OBAD-630x330.jpg" style="width : 100%;height : 90%;text-align : center;">

In [None]:
import pandas as pd
import numpy as np
np.random.seed(0)
from sklearn.metrics import precision_score,recall_score,f1_score
import tensorflow as tf
tf.compat.v1.set_random_seed(0)
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

In [None]:
# Read the Drebin-215 CSV into a DataFrame.
dataset_path = "../input/android-malware-dataset-for-machine-learning/drebin-215-dataset-5560malware-9476-benign.csv"
data = pd.read_csv(dataset_path)

# Number of NaN cells over the whole frame, as parsed by pandas.
missing_total = data.isna().sum().sum()
print("Total missing values : ", missing_total)
data

<p>The output class contains the categorical values 'B' and 'S', which we have to encode as integers. The dataset also contains some stray characters like '?' and 'S'; we can set them to NaN and remove the affected rows using dropna().</p>

In [None]:
# Class distribution of the raw file. `classes` and `count` are reused by the
# class-balance bar plot further down, so they are computed before any rows
# are dropped.
classes, count = np.unique(data['class'], return_counts=True)

# Label-encode the target column only. Replacing 'B'/'S' across the whole
# frame would silently turn a stray 'S' or 'B' inside a feature column into a
# valid 0/1 value instead of letting it be cleaned up below.
lbl_enc = LabelEncoder()
data['class'] = lbl_enc.fit_transform(data['class'])
print(lbl_enc.transform(classes), classes)

# Any remaining string cell containing one of the characters in the regex
# class ('?', ',' or 'S') is set to NaN so that dropna() removes its row.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
data = data.replace('[?,S]', np.nan, regex=True)
print("Total missing values : ", data.isna().sum().sum())
data = data.dropna()

# Columns that held stray strings were parsed as object dtype; convert every
# column to a numeric dtype (raises if a non-numeric value is still present).
data = data.apply(pd.to_numeric)
data

<p>Since every feature value is either 0 or 1, label-encoding only the last column is sufficient.</p>

In [None]:
# All columns except one (the label) count as features.
n_features = data.shape[1] - 1
print("Total Features : ", n_features)

In [None]:

# Bar chart of samples per class, using the `classes`/`count` arrays
# computed in the label-encoding cell.
fig, ax = plt.subplots()
ax.bar(classes, count)
ax.set_title("Class balance")
ax.set_xlabel("Classes")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Every column except the last is a feature; the last column is the label.
features = data.iloc[:, :-1].to_numpy()
labels = data.iloc[:, -1].to_numpy()

# Shuffled split holding out 20% of the rows; no random_state is passed here.
train_x, test_x, train_y, test_y = train_test_split(
    features,
    labels,
    test_size=0.2,
    shuffle=True,
)

In [None]:
# Row counts of each split; features and labels of the same split must match.
print("Train features size : ", len(train_x))
print("Train labels size : ", len(train_y))
print("Test features size : ", len(test_x))
# This line reports test_y, so it is labelled as labels rather than features.
print("Test labels size : ", len(test_y))

In [None]:
# Report the array shape of every split before the labels are reshaped.
for caption, split in (
    ("Train features : ", train_x),
    ("Train labels : ", train_y),
    ("Test Features : ", test_x),
    ("Test labels : ", test_y),
):
    print(caption, split.shape)

In [None]:
# Turn both label arrays into single-column 2-D arrays of shape (n, 1).
train_y, test_y = (targets.reshape((-1, 1)) for targets in (train_y, test_y))

In [None]:
# Report the shapes again so the effect of the label reshape is visible.
for caption, split in (
    ("Train features : ", train_x),
    ("Train labels : ", train_y),
    ("Test Features : ", test_x),
    ("Test labels : ", test_y),
):
    print(caption, split.shape)

In [None]:
# Fully connected binary classifier: 215 -> 100 -> 1 (sigmoid).
model = keras.models.Sequential()
# input_shape excludes the batch axis. Each sample is a flat vector of 215
# features, so the shape is (215,). The previous (None, 215) declared a
# rank-3 input (batch, None, 215) that does not match the 2-D training array.
model.add(keras.layers.Dense(215, activation='relu', input_shape=(215,)))
model.add(keras.layers.Dense(100, activation='relu'))
# A single sigmoid unit outputs a score in [0, 1] for the positive class.
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# RMSprop constructed with 0.001 as its first positional argument, paired
# with binary cross-entropy loss and accuracy as the reported metric.
rmsprop = keras.optimizers.RMSprop(0.001)
model.compile(
    optimizer=rmsprop,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Number of training epochs; also used to build the x-axis of the training curves.
ep = 5

In [None]:
# Train for `ep` epochs. The held-out test split is passed as validation
# data, so the val_* entries in `history` are measured on test_x/test_y.
history = model.fit(
    x=train_x,
    y=train_y,
    epochs=ep,
    validation_data=(test_x, test_y),
)

In [None]:
# Training curves: accuracy on the left, loss on the right, each showing the
# train and validation series recorded by model.fit.
epoch_labels = [str(i) for i in range(1, ep + 1)]  # categorical ticks "1".."ep"

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(15, 5))

ax_acc.plot(epoch_labels, history.history['accuracy'], label="Train Accuracy")
ax_acc.plot(epoch_labels, history.history['val_accuracy'], label="Validation Accuracy")
ax_acc.legend()
ax_acc.set_xlabel("Epoch")
ax_acc.set_ylabel("Accuracy")
# Title now matches the plotted quantity (this panel shows accuracy, not loss).
ax_acc.set_title("Epoch vs Accuracy")

ax_loss.plot(epoch_labels, history.history['loss'], label="Train Loss")
ax_loss.plot(epoch_labels, history.history['val_loss'], label="Validation Loss")
ax_loss.legend()
ax_loss.set_xlabel("Epoch")
ax_loss.set_ylabel("Loss")
# This panel shows both train and validation loss, so the title is generic.
ax_loss.set_title("Epoch vs Loss")

plt.show()

In [None]:
# Score the test split, then binarise: a row becomes 1 when its score is
# greater than (1 - score), otherwise 0. The result keeps the dtype and
# shape of the raw prediction array.
scores = model.predict(test_x)
y_pred = (scores > (1 - scores)).astype(scores.dtype)

# Report each metric as a percentage.
for caption, metric in (
    ("Precision : ", precision_score),
    ("Recall : ", recall_score),
    ("F1 Score : ", f1_score),
):
    print(caption, metric(test_y, y_pred) * 100)

In [None]:
# Display names for the encoded labels 0 and 1, in that order.
classes = ["B", "S"]

# confusion_matrix expects (y_true, y_pred). With the arguments in this order
# the rows are the true labels and the columns the predictions, which is what
# ConfusionMatrixDisplay's "True label" / "Predicted label" axes assume.
cm = confusion_matrix(test_y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=classes)
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title("Confusion Matrix")
disp = disp.plot(ax=ax)
plt.show()