# Fitting Android Permissions Dataset to Naive Bayes Model

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/android-permission-dataset/__notebook_source__.ipynb
/kaggle/input/android-permission-dataset/Android_Permission.csv


In [2]:
import numpy as np 
import pandas as pd  

df = pd.read_csv('../input/android-permission-dataset/Android_Permission.csv')
df.head()


Unnamed: 0,App,Package,Category,Description,Rating,Number of ratings,Price,Related apps,Dangerous permissions count,Safe permissions count,...,Your personal information : read calendar events (D),Your personal information : read contact data (D),Your personal information : read sensitive log data (D),Your personal information : read user defined dictionary (D),Your personal information : retrieve system internal state (S),Your personal information : set alarm in alarm clock (S),Your personal information : write Browser's history and bookmarks (D),Your personal information : write contact data (D),Your personal information : write to user defined dictionary (S),Class
0,Canada Post Corporation,com.canadapost.android,Business,Canada Post Mobile App gives you access to som...,3.1,77,0.0,"{com.adaffix.pub.ca.android, com.kevinquan.gas...",7.0,1,...,0,1,0,0,0,0,0,1,0,0
1,Word Farm,com.realcasualgames.words,Brain & Puzzle,Speed and strategy combine in this exciting wo...,4.3,199,0.0,"{air.com.zubawing.FastWordLite, com.joybits.do...",3.0,2,...,0,0,0,0,0,0,0,0,0,0
2,Fortunes of War FREE,fortunesofwar.free,Cards & Casino,"Fortunes of War is a fast-paced, easy to learn...",4.1,243,0.0,"{com.kevinquan.condado, hu.monsta.pazaak, net....",1.0,1,...,0,0,0,0,0,0,0,0,0,0
3,Better Keyboard: Avatar Purple,com.cc.betterkeyboard.skins.avatarpurple,Libraries & Demo,Skin for Better Keyboard featuring a glossy fe...,3.6,2589,0.0,{eu.gdumoulin.betterandroid.skins.transparent....,0.0,0,...,0,0,0,0,0,0,0,0,0,0
4,Boxing Day,indiaNIC.android.BoxingDay,Lifestyle,Boxing Day by Christopher Jaymes<p>Based on a ...,0.0,0,5.99,,1.0,0,...,0,0,0,0,0,0,0,0,0,1


## Initial clean up and exploration

In [3]:
df.shape

(29999, 184)

In [4]:
df.drop_duplicates(inplace=True)
df.dropna(inplace=True) 
df.shape

(26586, 184)

### Ratio off malware to benign application in original data set

In [5]:
obs=len(df)
mal= len(df.loc[df['Class'] == 1])
not_mal= len(df.loc[df['Class'] == 0])
print('Percentages of malware and benign applications in original dataset:')
print('Num of Malware: {0} ({1:.2f}%)'.format(mal, (mal/obs)*100))
print('Num of benign: {0} ({1:.2f}%)'.format(not_mal, (not_mal/obs)*100))

Percentages of malware and benign applications in original dataset:
Num of Malware: 17592 (66.17%)
Num of benign: 8994 (33.83%)


## Below finds all columns with binary values that are application permissions, parsed using using the ':' character.
### only the list of permissions, both danerous and safe, are going to be used in evaluation the naive bayes model


In [6]:
permission_columns=df.columns[df.columns.str.contains(':')].to_list()


# Gathering data and preparing the Naive Bayes Model

In [7]:
from sklearn.model_selection import train_test_split
# Selecting feature and target columns
feature_cols =  permission_columns 
target_cols= ['Class']

# setting up 2 dataframes; one for the features, and one for the target variable
X= df[feature_cols].values #predictor columns
y= df[target_cols].values



X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=42)

In [8]:
### Training Algorithm
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model.fit(X_train, y_train.ravel())

GaussianNB()

# Evaluating performance of Naive Bayes

In [9]:
# Predicting values based on training data
predict_train = model.predict(X_train)
predict_test = model.predict(X_test)

from sklearn import metrics

print(" Training Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, predict_train)))
print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, predict_test)))

 Training Accuracy: 0.6659
Testing Accuracy: 0.6630


## Confusion Matrix and Classification Report

In [10]:
print('Confusion Matrix')
print(metrics.confusion_matrix(y_test, predict_test), end="\n")
print('Classification Report')
print(metrics.classification_report(y_test, predict_test))

Confusion Matrix
[[ 168 2505]
 [ 183 5120]]
Classification Report
              precision    recall  f1-score   support

           0       0.48      0.06      0.11      2673
           1       0.67      0.97      0.79      5303

    accuracy                           0.66      7976
   macro avg       0.58      0.51      0.45      7976
weighted avg       0.61      0.66      0.56      7976

