In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
import lazypredict
from sklearn.ensemble import *
from sklearn.metrics import *
import lime.lime_tabular
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Load the pipe-delimited dataset file and preview its first rows.
df=pd.read_csv("dataset.csv",sep='|')
df.head()

In [None]:

# Number of rows in the loaded dataset.
print(df.shape[0])

In [None]:
# Summary statistics of the dataset via DataFrame.describe().
df.describe()

In [None]:
# Count of missing values per column.
df.isnull().sum()

In [None]:
# Number of rows per value of the Target column.
df.Target.value_counts() 

In [None]:
# Print the DataFrame.info() overview of the dataset.
df.info()

In [None]:
# Pie chart of the class distribution.
# value_counts() orders by frequency, so pairing it with hard-coded labels would
# mislabel the slices whenever class 1 outnumbers class 0. Sorting by class value
# fixes the order to (0, 1) so it always lines up with the labels.
# Assumes Target is encoded 0 = safe, 1 = ransomware -- confirm against the dataset.
class_counts = df.Target.value_counts().sort_index()
plt.pie(class_counts.values.tolist(), labels=['Safe','Ransomware'], autopct='%.2f%%')
plt.legend()
plt.title(f"Distribution of Labelled Data, total - {len(df)}")
plt.show()

In [None]:
# Preview the first rows of the dataset again.
df.head()

In [None]:
# Number of distinct values in the md5 column.
df.md5.nunique()

In [None]:
# Number of missing values in the md5 column.
df.md5.isnull().sum()

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:

# Heatmap of pairwise correlations between the columns that remain after
# removing Name, md5 and Target.
sns.heatmap(df.drop(['Name','md5','Target'], axis=1).corr())
plt.show()

In [None]:

# Absolute pairwise correlations of every column except Name, md5 and Target.
feature_frame = df.drop(['Name','md5','Target'], axis=1)
corr_matrix = feature_frame.corr().abs()

# Keep only the strict upper triangle so each column pair is inspected once.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# Flag a column when its correlation with any earlier column exceeds 0.95.
to_drop = [name for name in upper.columns if (upper[name] > 0.95).any()]

print(to_drop)

In [None]:

# Remove the highly correlated columns found above.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: columns that were already removed no longer raise
# a KeyError on a second execution.
df = df.drop(columns=to_drop, errors='ignore')

In [None]:
# Correlation heatmap after the highly correlated columns were dropped.
sns.heatmap(df.drop(['Name','md5','Target'], axis=1).corr())
# Render explicitly, like the earlier heatmap cell, instead of echoing the Axes repr.
plt.show()

In [None]:
#feature selection
def relief(data, target, bins=10, show_woe=False):
    """Compute Weight of Evidence (WoE) and Information Value (IV) per feature.

    Note: despite its name, this function does not implement the Relief
    algorithm; it scores every column of ``data`` other than ``target`` by IV.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the target column.
    target : str
        Name of the target column. Its values are summed per group to count
        events (presumably a 0/1 indicator -- confirm against the dataset).
    bins : int, default 10
        Number of quantile bins requested for numeric columns that have more
        than 10 distinct values; duplicate bin edges are dropped.
    show_woe : bool, default False
        When true, print the per-group WoE table of every feature.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First frame: one row per feature with columns ``Variable`` and ``IV``.
        Second frame: the per-group WoE tables of all features stacked.
        Both are empty frames when ``data`` has no column besides ``target``.
    """
    # Collect per-feature frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all previous rows on
    # every iteration and starts from an empty frame.
    iv_frames, woe_frames = [], []

    cols = data.columns

    for ivars in cols[~cols.isin([target])]:
        # Numeric columns with many distinct values are quantile-binned;
        # everything else is grouped on its raw values.
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars]))>10):
            binned_x = pd.qcut(data[ivars], bins,  duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        # Floor the counts at 0.5 so an empty group never yields log(0).
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
        d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)
        total_iv = d['IV'].sum()
        print("Information value of " + ivars + " is " + str(round(total_iv,6)))
        iv_frames.append(
            pd.DataFrame({"Variable" : [ivars], "IV" : [total_iv]}, columns = ["Variable", "IV"])
        )
        woe_frames.append(d)

        if show_woe:
            print(d)

    # pd.concat raises on an empty list, so fall back to empty frames.
    newDF = pd.concat(iv_frames, axis=0) if iv_frames else pd.DataFrame()
    woeDF = pd.concat(woe_frames, axis=0) if woe_frames else pd.DataFrame()
    return newDF, woeDF

In [None]:
# Check the dtype of the Target column.
df.Target.dtypes

In [None]:
# Score every remaining column (Name and md5 excluded) against Target;
# `iv` holds one IV value per column, `woe` the per-group WoE tables.
iv, woe = relief(df.drop(['Name','md5'],axis=1), 'Target')

In [None]:
# Show the features ordered from highest to lowest IV.
iv.sort_values(by = 'IV', ascending=False)

In [None]:
# IV cut-off used for the count below.
thresh = 1
# res = total number of scored features minus those whose IV exceeds thresh,
# i.e. the number of features whose IV is NOT above the cut-off.
# NOTE(review): res is subsequently used as "how many top-IV features to keep",
# which keeps the high-IV features as well -- confirm this selection rule is intended.
res = len(iv)-len(iv[iv['IV']>thresh]) 
print(res) 

In [None]:
# Names of the first `res` features when ordered by descending IV.
features = iv.sort_values(by = 'IV', ascending=False)['Variable'][:res].values.tolist()

In [None]:
# Show the selected feature names and how many were selected.
print(features,'\n')
print('Total number of features-\n',len(features))

In [None]:
# Model inputs: the selected feature columns; model output: the Target column.
X = df[features]
y = df['Target']

In [None]:
# Seed passed to the train/test split below.
randomseed = 42
# Accumulates the accuracy (in percent) of each model as it is evaluated.
ac=[]

In [None]:
# Hold out 30% of the rows for testing (70% train), seeded for reproducibility.
# NOTE(review): no stratify= is passed, so class proportions may differ between
# the two splits -- consider stratify=y if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=randomseed)

In [None]:
# Baseline decision tree trained on the original (unbalanced) training split.
from sklearn import tree
# Seed the tree so the reported accuracy is reproducible across runs.
model=tree.DecisionTreeClassifier(random_state=randomseed)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# accuracy_score's signature is (y_true, y_pred); compute once and reuse.
dt_accuracy = accuracy_score(y_test, y_pred)
ac.append(dt_accuracy*100)
dt_accuracy

In [None]:
# Confusion matrix of the latest predictions against the held-out labels.
cm = confusion_matrix(y_test,y_pred)
# Display names for the two classes, in the order the matrix rows/columns are shown.
# NOTE(review): assumes the sorted label order is (safe, malware) -- confirm.
classes = ['Safe', 'Malware']

cmd = ConfusionMatrixDisplay(cm, display_labels=classes)
cmd.plot()
plt.show()

In [None]:
# Random forest (trees limited to depth 2) fitted on the original training split.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)
# Score once, store the percentage, and echo the raw fraction as the cell output.
rf_accuracy = accuracy_score(y_pred,y_test)
ac.append(rf_accuracy*100)
rf_accuracy

In [None]:
# Confusion matrix of the latest predictions against the held-out labels.
cm = confusion_matrix(y_test,y_pred)
# Display names for the two classes.
# NOTE(review): assumes the sorted label order is (safe, malware) -- confirm.
classes = ['Safe', 'Malware']

cmd = ConfusionMatrixDisplay(cm, display_labels=classes)
cmd.plot()
plt.show()

In [None]:
# Extra-trees ensemble (100 estimators) fitted on the original training split.
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)
# Score once, store the percentage, and echo the raw fraction as the cell output.
et_accuracy = accuracy_score(y_pred,y_test)
ac.append(et_accuracy*100)
et_accuracy

In [None]:
# Confusion matrix of the latest predictions against the held-out labels.
cm = confusion_matrix(y_test,y_pred)
# Display names for the two classes.
# NOTE(review): assumes the sorted label order is (safe, malware) -- confirm.
classes = ['Safe', 'Malware']

cmd = ConfusionMatrixDisplay(cm, display_labels=classes)
cmd.plot()
plt.show()

In [None]:
# Bar chart and table of the accuracies collected in `ac` before balancing.
# pandas, seaborn and pyplot are already imported in the first cell, so the
# redundant in-cell imports are not repeated here.
plt.style.use('dark_background')
x=['Decision Tree','Random Forest','Extreme Random Tree']

# seaborn >= 0.12 no longer accepts x/y positionally; pass them as keywords.
ax=sns.barplot(x=x, y=ac)
ax.set_title('Accuracy comparison before Balancing')
ax.set_ylabel('Accuracy')
print("the accuracy of {} is {} and {} is {}".format(x[0],ac[0],x[1],ac[1]))
ax.set_ylim(50,100)
data={'Algorithms':x,
     "accuracy":ac}
# Use a dedicated name: assigning to `df` here would overwrite the loaded
# dataset and break every earlier cell on re-run.
accuracy_before=pd.DataFrame(data)
accuracy_before.head()

In [None]:
# Class counts in the training labels before resampling.
counter = Counter(y_train)
print('Before', counter)

In [None]:
# Apply SMOTE to the training split to turn the imbalanced dataset into a
# balanced one, then repeat the classification on the resampled data.
# Only the training split is resampled; X_test / y_test are left untouched.
ac=[]  # reset so the list holds only the post-balancing accuracies
# Seed SMOTE so the synthetic samples (and the downstream scores) are reproducible.
smt = SMOTE(random_state=randomseed)
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

counter = Counter(y_train_sm)
print('After', counter)

In [None]:
# Decision tree trained on the SMOTE-balanced training data.
from sklearn import tree
# Seed the tree so the reported accuracy is reproducible across runs.
model=tree.DecisionTreeClassifier(random_state=randomseed)
model.fit(X_train_sm, y_train_sm)
y_pred = model.predict(X_test)
# accuracy_score's signature is (y_true, y_pred); compute once and reuse.
dt_accuracy_sm = accuracy_score(y_test, y_pred)
ac.append(dt_accuracy_sm*100)
dt_accuracy_sm

In [None]:
# Random forest (trees limited to depth 2) fitted on the SMOTE-balanced training data.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train_sm, y_train_sm)
y_pred = model.predict(X_test)
# Score once, store the percentage, and echo the raw fraction as the cell output.
rf_accuracy_sm = accuracy_score(y_pred,y_test)
ac.append(rf_accuracy_sm*100)
rf_accuracy_sm

In [None]:
# Extra-trees ensemble (100 estimators) for the "after balancing" comparison.
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier(n_estimators=100, random_state=42)
# Fit on the SMOTE-resampled data like the other post-balancing models; fitting
# on X_train / y_train merely reproduced the unbalanced baseline.
model.fit(X_train_sm, y_train_sm)
y_pred = model.predict(X_test)
# accuracy_score's signature is (y_true, y_pred); compute once and reuse.
et_accuracy_sm = accuracy_score(y_test, y_pred)
ac.append(et_accuracy_sm*100)
et_accuracy_sm

In [None]:
# Confusion matrix of the latest predictions against the held-out labels.
cm = confusion_matrix(y_test,y_pred)


In [None]:
# Display names for the two classes.
# NOTE(review): assumes the sorted label order is (safe, malware) -- confirm.
classes = ['Safe', 'Malware']

# Plot the confusion matrix held in `cm`.
cmd = ConfusionMatrixDisplay(cm, display_labels=classes)
cmd.plot()
plt.show()

In [None]:
# Bar chart and table of the accuracies collected in `ac` after balancing.
# The in-cell `import matplotlib as plt` was removed: it rebound `plt` from
# matplotlib.pyplot to the top-level package, breaking plt.show()/plt.pie()
# in any cell executed afterwards. The other imports were redundant with the
# first cell.
plt.style.use('dark_background')
x=['Decision Tree','Random Forest','Extreme Random Tree']

# seaborn >= 0.12 no longer accepts x/y positionally; pass them as keywords.
ax=sns.barplot(x=x, y=ac)
ax.set_title('Accuracy comparison After Balancing')
ax.set_ylabel('Accuracy')
print("the accuracy of {} is {} and {} is {}".format(x[0],ac[0],x[1],ac[1]))
ax.set_ylim(50,100)
data={'Algorithms':x,
     "accuracy":ac}
# Use a dedicated name: assigning to `df` here would overwrite the loaded
# dataset and break every earlier cell on re-run.
accuracy_after=pd.DataFrame(data)
accuracy_after.head()

In [None]:
# Show the predictions of the most recently evaluated model.
y_pred

In [None]:
# Report each test-set prediction in words: 1 means ransomware, anything else normal.
for predicted_label in y_pred:
    message = "Ransome attack" if predicted_label==1 else "No attack"
    print(message)

In [None]:
# Predict the class of one hand-entered sample with the most recently fitted model.
# NOTE(review): the 14 values presumably follow the column order of `features`
# used for training -- confirm the count and order match.
pred=model.predict([[4194304.0, 23, 7.998828, 4, 48, 1048576, 258, 306688, 4, 6, 2, 2.146071, 12288, 335872]])

In [None]:
# Show the predicted label for the sample above.
pred