In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from ast import literal_eval
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.model_selection import ParameterGrid
from keras.utils import to_categorical
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
# from keras.models import Sequential
# from keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Flatten
import os
import openpyxl


In [None]:
def read_data_from_csv(file_path):
    """Load the CSV file at ``file_path`` and hand it back as a DataFrame."""
    return pd.read_csv(file_path)

def xlsx_to_csv_pd(path_xls):
    """Convert an Excel workbook into a UTF-8 CSV stored next to it.

    The target name is ``path_xls`` with everything after its last '.'
    replaced by 'csv'. The first spreadsheet column is read as the index and
    is written back out as the first CSV column.

    Returns the path of the CSV file that was written.
    """
    stem = path_xls.rsplit('.', 1)[0]
    path_csv = stem + '.csv'
    pd.read_excel(path_xls, index_col=0).to_csv(path_csv, encoding='utf-8')
    return path_csv

# Label convention: 0 = legitimate package, 1 = malware.
# Four feature sets are available -- keep exactly one `file_path` active.
file_path = 'data/Classifer_BaseOnNPM/embedding/pm_npm/all.csv'
# file_path = 'data/Classifer_BaseOnNPM/embedding/pm_npm/dynamic.csv'
# file_path = 'data/Classifer_BaseOnNPM/embedding/pm_npm/meta4.csv'
# file_path = 'data/Classifer_BaseOnNPM/embedding/pm_npm/static.csv'

# Static features without the 'no get' entries:
# file_path = 'data/Classifer_BaseOnNPM/embedding/pm_static_noget/npm.csv'

df = read_data_from_csv(file_path)

# Draw 200 rows per label with a fixed seed, then stack them (label 0 first)
# into the working frame used by every later cell.
label_0, label_1 = (
    df[df['label'] == wanted_label].sample(n=200, random_state=42)
    for wanted_label in (0, 1)
)
df = pd.concat([label_0, label_1])

# The 'feature' cells arrive as text; parse each one into a Python literal.
df['feature'] = df['feature'].apply(literal_eval)

# Result table: one [model, best_params, accuracy, precision, recall, f1]
# row is appended per evaluated model.
endan = []

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def preprocess_data(df):
    """Expand, split and standardise the feature vectors stored in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'feature' column whose cells are numeric sequences and a
        'label' column holding the targets.

    Returns
    -------
    X_train, X_test
        Standardised feature matrices for the 80 % / 20 % split
        (``random_state=42``).
    y_train, y_test
        The matching slices of ``df['label']``.

    Notes
    -----
    The scaler is fitted on the training rows only and then applied to the
    test rows. Fitting it on the full matrix before splitting would let the
    test set's mean/variance leak into training. Row membership of the split
    is the same as before because it depends only on the row count and seed.
    """
    # One column per vector position; positions missing from shorter vectors
    # become NaN and are replaced by 0 (other imputation strategies may fit
    # the data better).
    features = (
        pd.DataFrame(df['feature'].tolist())
        .astype(float)
        .fillna(0)
        .to_numpy()
    )

    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        features, df['label'], test_size=0.2, random_state=42
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)  # statistics from train only
    X_test = scaler.transform(X_test_raw)        # reuse the train statistics

    return X_train, X_test, y_train, y_test

def train_and_predict(model, X_train, X_test, y_train, param_grid):
    """Grid-search ``model`` on the training data and predict the test set.

    A 5-fold, accuracy-scored ``GridSearchCV`` is run over ``param_grid``;
    the best estimator it produces is used to label ``X_test``.

    Returns a 4-tuple:
    ``(test predictions, best estimator, best parameters, best CV score)``.
    """
    search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    search.fit(X_train, y_train)

    tuned_model = search.best_estimator_
    predictions = tuned_model.predict(X_test.astype(float))

    return predictions, tuned_model, search.best_params_, search.best_score_

# The Main Function Of Machine Learning
def traditional_machine():
    """Tune, fit and evaluate every classical classifier on the global ``df``.

    For each (estimator, grid) pair the function runs ``train_and_predict``,
    prints the best estimator / parameters / cross-validation accuracy, then
    prints test-set accuracy, precision, recall, F1 and the confusion matrix.

    Side effect: appends
    ``[model_name, best_params, accuracy, precision, recall, f1]`` to the
    global ``endan`` list for every model. Returns ``None``.
    """
    X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)

    # Ordered (estimator, grid) pairs. Estimators that draw random numbers
    # are seeded with the same seed used for sampling and splitting so a
    # re-run reproduces the reported numbers.
    models_and_params = [
        (LogisticRegression(), {'C': [0.001, 0.01, 0.1, 1, 10, 100]}),
        (DecisionTreeClassifier(random_state=42),
         {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}),
        (RandomForestClassifier(random_state=42),
         {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}),
        (SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}),
        # (GaussianNB(), {}),  # disabled in the original notebook
        (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
        (MLPClassifier(random_state=42), {'hidden_layer_sizes': [(50,), (100,), (50, 50)]}),
    ]

    for model, params in models_and_params:
        model_name = model.__class__.__name__
        print(f"Tuning {model_name} parameters and performing cross-validation evaluation...")

        y_pred, best_model, best_params, best_score = train_and_predict(
            model, X_train_scaled, X_test_scaled, y_train, params
        )

        # Report what the grid search selected.
        print(f"Best model: {best_model}")
        print(f"Best parameters: {best_params}")
        print(f"Best cross-validation accuracy: {best_score:.2f}")

        # Score the tuned model on the held-out test split.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print(f"{model_name} test accuracy: {accuracy:.10f}")
        print(f"{model_name} Precision: {precision:.10f}")
        print(f"{model_name} Recall: {recall:.10f}")
        print(f"{model_name} F1-Score: {f1:.10f}")
        endan.append([model_name, best_params, accuracy, precision, recall, f1])

        cm = confusion_matrix(y_test, y_pred)
        print(f"{model_name} Confusion Matrix:\n{cm}")

# Tune and evaluate every classical model; each model's result row is
# appended to the global `endan` list.
traditional_machine()


In [None]:
# ---------------------------------------------------------------------------
# LSTM baseline
# ---------------------------------------------------------------------------
# Reuse preprocess_data so the LSTM is trained and scored on the same
# NaN-filled, standardised 80/20 split as the classical models.
X_train, X_test, y_train, y_test = preprocess_data(df)

# Add a trailing axis of size 1: every feature position becomes one timestep
# carrying a single value, i.e. (samples, n_features, 1).
X_train_lstm = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_lstm = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Only the settings the LSTM actually consumes. Listing CNN-only keys
# ('filters', 'kernel_size') here would retrain every (epochs, batch_size)
# pair four times without changing the model.
param_grid = {
    'epochs': [10, 20],
    'batch_size': [32, 64],
}

best_params = None
best_performance = 0.0
best_precision = 0
best_recall = 0
best_f1 = 0

# NOTE(review): the winning configuration is chosen by test-set accuracy, so
# the figures reported below are optimistic. Consider selecting on a
# validation split and scoring the test set once.
for params in ParameterGrid(param_grid):
    print("Current Parameters:", params)

    # A fresh model per configuration: one 50-unit LSTM layer followed by a
    # single sigmoid output for the binary label.
    lstm_model = Sequential()
    lstm_model.add(LSTM(50, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
    lstm_model.add(Dense(1, activation='sigmoid'))

    lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    lstm_model.fit(
        X_train_lstm,
        y_train,
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        validation_split=0.1,
    )

    # Threshold the sigmoid output at 0.5 and flatten (n, 1) -> (n,) so the
    # predictions have the same shape as y_test.
    lstm_predictions = (lstm_model.predict(X_test_lstm) > 0.5).astype(int).ravel()

    lstm_accuracy = accuracy_score(y_test, lstm_predictions)
    lstm_precision = precision_score(y_test, lstm_predictions)
    lstm_recall = recall_score(y_test, lstm_predictions)
    lstm_f1 = f1_score(y_test, lstm_predictions)

    print("LSTM Accuracy:", lstm_accuracy)
    print("LSTM Precision:", lstm_precision)
    print("LSTM Recall:", lstm_recall)
    print("LSTM F1-Score:", lstm_f1)

    lstm_cm = confusion_matrix(y_test, lstm_predictions)
    print("LSTM Confused Matrix:\n", lstm_cm)

    # Remember the most accurate configuration seen so far.
    if lstm_accuracy > best_performance:
        best_performance = lstm_accuracy
        best_params = params
        best_precision = lstm_precision
        best_recall = lstm_recall
        best_f1 = lstm_f1

print("Best Parameters:", best_params)
print("Best Performance (Accuracy):", best_performance)
print("LSTM Recall:", best_recall)
print("LSTM Precision:", best_precision)
print("LSTM F1-Score:", best_f1)
endan.append(['LSTM', best_params, best_performance, best_precision, best_recall, best_f1])



In [None]:
# ---------------------------------------------------------------------------
# 1-D CNN baseline
# ---------------------------------------------------------------------------
# Reuse preprocess_data so the CNN is trained and scored on the same
# NaN-filled, standardised 80/20 split as the classical models.
X_train, X_test, y_train, y_test = preprocess_data(df)

# Conv1D input layout: (samples, n_features, 1) -- one channel per position.
X_train_cnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_cnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

param_grid = {
    'epochs': [10, 20],
    'batch_size': [32, 64],
    'filters': [32, 64],
    'kernel_size': [3, 5]
}

# Reset every tracker, including best_params. Without this reset the cell
# would rely on a value left behind by another cell and could report that
# cell's parameters as the CNN's (or fail on a fresh kernel).
best_params = None
best_performance = 0.0
best_precision = 0
best_recall = 0
best_f1 = 0

# NOTE(review): the winning configuration is chosen by test-set accuracy, so
# the figures reported below are optimistic. Consider selecting on a
# validation split and scoring the test set once.
for params in ParameterGrid(param_grid):
    print("Current Parameters:", params)

    # A fresh model per configuration:
    # Conv1D -> MaxPooling1D -> Flatten -> single sigmoid output.
    cnn_model = Sequential()
    cnn_model.add(Conv1D(filters=params['filters'], kernel_size=params['kernel_size'], activation='relu', input_shape=(X_train_cnn.shape[1], X_train_cnn.shape[2])))
    cnn_model.add(MaxPooling1D(pool_size=2))
    cnn_model.add(Flatten())
    cnn_model.add(Dense(1, activation='sigmoid'))

    cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    cnn_model.fit(
        X_train_cnn,
        y_train,
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        validation_split=0.1,
    )

    # Threshold the sigmoid output at 0.5 and flatten (n, 1) -> (n,) so the
    # predictions have the same shape as y_test.
    cnn_predictions = (cnn_model.predict(X_test_cnn) > 0.5).astype(int).ravel()

    cnn_accuracy = accuracy_score(y_test, cnn_predictions)
    cnn_precision = precision_score(y_test, cnn_predictions)
    cnn_recall = recall_score(y_test, cnn_predictions)
    cnn_f1 = f1_score(y_test, cnn_predictions)

    print("CNN Accuracy:", cnn_accuracy)
    print("CNN Precision:", cnn_precision)
    print("CNN Recall:", cnn_recall)
    print("CNN F1-Score:", cnn_f1)

    cnn_cm = confusion_matrix(y_test, cnn_predictions)
    print("CNN Confused Matrix:\n", cnn_cm)

    # Remember the most accurate configuration seen so far.
    if cnn_accuracy > best_performance:
        best_performance = cnn_accuracy
        best_params = params
        best_precision = cnn_precision
        best_recall = cnn_recall
        best_f1 = cnn_f1

print("Best Parameters:", best_params)
print("Best Performance (Accuracy):", best_performance)
print("CNN Recall:", best_recall)
print("CNN Precision:", best_precision)
print("CNN F1-Score:", best_f1)
endan.append(['CNN', best_params, best_performance, best_precision, best_recall, best_f1])


In [None]:
# Dump the collected result rows: one
# [model name, best params, accuracy, precision, recall, f1] list per model.
print(endan)