<a href="https://colab.research.google.com/github/VictoKu1/API_Security_Research/blob/master/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cisco - Ariel University API Security Detection Challenge 2023


---



---



## Dataset 1 (Task 1)

The most basic API traffic, containing the smallest number of attacks and endpoints. It is intended as a soft start.

```
Dataset 1 baseline score:

                   precision    recall  f1-score   support

    Benign          0.95715   0.93922   0.99792       480
    Malware         0.99799   0.94129   0.96881       528
    
    accuracy                            0.96825      1008
    macro avg       0.96860   0.96960   0.96824      1008
    weighted avg    0.97000   0.96825   0.96827      1008

```

[Link to the Dataset 1](https://drive.google.com/file/d/15MxHRAdwPXCENACwn8wLMkb98ZCjDeh6/view?usp=share_link)

## Baseline code

### Imports and global settings

In [None]:
# Imports, settings and first dataset view
import pandas as pd
import seaborn as sns
import numpy as np
import json

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.decomposition import TruncatedSVD

import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as ps
import matplotlib.pyplot as plt
import plotly.io as pio
pio.templates.default = "plotly_dark"
pio.templates['plotly_dark'].layout.autosize = False
pio.templates['plotly_dark'].layout.width = 1_000
pio.templates['plotly_dark'].layout.height = 800

from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer, recall_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
import pickle

# from ipywidgets import widgets
# Set pandas to show all columns when you print a dataframe
pd.set_option('display.max_columns', None)

# Global setting here you choose the dataset number and classification type for the model
dataset_number = 1   # Options are [1, 2, 3, 4]
test_type = 'label'  # Options are ['label', 'attack_type']

# Load the training JSON and flatten it into a dataframe (nested keys become dotted column names)
train_path = f'./dataset_{str(dataset_number)}_train.json'
# JSON is UTF-8 by specification, so do not depend on the platform's default encoding
with open(train_path, encoding='utf-8') as file:
    raw_ds = json.load(file)
df_org = pd.json_normalize(raw_ds, max_level=2)

# Work on a copy so the raw frame stays available
df = df_org.copy()
print("Original Data:", df_org.shape,
      "\nCopy Data:", df.shape)

df.head()

### Basic dataset label arrangements

In [None]:
# Rows with a blank attack tag get the string "Benign"; the filled tag is also kept as 'attack_type'
attack_tag = df['request.Attack_Tag'].fillna('Benign')
df['request.Attack_Tag'] = attack_tag
df['attack_type'] = attack_tag

# Row-level helper used by the apply() call below to derive the binary label.
# The same pattern works for running any row-wise function through apply().
def categorize(row):
    """Return 'Benign' when the row's attack tag is 'Benign', otherwise 'Malware'."""
    return 'Benign' if row['request.Attack_Tag'] == 'Benign' else 'Malware'

# Binary label: one categorize() call per row
df['label'] = df.apply(categorize, axis=1)

# The raw tag column is redundant now (its values live on in 'attack_type'), so drop it
df.drop(columns=['request.Attack_Tag'], inplace=True)
df

### Preprocessing Data


In [None]:
from collections import Counter

# Names of the columns flagged as uninformative. Nothing is dropped in this cell;
# the list only records which columns were reported.
COLUMNS_TO_DROP = []

n_rows = df.shape[0]

for col in df.columns:
    n_missing = df[col].isna().sum()
    # More than 50% null
    if n_missing / n_rows * 100 > 50:
        # Store the column name, not the whole Series
        COLUMNS_TO_DROP.append(col)
        print(f"Column {col} has {n_missing} NaN values, "
              f"which is {round(n_missing / n_rows * 100, 2)}%, "
              f"and has {df[col].nunique()} unique values")

for col in df.columns:
    # All rows hold the same value. nunique(dropna=False) counts NaN as a single
    # value, whereas Counter may treat every NaN as a distinct key.
    if df[col].nunique(dropna=False) == 1:
        if col not in COLUMNS_TO_DROP:
            COLUMNS_TO_DROP.append(col)
        print(f"Column {col} has the same value for all rows")

# Replace the remaining NaN cells with the string 'None'
for col in df.columns[df.isna().any()].tolist():
    df[col] = df[col].fillna('None')

print("Flagged:", len(COLUMNS_TO_DROP), " New Shape:", df.shape)
df.head()

In [None]:
# These low-cardinality columns are label-encoded (one integer code per distinct value)
SIMPLE_HEADERS = ['request.headers.Accept-Encoding',
                  # 'request.headers.Host',
                  'request.method',
                  'request.headers.Accept-Language',
                  'request.headers.Sec-Fetch-Site',
                  'request.headers.Sec-Fetch-Mode',
                  'request.headers.Sec-Fetch-Dest',
                  'response.status',
                  ]

# These free-text columns are hashed with HashingVectorizer
COMPLEX_HEADERS = ['request.headers.User-Agent',
                   'request.headers.Set-Cookie',
                   # 'request.headers.Date',  # listed in COLUMNS_TO_REMOVE, so hashing it was wasted work
                   'request.url',
                   'response.headers.Content-Type',
                   'response.body',
                   # 'response.headers.Location',
                   # 'request.headers.Content-Length',
                   # 'request.headers.Cookie',
                   # 'response.headers.Set-Cookie'
                   ]

# These columns are dropped from the frame after encoding
COLUMNS_TO_REMOVE = ['request.body',
                     'response.headers.Content-Length',
                     'request.headers.Date',
                     'request.headers.Accept', ###
                     'request.headers.Connection', ###
                     'request.headers.Sec-Fetch-User', ###
                     'request.headers.Cookie', #
                     'response.headers.Location', #
                     'request.headers.Content-Length', #
                     'response.headers.Set-Cookie', #
                     'request.headers.Host', ##
                     ]

def vector_df(df_, n_hash_features=8, expand_hashed=False):
    """Encode the header columns of ``df_`` in place and return the same frame.

    * Each column in SIMPLE_HEADERS is replaced by integer codes from a
      LabelEncoder fitted on that column of ``df_``. The encoder is refitted on
      every call, so the codes are only comparable within one frame.
    * Each column in COMPLEX_HEADERS is hashed with a HashingVectorizer.
    * Each column in COLUMNS_TO_REMOVE that is present is dropped.

    Args:
        df_: Dataframe holding the header columns; it is modified in place.
        n_hash_features: Number of hash buckets per complex header.
        expand_hashed: When False (default), only the first hash component is
            stored, under the original column name, so the set of column names
            does not change. When True, the header is replaced by
            ``n_hash_features`` columns named ``<header>_<k>``.

    Returns:
        The encoded ``df_``.
    """
    le = LabelEncoder()
    h_vec = HashingVectorizer(n_features=n_hash_features)

    # Run LabelEncoder on the chosen features
    for j in SIMPLE_HEADERS:
        print("1", j)
        df_[j] = le.fit_transform(df_[j])

    # Run HashingVectorizer on the chosen features
    for j in COMPLEX_HEADERS:
        if j in COLUMNS_TO_REMOVE:
            # Hashing a column that is dropped below is wasted work
            continue
        print("2", j)
        # toarray() gives a plain (n_rows, n_hash_features) ndarray. Assigning the
        # whole 2-D matrix to one column is rejected by recent pandas releases,
        # so the columns are selected explicitly.
        hashed = h_vec.fit_transform(df_[j]).toarray()
        if expand_hashed:
            position = df_.columns.get_loc(j)
            df_.drop(columns=[j], inplace=True)
            for k in range(hashed.shape[1]):
                df_.insert(position + k, f"{j}_{k}", hashed[:, k])
        else:
            df_[j] = hashed[:, 0]

    # Remove cols (columns absent from this frame are skipped instead of raising)
    present = [j for j in COLUMNS_TO_REMOVE if j in df_.columns]
    for j in present:
        print("3", j)
    df_.drop(columns=present, inplace=True)

    return df_

df = vector_df(df)
print("Vector_df:", df.shape)
df.head()

In [None]:
# Every column except the two target columns is a feature
features_list = list(df.columns)
for target_column in ('label', 'attack_type'):
    features_list.remove(target_column)

print(features_list)

# Column dtypes after encoding
df.dtypes

In [None]:
# Standardise the features, then project them onto the first two principal components
ss = StandardScaler()
pca = PCA(n_components=2)
scaled_features = ss.fit_transform(df[features_list].to_numpy())
x_after_pca_in_2D = pca.fit_transform(scaled_features)

# Colour each point by its binary label (Benign = 0, Malware = 1)
label_codes = df['label'].map({'Benign': 0, 'Malware': 1})
plt.scatter(x_after_pca_in_2D[:, 0], x_after_pca_in_2D[:, 1], c=label_codes)

In [None]:
# Fit a full PCA on the standardised features and accumulate the explained variance
pca = PCA()
pca.fit(ss.fit_transform(df[features_list].to_numpy()))
pca_exp_var = np.cumsum(pca.explained_variance_ratio_)

# pca_exp_var[k - 1] is the variance explained by the first k components, so the
# x axis starts at 1 rather than at the default index 0.
plt.plot(np.arange(1, len(pca_exp_var) + 1), pca_exp_var)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');


As the plot shows, the data can be compressed into X components without losing any information, where X is the number of components at which the cumulative explained variance reaches 1.0.

## Train Test Split

*   `x_train` and `y_train` will be used for _training_
*   `x_test` and `y_test` will be used for _testing_


In [None]:
# Feature matrix: the selected columns, kept as a DataFrame
x = df[features_list]

# Target vector: the column named by test_type, as a numpy array
y = np.stack(df[test_type])

# Stratified split into train and test parts
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1765, random_state=42, stratify=y)

# Shapes of the resulting splits
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# Class counts over the whole dataset
counter = Counter(y)
counter

In [None]:
# Fit the scaler on the training split only, then apply it to both splits
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

# Fit an 8-component PCA on the scaled training data
pca = PCA(n_components=8).fit(x_train)

# NOTE: despite their names, x_train_pca / x_test_pca are the scaled matrices
# themselves; the PCA projections are stored in the trailing-underscore variables.
x_train_pca, x_train_pca_ = x_train, pca.transform(x_train)
x_test_pca, x_test_pca_ = x_test, pca.transform(x_test)


In [None]:
def plot_feature_importance(model):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    The number of bars is taken from the model itself, so the plot stays valid
    after the global feature matrices are rebound to a different width. The
    bars are labelled with the global ``features_list`` when its length
    matches, and with generic names otherwise.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
    """
    importances = np.asarray(model.feature_importances_)
    n_features = importances.shape[0]
    if len(features_list) == n_features:
        names = features_list
    else:
        names = [f"feature {i}" for i in range(n_features)]

    plt.figure(figsize=(10, 10))
    plt.title("Feature importance")
    plt.barh(range(n_features), importances, align="center")
    plt.yticks(np.arange(n_features), names)
    plt.ylim([-1, n_features])
    plt.show()

In [None]:
# Feature selection with Random Forest Classifier
rfc_fs = RandomForestClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)

# Bar chart of the fitted forest's feature importances
plot_feature_importance(rfc_fs)

In [None]:
# Feature selection with AdaBoost Classifier
abc_fs = AdaBoostClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)

# Bar chart of the fitted AdaBoost model's feature importances
plot_feature_importance(abc_fs)

In [None]:
# Feature selection with Gradient Boosting Classifier
gbc_fs = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)

# Bar chart of the fitted gradient-boosting model's feature importances
plot_feature_importance(gbc_fs)

In [None]:
from sklearn.svm import LinearSVC

# L1-penalised linear SVC; its coefficients are used as feature weights
linear_svc = LinearSVC(C=0.01, penalty="l1", dual=False)
linear_svc.fit(x_train_pca, y_train)
var = linear_svc.coef_

# Horizontal bar chart of the first coefficient row, one bar per feature
n_features = x_train.shape[1]
plt.figure(figsize=(10, 10))
plt.title("Feature importance")
plt.barh(range(n_features), linear_svc.coef_[0], align="center")
plt.yticks(np.arange(n_features), features_list)
plt.ylim([-1, n_features])
plt.show()

In [None]:
# Feature selection with Decision Tree Classifier
dtc_fs = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# Bar chart of the fitted tree's feature importances
plot_feature_importance(dtc_fs)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Feature selection with Extra Trees Classifier
etc_fs = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)

# Bar chart of the fitted ensemble's feature importances
plot_feature_importance(etc_fs)

In [None]:
# Top-10 feature ranking: one importance column per model plus their row-wise mean
fs_table = pd.DataFrame({
    'Feature': features_list,
    'Random Forest': rfc_fs.feature_importances_,
    'AdaBoost': abc_fs.feature_importances_,
    'Gradient Boosting': gbc_fs.feature_importances_,
    # Linear SVC has signed coefficients, so their magnitude is used
    'Linear SVC': np.abs(linear_svc.coef_[0]),
    'Decision Tree': dtc_fs.feature_importances_,
    'Extra Trees': etc_fs.feature_importances_,
})
# 'Feature' holds strings; restrict the mean to the numeric columns, otherwise
# pandas >= 2 raises on the mixed-type rows.
fs_table['Mean'] = fs_table.mean(axis=1, numeric_only=True)
fs_table.sort_values(by='Mean', ascending=False, inplace=True)
fs_table.head(10)

In [None]:
# Feature selection

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# Create the RFE object and compute a cross-validated score
recall_scorer = make_scorer(recall_score, pos_label='Malware')
rfecv = RFECV(estimator=LogisticRegression(),
              step=1,
              cv=StratifiedKFold(2),
              # scoring=recall_scorer,
              verbose=3,
              n_jobs=-1)
# Fit on the scaled training matrix (x_train), not on x_train_pca: this cell
# rebinds x_train_pca below, and fitting on it would select from already-reduced
# data when the cell is run a second time.
rfecv.fit(x_train, y_train)

print("Optimal number of features: %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number Of Features")
plt.ylabel("Cross Validation Score (nb of correct classifications)")
plt.errorbar(range(1, len(rfecv.cv_results_["mean_test_score"]) + 1),
             rfecv.cv_results_["mean_test_score"],
             yerr=rfecv.cv_results_["std_test_score"])
plt.show()

# Always derive the reduced matrices from the full scaled ones
x_train_pca = rfecv.transform(x_train)
x_test_pca = rfecv.transform(x_test)

print(x_train_pca.shape)

In [None]:
# Grid search to find the best hyperParameters for the model
def create_grid_search(model, params):
    """Return an unfitted GridSearchCV over ``params`` for ``model``.

    The search uses 3-fold cross-validation, all available cores, verbose
    logging, and records train scores alongside test scores.
    """
    search_options = dict(
        # scoring=recall_scorer,
        n_jobs=-1,
        cv=3,
        verbose=3,
        return_train_score=True,
    )
    return GridSearchCV(estimator=model, param_grid=params, **search_options)

In [None]:
def model_report_presentation(model):
    """Print a classification report and draw a confusion-matrix heatmap.

    Predictions are made on the global ``x_test_pca`` and compared with the
    global ``y_test``.

    Args:
        model: Fitted estimator (or fitted search object) exposing ``predict``.
    """
    sns.set(rc={'figure.figsize': (15, 8)})
    pred = model.predict(x_test_pca)
    true_labels = y_test

    # One explicit label order for both the matrix and the tick labels, so they
    # stay aligned even when a class occurs only in the predictions.
    labels = np.union1d(true_labels, pred)
    cf_matrix = confusion_matrix(true_labels, pred, labels=labels)
    report_model = classification_report(true_labels, pred, digits=5)
    ax = sns.heatmap(cf_matrix,
                     annot=True,
                     cmap='Blues',
                     fmt='g',
                     xticklabels=labels,
                     yticklabels=labels)
    # confusion_matrix puts true classes on rows and predicted classes on columns
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")

    # The heatmap is cool but this is the most important result
    print(report_model)

### Model Choosing & Fitting


#### Random Forest

Ensemble learning method that uses multiple decision trees to make predictions. 
It is a robust and flexible model that can handle high-dimensional and complex data, making it a good choice for API classification tasks.

In [None]:
# Seeded so the grid-search result is reproducible across runs
rf = RandomForestClassifier(random_state=42)
rf_param = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [3, 5],
}
rf_gcv = create_grid_search(rf, rf_param)
rf_gcv.fit(x_train_pca, y_train)

In [None]:
print("Best parameters:", rf_gcv.best_params_)

In [None]:
model_report_presentation(rf_gcv)

In [None]:
# # Save the model
clf = rf_gcv.best_estimator_
# filename = "Model/"+str(dataset_number)+"/rf"+str(dataset_number)+"_model.sav"
# pickle.dump(clf, open(filename, 'wb'))

#### SVM - Support Vector Machines 

Popular choice for classification tasks, including API classification. 
They work by finding the hyperplane in a high-dimensional space that maximally separates different classes of data points.

In [None]:
# probability=True adds an internal randomised cross-validation, so the model is
# seeded to keep the probability estimates reproducible.
svc = SVC(cache_size=500, random_state=42)
svc_param = {
    "C": [1, 2, 3],
    "kernel": ["poly", "rbf", "sigmoid"],
    "probability": [True],
}
svc_gcv = create_grid_search(svc, svc_param)
svc_gcv.fit(x_train_pca, y_train)

In [None]:
print("Best parameters:", svc_gcv.best_params_)

In [None]:
model_report_presentation(svc_gcv)


In [None]:
# # Save the model
svc = svc_gcv.best_estimator_
# filename = "Model/"+str(dataset_number)+"/svc"+str(dataset_number)+"_model.sav"
# pickle.dump(svc, open(filename, 'wb'))

#### Logistic Regression

Simple yet effective linear model for classification tasks. 
It is particularly useful for predicting binary outcomes (e.g., malicious vs benign) and is often used as a baseline model for comparison with more complex models.

In [None]:
lr = LogisticRegression()
# Two sub-grids: C only matters when a penalty is applied, so the unpenalised
# model is fitted once instead of once per C value.
# NOTE(review): newer scikit-learn releases spell "no penalty" as None rather
# than the string "none" — adjust to the installed version.
lr_param = [
    {"penalty": ["l2"], "C": [1, 2, 3, 4, 5]},
    {"penalty": ["none"]},
]
lr_gcv = create_grid_search(lr, lr_param)
lr_gcv.fit(x_train_pca, y_train)

In [None]:
print("Best parameters:", lr_gcv.best_params_)

In [None]:
model_report_presentation(lr_gcv)

In [None]:
# # Save the model
lr = lr_gcv.best_estimator_
# filename = "Model/"+str(dataset_number)+"/lr"+str(dataset_number)+"_model.sav"
# pickle.dump(lr, open(filename, 'wb'))

#### KNN

Simple yet effective classification algorithm that works by finding the k nearest neighbors of a data point.
It is a non-parametric model that does not make any assumptions about the underlying data distribution.


In [None]:
# Search over neighbourhood size and vote weighting for k-nearest neighbours
knn = KNeighborsClassifier()
knn_param = dict(
    n_neighbors=[3, 5, 9, 13],
    weights=["uniform", "distance"],
)
knn_gcv = create_grid_search(knn, knn_param)
knn_gcv.fit(x_train_pca, y_train)

In [None]:
print("Best parameters:", knn_gcv.best_params_)

In [None]:
model_report_presentation(knn_gcv)

In [None]:
# # Save the model
knn = knn_gcv.best_estimator_
# filename = "Model/"+str(dataset_number)+"/knn"+str(dataset_number)+"_model.sav"
# pickle.dump(knn, open(filename, 'wb'))

#### Decision Trees

Popular choice for classification tasks.
They work by recursively partitioning the feature space into smaller and smaller regions, until each region contains only a single class of data points.


In [None]:
# Seeded so the grid-search result is reproducible across runs
dt = DecisionTreeClassifier(random_state=42)
dt_param = {
    "criterion": ["gini", "entropy"],
    "max_depth": [5, 7, 9],
}
dt_gcv = create_grid_search(dt, dt_param)
dt_gcv.fit(x_train_pca, y_train)

In [None]:
print("Best parameters:", dt_gcv.best_params_)

In [None]:
model_report_presentation(dt_gcv)

In [None]:
# # Save the model
dt = dt_gcv.best_estimator_
# filename = "Model/"+str(dataset_number)+"/dt"+str(dataset_number)+"_model.sav"
# pickle.dump(dt, open(filename, 'wb'))

#### AdaBoost

Popular ensemble method that works by combining the predictions of multiple weak learners.
It is a simple way to improve the performance of a model by combining the predictions of multiple models.

In [None]:
# Seeded so the grid-search result is reproducible across runs
abc = AdaBoostClassifier(random_state=42)
abc_param = {
    "n_estimators": [3, 5, 10],
    # NOTE(review): 100 and 1000 are far above the estimator's default learning
    # rate of 1.0 — confirm these values are intended.
    "learning_rate": [1, 100, 1000],
}
abc_gcv = create_grid_search(abc, abc_param)
abc_gcv.fit(x_train_pca, y_train)

In [None]:
print("Best parameters:", abc_gcv.best_params_)

In [None]:
model_report_presentation(abc_gcv)

In [None]:
# # Save the model
abc = abc_gcv.best_estimator_
# filename = "Model/"+str(dataset_number)+"/abc"+str(dataset_number)+"_model.sav"
# pickle.dump(abc, open(filename, 'wb'))


#### Gradient Boosting

Popular ensemble method that works by combining the predictions of multiple weak learners.
It is a simple way to improve the performance of a model by combining the predictions of multiple models.


In [None]:
# Seeded so the grid-search result is reproducible across runs
gbc = GradientBoostingClassifier(random_state=42)
gbc_param = {
    "n_estimators": [3, 5, 10],
    "learning_rate": [0.001, 0.1],
}
gbc_gcv = create_grid_search(gbc, gbc_param)
gbc_gcv.fit(x_train_pca, y_train)

In [None]:
print("Best parameters:", gbc_gcv.best_params_)

In [None]:
model_report_presentation(gbc_gcv)

In [None]:
# # Save the model
gbc = gbc_gcv.best_estimator_
# filename = "Model/"+str(dataset_number)+"/gbc"+str(dataset_number)+"_model.sav"
# pickle.dump(gbc, open(filename, 'wb'))

#### Neural Networks

Popular choice for classification tasks.
They work by learning the weights of the connections between neurons in the network.

In [None]:
# Seeded so weight initialisation, shuffling and the early-stopping split are
# reproducible across runs
mlp = MLPClassifier(random_state=42)
mlp_param = {
    "hidden_layer_sizes": [(10, 50, 100, 150, 100, 50, 10),
                           (10, 50, 100, 150, 200, 150, 100, 50, 10),
                           (10, 50, 100, 150, 200, 250, 200, 150, 100, 50, 10),
                           ],
    "activation": ["relu", "logistic"],
    "solver": ["adam", "sgd"],
    "learning_rate": ["constant", "invscaling", "adaptive"],
    "shuffle": [True], 
    "early_stopping": [True],
}
mlp_gcv = create_grid_search(mlp, mlp_param)
mlp_gcv.fit(x_train_pca, y_train)

In [None]:
print("Best parameters:", mlp_gcv.best_params_)

In [None]:
model_report_presentation(mlp_gcv)

In [None]:
# # Save the model
mlp = mlp_gcv.best_estimator_
# filename = "Model/"+str(dataset_number)+"/mlp"+str(dataset_number)+"_model.sav"
# pickle.dump(mlp, open(filename, 'wb'))

#### ExtraTreesClassifier


In [None]:
# Seeded so the grid-search result is reproducible across runs
etc = ExtraTreesClassifier(random_state=42)
etc_param = {
    "n_estimators": [300],
}
etc_gcv = create_grid_search(etc, etc_param)
etc_gcv.fit(x_train_pca, y_train)

In [None]:
print("Best parameters:", etc_gcv.best_params_)

In [None]:
model_report_presentation(etc_gcv)

In [None]:
# # Save the model
etc = etc_gcv.best_estimator_
# filename = "Model/"+str(dataset_number)+"/etc"+str(dataset_number)+"_model.sav"
# pickle.dump(etc, open(filename, 'wb'))

#### Voting Classifier

Meta-classifier that combines the predictions of multiple classifiers.
It is a simple way to improve the performance of a model by combining the predictions of multiple models.


In [None]:
# Voting ensemble over the nine tuned estimators; the grid tries several
# weightings under both hard and soft voting.
vc = VotingClassifier(estimators=[("clf",clf),
                                  ("svc",svc),
                                  ("lr",lr),
                                  ("knn",knn),
                                  ("dt",dt),
                                  ("abc",abc),
                                  ("gbc",gbc),
                                  ("mlp",mlp),
                                  ("etc",etc)])
vc_param = {
    # One weight per estimator, in the order passed to `estimators` above
    "weights": [[1,1,1,1,1,1,1,1,1],
                [2,1,1,1,1,1,1,1,1],
                [1,2,1,1,1,1,1,1,1],
                [1,1,2,1,1,1,1,1,1],
                [1,1,1,2,1,1,1,1,1],
                [1,1,1,1,3,1,1,1,1],  # NOTE(review): the only row that uses 3 instead of 2 — confirm
                [1,1,1,1,1,2,1,1,1],
                [1,1,1,1,1,1,2,1,1],
                [1,1,1,1,1,1,1,2,1],
                [1,1,1,1,1,1,2,1,2]],  # NOTE(review): boosts gbc and etc together; no row boosts etc alone — confirm
    "voting": ["hard", "soft"],
}
vc_gcv = create_grid_search(vc, vc_param)
vc_gcv.fit(x_train_pca, y_train)

In [None]:
print("Best parameters:", vc_gcv.best_params_)

In [None]:
model_report_presentation(vc_gcv)

In [None]:
# # Save the model
vc = vc_gcv.best_estimator_
# filename = "Model/"+str(dataset_number)+"/vc"+str(dataset_number)+"_model.sav"
# pickle.dump(vc, open(filename, 'wb'))

#### Stacking Classifier

Meta-classifier that combines the predictions of multiple classifiers.
It is a simple way to improve the performance of a model by combining the predictions of multiple models.


In [None]:
# Stacking ensemble: the tuned estimators feed a logistic-regression meta-learner
sc = StackingClassifier(estimators=[("clf",clf),
                                    ("svc",svc),
                                    ("lr",lr),
                                    ("knn",knn),
                                    ("dt",dt),
                                    ("abc",abc),
                                    ("gbc",gbc),
                                    ("mlp",mlp),
                                    ("etc",etc)],
                        final_estimator=LogisticRegression())
# cv="prefit" is not searched: GridSearchCV clones the stacking estimator, and
# with it the base estimators, for every candidate, so they are unfitted and
# every "prefit" candidate fails. The default internal cross-validation is used.
sc_param = {"final_estimator__C": [1, 2, 3]}
sc_gcv = create_grid_search(sc, sc_param)
sc_gcv.fit(x_train_pca, y_train)

In [None]:
print("Best parameters:", sc_gcv.best_params_)

In [None]:
model_report_presentation(sc_gcv)

In [None]:
# # Save the model
sc = sc_gcv.best_estimator_
# filename = "Model/"+str(dataset_number)+"/sc"+str(dataset_number)+"_model.sav"
# pickle.dump(sc, open(filename, 'wb'))

### Choose The Best Model


In [None]:
# Candidate models and their display names, index-aligned
models = [clf, svc, lr, knn, dt, abc, gbc, mlp, vc, sc, etc]

model_names = ["Random Forest",
               "SVM",
               "Logistic Regression",
               "KNN",
               "Decision Tree",
               "AdaBoost",
               "Gradient Boosting",
               "Neural Network",
               "Voting Classifier",
               "Stacking Classifier",
               "Extra Trees"]

# One numeric score per model. The F1 value is read from the report dictionary
# by key instead of by token position in the printed report, which shifts as soon
# as a class name contains a space.
model_scores = []

for model in models:
    predictions = model.predict(x_test_pca)
    report = classification_report(y_test, predictions, digits=5, output_dict=True)
    # F1 of the 'Malware' class when it exists, otherwise the macro-averaged F1
    scored_row = report.get('Malware', report['macro avg'])
    model_scores.append(scored_row['f1-score'])

#### Result


In [None]:
# Index of the highest score; float() makes the comparison numeric whether the
# scores are stored as numbers or as formatted strings.
best_index = max(range(len(model_scores)), key=lambda i: float(model_scores[i]))
best_model = models[best_index]
best_model_name = model_names[best_index]
best_model_score = model_scores[best_index]

print("Best model:", best_model_name)
print("Score:", str(best_model_score))

# Report the winning model itself: `predictions` still holds the output of the
# last model evaluated in the scoring loop.
best_predictions = best_model.predict(x_test_pca)
model_report = classification_report(y_test, best_predictions, digits=5)
print(model_report)

## Test


In [None]:
# Read the validation json

# JSON is UTF-8 by specification, so do not depend on the platform's default encoding
with open(f'./dataset_{str(dataset_number)}_val.json', encoding='utf-8') as file:
    raw_ds = json.load(file)
test_df = pd.json_normalize(raw_ds, max_level=2)

In [None]:
# Preprocess data

# Replace NaN values with the string 'None'
nan_columns = test_df.columns[test_df.isna().any()].tolist()
for column in nan_columns:
    test_df[column] = test_df[column].fillna('None')

# Encode, hash and prune the columns with the same helper used for the training frame
test_df = vector_df(test_df)

test_df.head()

In [None]:
# Run the model

# Pass the DataFrame rather than .to_numpy(): the scaler was fitted on a
# DataFrame, so this lets it check the column names and order and avoids the
# "X does not have valid feature names" warning.
x = test_df[features_list]
x = ss.transform(x)
x = rfecv.transform(x)
# NOTE(review): predictions come from the voting classifier `vc`, not from the
# `best_model` selected earlier — confirm this is intended.
pred_ = vc.predict(x)

In [None]:
# Save your predictions

# Fit the encoder on the training labels, not on the predictions, so each class
# always maps to the same integer even when a class is missing from the
# predictions (e.g. an all-'Malware' prediction must not be written as 0).
enc = LabelEncoder()
enc.fit(y_train)
np.savetxt(f'./dataset_{str(dataset_number)}_{test_type}_result.txt',
           enc.transform(pred_), fmt='%2d')