<a href="https://colab.research.google.com/github/RONOGIT/NEWREPO/blob/main/n1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the dataset from CSV file
file_path = 'your_file_path.csv'  # placeholder: point this at the real dataset
df = pd.read_csv(file_path)

# The last column is treated as the target variable; every other column is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Balanced class weights, n_samples / (n_classes * class_count), keyed by the
# labels that actually occur in the target. They are derived from `y` (the
# column used for training) rather than from a hard-coded 'Class' column, and
# they no longer assume the labels are exactly 1..n_classes, which divided by
# zero for any label missing from that range (e.g. a 0/1 target).
class_counts = y.value_counts()
n_samples = len(y)
n_classes = len(class_counts)
weights = {label: n_samples / (n_classes * count) for label, count in class_counts.items()}

# Random forest classifier (seeded so reruns give the same model)
Cf = RandomForestClassifier(random_state=42)
Cf.fit(X_train, y_train)

# Bernoulli naive Bayes classifier
Bf = BernoulliNB()
Bf.fit(X_train, y_train)

# Gaussian naive Bayes classifier
Gf = GaussianNB()
Gf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second entry of
# classes_, so thresholding it is only meaningful for a two-class target.
if len(Cf.classes_) != 2:
    raise ValueError(
        "Threshold voting needs a two-class target; found %d classes" % len(Cf.classes_)
    )
negative_label, positive_label = Cf.classes_
positive_weight = weights[positive_label]

# Probability of the positive class from each base classifier. The Gaussian
# model used to be counted twice (P3 and P4); it now votes once.
P1 = Cf.predict_proba(X_test)[:, 1]
P2 = Bf.predict_proba(X_test)[:, 1]
P3 = Gf.predict_proba(X_test)[:, 1]
probabilities = [P1, P2, P3]

# Lambda values (decision thresholds on the class-weighted probability)
lambda_values = [0.3, 0.4, 0.5]

for lambda_val in lambda_values:
    # A classifier votes "positive" when its class-weighted probability reaches
    # the threshold; the majority of votes decides the label. The result is a
    # real class label, so accuracy_score compares like with like (the previous
    # version compared raw vote counts 0..4 against the labels).
    predictions = []
    for row_probabilities in zip(*probabilities):
        votes = sum(1 for p in row_probabilities if p * positive_weight >= lambda_val)
        majority = 2 * votes > len(row_probabilities)
        predictions.append(positive_label if majority else negative_label)

    # Output the final predicted values
    print("Lambda:", lambda_val)
    print("Final Predictions:", predictions)
    print("Accuracy:", accuracy_score(y_test, predictions))


In [None]:
import pandas as pd

def separate_features_by_data_type(data):
    """Split a DataFrame into four column subsets by dtype.

    Returns a 4-tuple of DataFrames, in this order: columns selected with
    ``'object'``, ``'bool'``, ``'int'`` and ``'float'``. Each element is the
    result of ``data.select_dtypes(include=...)`` for that dtype selector.
    """
    dtype_selectors = ('object', 'bool', 'int', 'float')
    return tuple(data.select_dtypes(include=selector) for selector in dtype_selectors)

# Example usage: read the dataset and preview each dtype group.
# The CSV path is hard-coded; adjust it to where the file actually lives.
df = pd.read_csv('/content/UNSW_2018_IoT_Botnet_Final_10_best_Testing.csv')

# One DataFrame per dtype group, in the order the helper returns them
(object_features,
 binary_features,
 integer_features,
 float_features) = separate_features_by_data_type(df)

# Heading on one line, first rows of the group on the following lines
print("Object Features:", object_features.head(), sep="\n")

print("\nBinary Features:", binary_features.head(), sep="\n")

print("\nInteger Features:", integer_features.head(), sep="\n")

print("\nFloat Features:", float_features.head(), sep="\n")


Object Features:
  proto            saddr  sport          daddr dport category subcategory
0   udp  192.168.100.150  48516  192.168.100.3    80      DoS         UDP
1   tcp  192.168.100.148  22267  192.168.100.3    80     DDoS         TCP
2   udp  192.168.100.149  28629  192.168.100.3    80     DDoS         UDP
3   tcp  192.168.100.148  42142  192.168.100.3    80     DDoS         TCP
4   tcp  192.168.100.149   1645  192.168.100.5    80      DoS         TCP

Binary Features:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]

Integer Features:
   pkSeqID     seq  N_IN_Conn_P_SrcIP  state_number  N_IN_Conn_P_DstIP  attack
0   792371  175094                100             4                100       1
1  2056418  143024                100             1                100       1
2  2795650  167033                 73             4                100       1
3  2118009  204615                 56             1                100       1
4   303688   40058                100             3     

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

def preprocess_object_features(object_features):
    """Label-encode every column and rescale the resulting codes with MinMaxScaler.

    Parameters
    ----------
    object_features : pandas.DataFrame
        Columns to encode; each column is encoded independently.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column labels, holding the
        min-max scaled label codes. The input frame is left untouched.
    """
    # Work on a copy: the argument is usually a slice produced by
    # select_dtypes(), and writing into it in place silently changed the
    # caller's data (and made re-running the cell non-idempotent).
    encoded = object_features.copy()

    for column in encoded.columns:
        # Label encoding: distinct values -> integer codes
        codes = LabelEncoder().fit_transform(encoded[column])

        # Min-Max scaling: the scaler expects a 2-D (n_samples, 1) array;
        # flatten the result back to 1-D before storing it as a column.
        scaled = MinMaxScaler().fit_transform(codes.reshape(-1, 1))
        encoded[column] = scaled.ravel()

    return encoded

# Example usage: the object-dtype group is the first element returned by
# separate_features_by_data_type.
object_features = separate_features_by_data_type(df)[0]

# Encode and scale the object columns
preprocessed_object_features = preprocess_object_features(object_features)

# Heading, then the first rows of the encoded frame on the following lines
print("Preprocessed Object Features:", preprocessed_object_features.head(), sep="\n")


Preprocessed Object Features:
   proto     saddr     sport     daddr     dport  category  subcategory
0   1.00  0.266667  0.653076  0.279070  0.894142       0.2     0.857143
1   0.75  0.133333  0.208084  0.279070  0.894142       0.0     0.714286
2   1.00  0.200000  0.315933  0.279070  0.894142       0.0     0.857143
3   0.75  0.133333  0.545059  0.279070  0.894142       0.0     0.714286
4   0.75  0.200000  0.109467  0.325581  0.894142       0.2     0.714286


In [None]:
# Show the column labels of the current df
print(df.columns)

Index(['proto', 'saddr', 'sport', 'daddr', 'dport', 'category', 'subcategory',
       'pkSeqID', 'seq', 'N_IN_Conn_P_SrcIP', 'state_number',
       'N_IN_Conn_P_DstIP', 'attack', 'stddev', 'min', 'mean', 'drate',
       'srate', 'max'],
      dtype='object')


In [None]:
import numpy as np
import pandas as pd

def calculate_class_weights(data):
    """Compute one weight per distinct value of ``data['category']``.

    The weight of a value is ``len(data) / (n_distinct * occurrences)``.
    Weights are returned as a list, ordered like ``data['category'].unique()``.
    """
    labels = data['category']
    distinct_labels = labels.unique()
    n_rows = len(data)
    n_distinct = len(distinct_labels)

    weights = []
    for label in distinct_labels:
        occurrences = (labels == label).sum()
        weights.append(n_rows / (n_distinct * occurrences))
    return weights

# Example usage: rebuild df by placing the encoded object columns next to the
# bool, integer and float groups, then weight the 'category' values.
df = pd.concat(
    [preprocessed_object_features, binary_features, integer_features, float_features],
    axis='columns',
)
class_weights = calculate_class_weights(df)

# Show the calculated class weights
print("Class Weights:", class_weights)


Class Weights: [0.37006953150392347, 0.31766422644901, 6.721217295975376, 1106.970486111111, 7590.6547619047615, 106269.16666666667]


In [None]:
import numpy as np
import pandas as pd

def calculate_class_weights(data, category, subcategory):
    """Balanced weights for the values found in two label columns.

    The label space is the set of distinct values appearing in either column.
    Each value's weight is ``len(data) / (n_labels * occurrences)``, where
    ``occurrences`` counts the value across BOTH columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the two label columns.
    category, subcategory : column labels
        Names of the two label columns in ``data``.

    Returns
    -------
    list
        One weight per distinct value, in the sorted order produced by
        ``numpy.unique``. Empty when ``data`` has no rows.
    """
    total_samples = len(data)

    # One pass over the flattened label values yields both the distinct labels
    # and how often each occurs across the two columns.
    labels, occurrences = np.unique(
        data[[category, subcategory]].to_numpy(), return_counts=True
    )
    total_classes = len(labels)

    # The previous expression, total_classes * count_cat + count_sub, scaled
    # only the first column's count because of operator precedence; the class
    # count must multiply the combined number of occurrences.
    class_weights = [total_samples / (total_classes * count) for count in occurrences]

    return class_weights

# Example usage: rebuild df from the encoded object columns and the bool,
# integer and float groups, then weight the values of the two label columns
# 'category' and 'subcategory'.
feature_groups = (preprocessed_object_features, binary_features, integer_features, float_features)
df = pd.concat(feature_groups, axis=1)
class_weights = calculate_class_weights(df, category='category', subcategory='subcategory')

# Show the calculated class weights
print("Class Weights:", class_weights)


Class Weights: [0.15881419201568975, 45543.92857142857, 0.18503476575196173, 6641.822916666667, 553.4852430555555, 201.3945041061276, 50.42427837089759, 3.360608647987688, 2.3067224762676544, 3795.3273809523807, 1.849102730668399, 49047.307692307695]


In [None]:
# Assuming df is your DataFrame.
# DataFrame.info() writes the summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line after the summary.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 637615 entries, 0 to 637614
Data columns (total 19 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   proto              637615 non-null  float64
 1   saddr              637615 non-null  float64
 2   sport              637615 non-null  float64
 3   daddr              637615 non-null  float64
 4   dport              637615 non-null  float64
 5   category           637615 non-null  float64
 6   subcategory        637615 non-null  float64
 7   pkSeqID            637615 non-null  int64  
 8   seq                637615 non-null  int64  
 9   N_IN_Conn_P_SrcIP  637615 non-null  int64  
 10  state_number       637615 non-null  int64  
 11  N_IN_Conn_P_DstIP  637615 non-null  int64  
 12  attack             637615 non-null  int64  
 13  stddev             637615 non-null  float64
 14  min                637615 non-null  float64
 15  mean               637615 non-null  float64
 16  dr

In [None]:
from sklearn.naive_bayes import CategoricalNB, BernoulliNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Assuming df is your preprocessed DataFrame containing features and labels
# Replace 'category', 'binary', 'integer', 'float', 'Class' with the actual column names in your dataset

# Naive Bayes categorical classification
def naive_bayes_categorical(X_train, y_train, X_test):
    """Fit a CategoricalNB on the training data and score the test rows.

    Returns column 1 of ``predict_proba(X_test)`` as a 1-D array.
    """
    model = CategoricalNB().fit(X_train, y_train)
    class_probabilities = model.predict_proba(X_test)
    return class_probabilities[:, 1]

# Naive Bayes binary classification
def naive_bayes_binary(X_train, y_train, X_test):
    """Fit a BernoulliNB on the training data and score the test rows.

    Returns column 1 of ``predict_proba(X_test)`` as a 1-D array.
    """
    model = BernoulliNB().fit(X_train, y_train)
    class_probabilities = model.predict_proba(X_test)
    return class_probabilities[:, 1]

# Naive Bayes Gaussian classification
def naive_bayes_gaussian(X_train, y_train, X_test):
    """Fit a GaussianNB on the training data and score the test rows.

    Returns column 1 of ``predict_proba(X_test)`` as a 1-D array.
    """
    model = GaussianNB().fit(X_train, y_train)
    class_probabilities = model.predict_proba(X_test)
    return class_probabilities[:, 1]

# Feature encoding for Naive Bayes categorical classification
def feature_encoding_categorical(data, target_column):
    """Return a copy of ``data`` whose ``target_column`` is label-encoded.

    The input frame is not modified; only the named column of the copy is
    replaced by the output of ``LabelEncoder.fit_transform``.
    """
    result = data.copy()
    result[target_column] = LabelEncoder().fit_transform(data[target_column])
    return result

# Split data into train and test sets
# Adjust column names as needed
# NOTE(review): 'Class', 'integer' and 'float' below are placeholder names (the
# comment at the top of this cell says to replace them); the df.columns listing
# printed earlier in this notebook contains none of them, so this cell
# presumably cannot run against that df until real column names are filled in.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['Class'], axis=1), df['Class'], test_size=0.2, random_state=42
)

# Apply Naive Bayes categorical classification
#predicted_categorical = naive_bayes_categorical(X_train[['category']], y_train, X_test[['category']])

# Apply Naive Bayes binary classification
#predicted_binary = naive_bayes_binary(X_train[['binary']], y_train, X_test[['binary']])

# Apply Naive Bayes Gaussian classification for integer and float features
predicted_gaussian = naive_bayes_gaussian(X_train[['integer', 'float']], y_train, X_test[['integer', 'float']])

# Combine predictions using the voting technique
# Only the Gaussian probabilities are averaged here, so the "vote" is just that
# single probability rounded to 0 or 1.
#final_predictions = np.round(np.mean([predicted_categorical, predicted_binary, predicted_gaussian], axis=0))
final_predictions = np.round(np.mean([predicted_gaussian], axis=0))

# Evaluate accuracy
# The rounded values are 0/1, so the comparison is only meaningful when the
# target itself is coded 0/1 - TODO confirm once the real target is chosen.
accuracy = accuracy_score(y_test, final_predictions)
print("Final Accuracy:", accuracy)


In [None]:
from sklearn.naive_bayes import CategoricalNB, BernoulliNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Assuming df is your preprocessed DataFrame containing features and labels
# Replace 'Class' with the actual column name in your dataset

# Naive Bayes categorical classification
#def naive_bayes_categorical(X_train, y_train, X_test):
    #clf = CategoricalNB()
    #clf.fit(X_train, y_train)
    #return clf.predict_proba(X_test)[:, 1]

# Naive Bayes binary classification
def naive_bayes_binary(X_train, y_train, X_test):
    """Train BernoulliNB on (X_train, y_train); return column 1 of the
    probability matrix predicted for X_test."""
    fitted = BernoulliNB().fit(X_train, y_train)
    return fitted.predict_proba(X_test)[:, 1]

# Naive Bayes Gaussian classification
def naive_bayes_gaussian(X_train, y_train, X_test):
    """Train GaussianNB on (X_train, y_train); return column 1 of the
    probability matrix predicted for X_test."""
    fitted = GaussianNB().fit(X_train, y_train)
    return fitted.predict_proba(X_test)[:, 1]

# Feature encoding for Naive Bayes categorical classification
#def feature_encoding_categorical(data, target_column):
    #le = LabelEncoder()
    #encoded_data = data.copy()
    #encoded_data[target_column] = le.fit_transform(encoded_data[target_column])
    #return encoded_data

# Split data into train and test sets
# Adjust column names as needed
# NOTE(review): the recorded run of this cell ended in
# "ValueError: Unknown label type" listing values such as 0.2. df.info() in this
# notebook reports 'category' as float64 (it was label-encoded and min-max
# scaled earlier), so the target passed to the classifiers is presumably seen as
# continuous - confirm, and use integer class codes for the target instead.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['category'], axis=1), df['category'], test_size=0.2, random_state=42
)

# Apply Naive Bayes categorical classification
#final_predictions = np.round(np.mean([predicted_categorical, predicted_binary, predicted_gaussian], axis=0))
#predicted_categorical = naive_bayes_categorical(X_train.select_dtypes(include=['object']), y_train, X_test.select_dtypes(include=['object']))

# Apply Naive Bayes binary classification
# Uses only the int64 feature columns.
predicted_binary = naive_bayes_binary(X_train.select_dtypes(include=['int64']), y_train, X_test.select_dtypes(include=['int64']))

# Apply Naive Bayes Gaussian classification for integer and float features
predicted_gaussian = naive_bayes_gaussian(X_train.select_dtypes(include=['float64', 'int64']), y_train, X_test.select_dtypes(include=['float64', 'int64']))

# Combine predictions using the voting technique
# Both helpers return one probability column, so the rounded mean is 0 or 1.
# NOTE(review): 'category' has more than two distinct values in this notebook,
# so a 0/1 vote cannot represent every class - verify the intended target.
final_predictions = np.round(np.mean([predicted_binary, predicted_gaussian], axis=0))
#final_predictions = np.round(np.mean([predicted_categorical, predicted_binary, predicted_gaussian], axis=0))
# Evaluate accuracy
accuracy = accuracy_score(y_test, final_predictions)
print("Final Accuracy:", accuracy)


ValueError: Unknown label type: (array([0. , 0.2, 0. , ..., 0. , 0. , 0.2]),)

In [None]:
# Assuming df is your preprocessed DataFrame containing features and labels
# Replace 'Class' with the actual column name in your dataset

# Feature encoding for Naive Bayes binary classification
def feature_encoding_binary(data, target_column):
    """Copy ``data`` and replace ``target_column`` in the copy with its
    LabelEncoder codes; the original frame is left unchanged."""
    encoded_copy = data.copy()
    encoder = LabelEncoder()
    encoded_copy[target_column] = encoder.fit_transform(encoded_copy[target_column])
    return encoded_copy

# Split data into train and test sets
# Adjust column names as needed
# 'category' is removed from the features here and kept only as the target.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['category'], axis=1), df['category'], test_size=0.2, random_state=42
)

# Apply Naive Bayes categorical classification
#predicted_categorical = naive_bayes_categorical(X_train.select_dtypes(include=['object']), y_train, X_test.select_dtypes(include=['object']))

# Apply Naive Bayes binary classification
# NOTE(review): the recorded run of this cell ended in "KeyError: 'category'".
# X_train / X_test no longer contain 'category' (dropped in the split above), so
# asking feature_encoding_binary to encode that column of the int64 subset looks
# up a column that is not there. The labels live in y_train / y_test.
X_train_binary = feature_encoding_binary(X_train.select_dtypes(include=['int64']), 'category')
X_test_binary = feature_encoding_binary(X_test.select_dtypes(include=['int64']), 'category')
predicted_binary = naive_bayes_binary(X_train_binary.drop(['category'], axis=1), X_train_binary['category'], X_test_binary.drop(['category'], axis=1))

# Apply Naive Bayes Gaussian classification for integer and float features
# NOTE(review): y_train is the float-valued 'category' column; an earlier cell
# with the same call pattern recorded "Unknown label type" - presumably this
# line would fail the same way once the KeyError above is resolved; confirm.
predicted_gaussian = naive_bayes_gaussian(X_train.select_dtypes(include=['float64', 'int64']), y_train, X_test.select_dtypes(include=['float64', 'int64']))

# Combine predictions using the voting technique
#final_predictions = np.round(np.mean([predicted_categorical, predicted_binary, predicted_gaussian], axis=0))
final_predictions = np.round(np.mean([predicted_binary, predicted_gaussian], axis=0))
# Evaluate accuracy
accuracy = accuracy_score(y_test, final_predictions)
print("Final Accuracy:", accuracy)


KeyError: 'category'

In [None]:
from sklearn.naive_bayes import CategoricalNB, BernoulliNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Assuming df is your preprocessed DataFrame containing features and labels
# Replace 'attack_1' and 'attack_2' with the actual column names in your dataset

# Naive Bayes categorical classification
def naive_bayes_categorical(X_train, y_train, X_test):
    """Fit CategoricalNB, then return the second probability column for X_test."""
    estimator = CategoricalNB()
    estimator.fit(X_train, y_train)
    probabilities = estimator.predict_proba(X_test)
    return probabilities[:, 1]

# Naive Bayes binary classification
def naive_bayes_binary(X_train, y_train, X_test):
    """Fit BernoulliNB, then return the second probability column for X_test."""
    estimator = BernoulliNB()
    estimator.fit(X_train, y_train)
    probabilities = estimator.predict_proba(X_test)
    return probabilities[:, 1]

# Naive Bayes Gaussian classification
def naive_bayes_gaussian(X_train, y_train, X_test):
    """Fit GaussianNB, then return the second probability column for X_test."""
    estimator = GaussianNB()
    estimator.fit(X_train, y_train)
    probabilities = estimator.predict_proba(X_test)
    return probabilities[:, 1]

# Feature encoding for Naive Bayes binary classification
def feature_encoding_binary(data, target_column):
    """Return a copy of ``data`` in which ``target_column`` holds its
    LabelEncoder codes. ``data`` itself is not modified."""
    result = data.copy()
    result[target_column] = LabelEncoder().fit_transform(data[target_column])
    return result

# Split data into train and test sets
# Adjust column names as needed
# Both label columns are removed from the features; y_train / y_test are
# two-column frames holding 'category' and 'subcategory'.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['category', 'subcategory'], axis=1), df[['category', 'subcategory']], test_size=0.2, random_state=42
)

# Apply Naive Bayes categorical classification for each target column
# NOTE(review): the recorded run of this cell ended in
# "ValueError: at least one array or dtype is required". df.info() in this
# notebook lists only float64 / int64 columns in the part of its output that is
# shown, so select_dtypes(include=['object']) presumably returns no columns and
# the classifier receives an empty feature matrix - confirm.
predicted_categorical_1 = naive_bayes_categorical(X_train.select_dtypes(include=['object']), y_train['category'], X_test.select_dtypes(include=['object']))
predicted_categorical_2 = naive_bayes_categorical(X_train.select_dtypes(include=['object']), y_train['subcategory'], X_test.select_dtypes(include=['object']))

# Apply Naive Bayes binary classification for each target column
# NOTE(review): 'category' and 'subcategory' were dropped from X_train / X_test
# in the split above, so encoding those columns of the int64 subset looks up
# columns that are not there (an earlier cell with the same pattern recorded
# KeyError: 'category').
X_train_binary = feature_encoding_binary(X_train.select_dtypes(include=['int64']), 'category')
X_test_binary = feature_encoding_binary(X_test.select_dtypes(include=['int64']), 'category')
predicted_binary_1 = naive_bayes_binary(X_train_binary.drop(['category'], axis=1), X_train_binary['category'], X_test_binary.drop(['category'], axis=1))

X_train_binary = feature_encoding_binary(X_train.select_dtypes(include=['int64']), 'subcategory')
X_test_binary = feature_encoding_binary(X_test.select_dtypes(include=['int64']), 'subcategory')
predicted_binary_2 = naive_bayes_binary(X_train_binary.drop(['subcategory'], axis=1), X_train_binary['subcategory'], X_test_binary.drop(['subcategory'], axis=1))

# Apply Naive Bayes Gaussian classification for integer and float features
# NOTE(review): y_train is passed here as the whole two-column frame, unlike the
# calls above that pass a single label column - verify which target is meant.
predicted_gaussian = naive_bayes_gaussian(X_train.select_dtypes(include=['float64', 'int64']), y_train, X_test.select_dtypes(include=['float64', 'int64']))

# Combine predictions using the voting technique
# Stacks the five probability vectors as columns and rounds their row-wise mean,
# giving one 0/1 value per test row.
final_predictions = np.column_stack((predicted_categorical_1, predicted_categorical_2, predicted_binary_1, predicted_binary_2, predicted_gaussian))
final_predictions = np.round(np.mean(final_predictions, axis=1))

# Evaluate accuracy
# NOTE(review): the reference is a two-column array while final_predictions has
# one value per row - the shapes do not match; confirm the intended comparison.
accuracy = accuracy_score(np.column_stack((y_test['category'], y_test['subcategory'])), final_predictions)
print("Final Accuracy:", accuracy)


ValueError: at least one array or dtype is required

In [None]:
from sklearn.naive_bayes import CategoricalNB, BernoulliNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Assuming df is your preprocessed DataFrame containing features and labels
# Replace 'category' and 'subcategory' with the actual column names in your dataset

# Naive Bayes categorical classification
def naive_bayes_categorical(X_train, y_train, X_test):
    """Probability column 1 predicted for X_test by a CategoricalNB trained
    on (X_train, y_train)."""
    return CategoricalNB().fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Naive Bayes binary classification
def naive_bayes_binary(X_train, y_train, X_test):
    """Probability column 1 predicted for X_test by a BernoulliNB trained
    on (X_train, y_train)."""
    return BernoulliNB().fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Naive Bayes Gaussian classification
def naive_bayes_gaussian(X_train, y_train, X_test):
    """Probability column 1 predicted for X_test by a GaussianNB trained
    on (X_train, y_train)."""
    return GaussianNB().fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Feature encoding for Naive Bayes categorical classification
def feature_encoding_categorical(data, target_column):
    """Label-encode ``target_column`` on a copy of ``data`` and return the copy."""
    encoder = LabelEncoder()
    encoded_copy = data.copy()
    codes = encoder.fit_transform(encoded_copy[target_column])
    encoded_copy[target_column] = codes
    return encoded_copy

# Split data into train and test sets
# Adjust column names as needed
# NOTE(review): only 'pkSeqID' is dropped from the features here, so the two
# target columns 'category' and 'subcategory' are still part of X_train /
# X_test - verify that this is intended, since the labels then appear among
# the inputs.
X_train, X_test, y_train, y_test = train_test_split(
     df.drop(['pkSeqID'], axis=1),df[['category', 'subcategory']], test_size=0.2, random_state=42
)

# Apply Naive Bayes categorical classification for each target column
# Each target column is converted to pandas 'category' dtype unless it already
# has that dtype.
# NOTE(review): the recorded run of this cell ended in
# "ValueError: at least one array or dtype is required". df.info() in this
# notebook lists only float64 / int64 columns in the part of its output that is
# shown, so select_dtypes(include=['object']) presumably returns no columns and
# the classifier receives an empty feature matrix - confirm.
y_train_categorical_1 = y_train['category'].astype('category') if y_train['category'].dtype != 'category' else y_train['category']
y_train_categorical_2 = y_train['subcategory'].astype('category') if y_train['subcategory'].dtype != 'category' else y_train['subcategory']
predicted_categorical_1 = naive_bayes_categorical(X_train.select_dtypes(include=['object']), y_train_categorical_1, X_test.select_dtypes(include=['object']))
predicted_categorical_2 = naive_bayes_categorical(X_train.select_dtypes(include=['object']), y_train_categorical_2, X_test.select_dtypes(include=['object']))

# Apply Naive Bayes binary classification for each target column
predicted_binary_1 = naive_bayes_binary(X_train.select_dtypes(include=['int64']), y_train['category'], X_test.select_dtypes(include=['int64']))
predicted_binary_2 = naive_bayes_binary(X_train.select_dtypes(include=['int64']), y_train['subcategory'], X_test.select_dtypes(include=['int64']))

# Apply Naive Bayes Gaussian classification for integer and float features
# NOTE(review): y_train is passed here as the whole two-column frame, unlike the
# calls above that pass a single label column - verify which target is meant.
predicted_gaussian = naive_bayes_gaussian(X_train.select_dtypes(include=['float64', 'int64']), y_train, X_test.select_dtypes(include=['float64', 'int64']))

# Combine predictions using the voting technique
# Stacks the five probability vectors as columns and rounds their row-wise mean,
# giving one 0/1 value per test row.
final_predictions = np.column_stack((predicted_categorical_1, predicted_categorical_2, predicted_binary_1, predicted_binary_2, predicted_gaussian))
final_predictions = np.round(np.mean(final_predictions, axis=1))

# Evaluate accuracy
# NOTE(review): the reference is a two-column array while final_predictions has
# one value per row - the shapes do not match; confirm the intended comparison.
accuracy = accuracy_score(np.column_stack((y_test['category'], y_test['subcategory'])), final_predictions)
print("Final Accuracy:", accuracy)


ValueError: at least one array or dtype is required

In [None]:
# Features: every column except the two label columns.
# Targets: the two label columns, kept together as a two-column frame.
X = df.drop(columns=['category', 'subcategory'])
y = df.loc[:, ['category', 'subcategory']]



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.naive_bayes import CategoricalNB

# One CategoricalNB per label column, both trained on the same feature frame.
clf_category = CategoricalNB()
clf_subcategory = CategoricalNB()

# NOTE(review): the recorded run of this cell ended in
# "ValueError: Unknown label type" listing values such as 0.2. df.info() in this
# notebook reports 'category' and 'subcategory' as float64, so the targets are
# presumably treated as continuous - confirm, and fit on integer class codes.
clf_category.fit(X_train, y_train['category'])
clf_subcategory.fit(X_train, y_train['subcategory'])
# Predicted labels for the held-out rows, one array per label column.
predicted_category = clf_category.predict(X_test)
predicted_subcategory = clf_subcategory.predict(X_test)


ValueError: Unknown label type: (array([0. , 0.2, 0. , ..., 0. , 0. , 0.2]),)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of each classifier against its own label column of y_test.
# predicted_category / predicted_subcategory are not created in this cell, so
# the cell that assigns them must have run successfully first; the recorded run
# of this cell ended in "NameError: name 'predicted_category' is not defined".
accuracy_category = accuracy_score(y_test['category'], predicted_category)
accuracy_subcategory = accuracy_score(y_test['subcategory'], predicted_subcategory)

print("Accuracy (Category):", accuracy_category)
print("Accuracy (Subcategory):", accuracy_subcategory)


NameError: name 'predicted_category' is not defined