In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [None]:
# Read the three CSV files (test data, training data and submission.csv),
# in that order, into their own DataFrames.
test_data, train_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Test_Data.csv',
        '/content/Train_Data.csv',
        '/content/submission.csv',
    )
)

# Quick sanity check: print the first five rows of the training frame.
print(train_data.head(n=5))


   duration protocoltype      service flag  srcbytes  dstbytes  land  \
0         0          tcp  netbios_dgm  REJ         0         0     0   
1         0          tcp         smtp   SF      1239       400     0   
2         0          tcp         http   SF       222       945     0   
3         0          tcp         http   SF       235      1380     0   
4         0          tcp    uucp_path  REJ         0         0     0   

   wrongfragment  urgent  hot  ...  dsthostsamesrvrate  dsthostdiffsrvrate  \
0              0       0    0  ...                0.06                0.06   
1              0       0    0  ...                0.45                0.04   
2              0       0    0  ...                1.00                0.00   
3              0       0    0  ...                1.00                0.00   
4              0       0    0  ...                0.01                0.08   

   dsthostsamesrcportrate  dsthostsrvdiffhostrate  dsthostserrorrate  \
0                    0.00 

In [None]:

# Name of the label column in the training data.
# This is an assumption -- change it if the CSV uses a different header.
target_column = "attack"

In [None]:
# Pull the label series (y) out of the training frame and keep every other
# column as the feature frame (X). `train_data` itself is left unmodified.
y = train_data[target_column]
X = train_data.drop(target_column, axis=1)

In [None]:
# Report, for each feature column, how many entries are missing.
print(X.isnull().sum(axis=0))

duration                  0
protocoltype              0
service                   0
flag                      0
srcbytes                  0
dstbytes                  0
land                      0
wrongfragment             0
urgent                    0
hot                       0
numfailedlogins           0
loggedin                  0
numcompromised            0
rootshell                 0
suattempted               0
numroot                   0
numfilecreations          0
numshells                 0
numaccessfiles            0
numoutboundcmds           0
ishostlogin               0
isguestlogin              0
count                     0
srvcount                  0
serrorrate                0
srvserrorrate             0
rerrorrate                0
srvrerrorrate             0
samesrvrate               0
diffsrvrate               0
srvdiffhostrate           0
dsthostcount              0
dsthostsrvcount           0
dsthostsamesrvrate        0
dsthostdiffsrvrate        0
dsthostsamesrcportra

In [None]:
# One-hot encode the three categorical features, applying the identical call
# to the training features and to the test data.
categorical_columns = ['protocoltype', 'service', 'flag']
X_encoded, test_data_encoded = (
    pd.get_dummies(frame, columns=categorical_columns)
    for frame in (X, test_data)
)

In [None]:
# Give the encoded test frame exactly the training columns, in the same order:
# columns absent from the test data are created and filled with 0, and
# columns that do not exist in the training features are dropped.
test_data_encoded = test_data_encoded.reindex(
    X_encoded.columns,
    axis='columns',
    fill_value=0,
)


In [None]:
# Standardise the features: learn the scaling statistics from the encoded
# training features, then apply that same transform to both frames.
# NOTE(review): the scaler is fit on every training row before the
# train/validation split performed later in the notebook, so the validation
# rows contribute to the scaling statistics.
scaler = StandardScaler().fit(X_encoded)
X_scaled = scaler.transform(X_encoded)
test_data_scaled = scaler.transform(test_data_encoded)


In [None]:
# Hold out 20% of the scaled training rows for validation; the fixed seed
# keeps the partition reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build a random forest with default hyper-parameters and a fixed seed, then
# fit it on the training portion of the split.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted model on the held-out validation rows using F1.
y_val_pred = model.predict(X_val)
f1 = f1_score(y_true=y_val, y_pred=y_val_pred)
print(f"F1 Score on Validation Set: {f1}")

F1 Score on Validation Set: 1.0


In [None]:
# 5-fold cross-validation of the estimator over the whole scaled training
# set, scored with F1; only the mean across folds is printed here.
cv_scores = cross_val_score(
    estimator=model,
    X=X_scaled,
    y=y,
    scoring='f1',
    cv=5,
)
print(f"Cross-validated F1 Score: {cv_scores.mean()}")

Cross-validated F1 Score: 1.0


In [None]:
# Apply the fitted model to the scaled, column-aligned test features.
test_predictions = model.predict(X=test_data_scaled)

In [None]:
# Wrap the test predictions in a one-column frame named 'attack'.
submission = pd.DataFrame(test_predictions, columns=['attack'])

In [None]:
# Write the predictions to disk without the index column, then confirm.
submission.to_csv(path_or_buf='my_submission.csv', index=False)
print("Submission file created: my_submission.csv")

Submission file created: my_submission.csv


In [None]:
# Detailed validation diagnostics for the predictions made on the held-out
# split. `confusion_matrix` and `classification_report` are imported with the
# other sklearn.metrics names in the import cell at the top of the notebook,
# so every dependency is visible in one place.

# Confusion matrix: rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Per-class precision / recall / F1 and support, plus the averaged summaries.
report = classification_report(y_val, y_val_pred)
print("Classification Report:")
print(report)


Confusion Matrix:
[[10762     0]
 [    0  6607]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10762
           1       1.00      1.00      1.00      6607

    accuracy                           1.00     17369
   macro avg       1.00      1.00      1.00     17369
weighted avg       1.00      1.00      1.00     17369



In [None]:
# Per-fold breakdown of the cross-validation run performed earlier in the
# notebook. `cv_scores` already holds the five fold scores from that run.
# Calling cross_val_score again with the same estimator (fixed random_state),
# data and cv=5 would retrain five forests only to reproduce the same
# numbers, so the stored result is reused instead.
print(f"Cross-validated F1 Scores: {cv_scores}")
print(f"Mean Cross-validated F1 Score: {cv_scores.mean()}")


Cross-validated F1 Scores: [1. 1. 1. 1. 1.]
Mean Cross-validated F1 Score: 1.0
