In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score


In [6]:
# Training split: read from a CSV file in the working directory
train_data = pd.read_csv(filepath_or_buffer='train_data.csv')

# Test split: read the same way and used later for the final predictions
test_data = pd.read_csv(filepath_or_buffer='test_data.csv')



In [7]:
# Preview the first few rows of the training data
print(train_data.head())
# Column dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly: wrapping it in
# print() would append a stray "None" line to the output.
train_data.info()
# Summary statistics (describe() covers the numeric columns by default)
print(train_data.describe())

   duration protocoltype      service flag  srcbytes  dstbytes  land  \
0         0          tcp  netbios_dgm  REJ         0         0     0   
1         0          tcp         smtp   SF      1239       400     0   
2         0          tcp         http   SF       222       945     0   
3         0          tcp         http   SF       235      1380     0   
4         0          tcp    uucp_path  REJ         0         0     0   

   wrongfragment  urgent  hot  ...  dsthostsamesrvrate  dsthostdiffsrvrate  \
0              0       0    0  ...                0.06                0.06   
1              0       0    0  ...                0.45                0.04   
2              0       0    0  ...                1.00                0.00   
3              0       0    0  ...                1.00                0.00   
4              0       0    0  ...                0.01                0.08   

   dsthostsamesrcportrate  dsthostsrvdiffhostrate  dsthostserrorrate  \
0                    0.00 

In [9]:
# Separate the target column ('attack') from the feature matrix
y = train_data['attack']
X = train_data.drop(columns='attack')

# The three string-valued columns are one-hot encoded; every remaining
# feature column is treated as numeric
categorical_cols = ['protocoltype', 'service', 'flag']
numerical_cols = [name for name in X.columns if name not in categorical_cols]

# Numeric features: standardize
numerical_transformer = StandardScaler()

# Categorical features: one-hot encode; handle_unknown='ignore' keeps
# transform() from raising on categories that were not seen during fit
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group through its own transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


In [10]:
# Bundle preprocessing and the classifier into a single estimator so the
# same transformations run at both fit and predict time
model = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000)),
    ]
)


In [11]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [12]:
# Fit the preprocessing steps and the classifier on the training split
model.fit(X=X_train, y=y_train)


In [17]:
# Score the held-out validation split
y_val_pred = model.predict(X_val)

# Per-class precision / recall / F1 table
validation_report = classification_report(y_true=y_val, y_pred=y_val_pred)
print(validation_report)

# F1 for the positive class (label 1)
f1 = f1_score(y_true=y_val, y_pred=y_val_pred, pos_label=1)
print(f"F1 Score: {f1}")



              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10762
           1       1.00      1.00      1.00      6607

    accuracy                           1.00     17369
   macro avg       1.00      1.00      1.00     17369
weighted avg       1.00      1.00      1.00     17369

F1 Score: 0.9999243284146804


In [19]:
# Align the test features with the training schema on a separate frame,
# leaving the loaded test_data untouched so this cell is safe to re-run.
expected_cols = list(X.columns)
missing_cols = [name for name in expected_cols if name not in test_data.columns]
if not missing_cols:
    # Every training feature is present: select by name, which is robust to
    # a different column order or to extra columns in the test file.
    test_features = test_data[expected_cols]
elif test_data.shape[1] == len(expected_cols):
    # Headers differ but the width matches: relabel positionally.
    # NOTE(review): this assumes the test columns are in the same order as
    # the training features -- confirm against the test file.
    test_features = test_data.copy()
    test_features.columns = expected_cols
else:
    raise ValueError(
        f"test_data has {test_data.shape[1]} columns and is missing "
        f"{missing_cols}; cannot align it with the "
        f"{len(expected_cols)} training features"
    )

# Predict on the aligned test features
test_predictions = model.predict(test_features)

# Write the predictions as a single 'attack' column, without the index
output = pd.DataFrame({'attack': test_predictions})
output.to_csv('submission.csv', index=False)
