In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer



In [2]:
# Read the labelled training set from disk into a DataFrame.
train_df = pd.read_csv(filepath_or_buffer='train_data.csv')

# Display the first five rows as a quick look at the available columns.
train_df.head(n=5)




Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,...,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,lastflag,attack
0,0,tcp,netbios_dgm,REJ,0,0,0,0,0,0,...,0.06,0.06,0.0,0.0,0.0,0.0,1.0,1.0,21,1
1,0,tcp,smtp,SF,1239,400,0,0,0,0,...,0.45,0.04,0.0,0.0,0.11,0.0,0.02,0.0,18,0
2,0,tcp,http,SF,222,945,0,0,0,0,...,1.0,0.0,0.02,0.03,0.0,0.0,0.0,0.0,21,0
3,0,tcp,http,SF,235,1380,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,0
4,0,tcp,uucp_path,REJ,0,0,0,0,0,0,...,0.01,0.08,0.0,0.0,0.0,0.0,1.0,1.0,19,1


In [3]:
# Fit an encoder on the 'attack' column, then overwrite that column with the
# encoded integer labels. The fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder().fit(train_df['attack'])
train_df['attack'] = label_encoder.transform(train_df['attack'])



In [4]:
# Separate the feature matrix from the target.
# 'Unnamed: 0' (first column in the train_df.head() output) is the row index
# that was saved into the CSV, not a measured property of a connection, so it
# is excluded from the features. errors='ignore' keeps this cell working when
# the file has no such column.
X = (
    train_df
    .drop(columns='attack')
    .drop(columns='Unnamed: 0', errors='ignore')
)
y = train_df['attack']



In [5]:
# Identify the non-numeric columns that must be one-hot encoded.
# Besides plain 'object' columns this also picks up pandas 'string' and
# 'category' dtypes, so text columns are still routed to the encoder when the
# frame was loaded or cast with one of those dtypes instead of 'object'.
categorical_cols = X.select_dtypes(include=['object', 'string', 'category']).columns


In [6]:

# Column-wise preprocessing: one-hot encode the categorical columns and pass
# every other column through unchanged. handle_unknown='ignore' makes the
# encoder tolerate category values it did not see while fitting.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='passthrough',
)



In [7]:
# Full modelling pipeline, applied in this order:
#   1. 'preprocessor' - the column transformer defined above
#   2. 'imputer'      - fills missing values with the per-column median
#   3. 'classifier'   - random forest with a fixed seed
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('imputer', SimpleImputer(strategy='median')),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)



In [8]:
# Hold out 20% of the labelled data for validation (fixed seed).
# stratify=y keeps the attack / non-attack ratio the same in both parts, so
# the validation score is not skewed by an unrepresentative class mix.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



In [9]:
# Fit every pipeline step (preprocessing, imputation, classifier) on the
# training portion only.
pipeline.fit(X=X_train, y=y_train)



In [10]:
# Predict labels for the held-out validation rows with the fitted pipeline.
y_val_pred = pipeline.predict(X=X_val)



In [11]:
# Score the validation predictions against the true labels with F1.
# NOTE(review): the recorded run reports exactly 1.0 - worth confirming that
# no feature column leaks the label before trusting this number.
f1 = f1_score(y_true=y_val, y_pred=y_val_pred)
print(f'Validation F1 Score: {f1}')



Validation F1 Score: 1.0


In [12]:
# Read the test set from disk into a DataFrame.
test_df = pd.read_csv(filepath_or_buffer='test_data.csv')



In [13]:
# Predict on the test set using exactly the feature columns the pipeline was
# fitted on. Selecting by X.columns drops any extra column present in the test
# file and fails with a clear KeyError if a training feature is missing,
# instead of relying on how the column transformer treats unexpected columns.
test_predictions = pipeline.predict(test_df[X.columns])



In [14]:
# Build the submission frame. The model was trained on label-encoded targets,
# so the predictions are mapped back to the original 'attack' labels first;
# otherwise the file would contain encoder indices rather than real labels.
submission_df = pd.DataFrame(
    {'attack': label_encoder.inverse_transform(test_predictions)}
)

# Save the submission to a CSV file; index=False keeps the row index out of it.
submission_df.to_csv('submission.csv', index=False)