<a href="https://colab.research.google.com/github/SahithiK792/ML/blob/main/lab7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
# Open the Colab upload dialog; the result is keyed by uploaded filename.
uploaded = files.upload()

# Echo each stored filename so the name used when loading the data can be checked.
for filename in uploaded:
    print(filename)

Saving iotsim-air-quality-1 (1).csv to iotsim-air-quality-1 (1).csv
iotsim-air-quality-1 (1).csv


## Question 1 — Load the data and build the preprocessing pipeline

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Columns treated as irrelevant / empty and removed before modelling.
drop_cols = [
    "ip.tos",
    "frame.time",
    "eth.src",
    "eth.dst",
    "ip.src",
    "ip.dst",
    "ip.checksum",
    "tcp.checksum",
    "tcp.options",
]

# Read the uploaded CSV and discard the unwanted columns in one step.
df = pd.read_csv("iotsim-air-quality-1 (1).csv").drop(columns=drop_cols)

# Separate the target column from the feature matrix.
y = df["label"]
X = df.drop(columns="label")

# Replace the class names in `label` with integer codes; `le` keeps the
# mapping so predictions can be turned back into names later.
le = LabelEncoder()
y = le.fit_transform(y)

# 80/20 split, stratified so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Identify column types.
# select_dtypes("number") picks up every numeric width (int32, float32,
# unsigned ints, ...) rather than only int64/float64, so narrower numeric
# columns are not left out of the feature set.
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(include="object").columns.tolist()

# ColumnTransformer discards every column it is not handed (its default is
# remainder="drop"), so surface anything that matched neither group instead
# of losing it silently.
routed_cols = set(num_cols) | set(cat_cols)
unrouted_cols = [c for c in X.columns if c not in routed_cols]
if unrouted_cols:
    print("Warning: columns excluded from preprocessing:", unrouted_cols)

# Preprocessing: impute + scale numeric, impute + one-hot categorical
numeric_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # handle_unknown="ignore": categories first seen at predict time encode
    # as all zeros instead of raising. Dense output is required by models
    # that cannot take sparse input (e.g. GaussianNB).
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols)
])


## Question 2 — Tune Random Forest, LinearSVC and SGDClassifier with randomized search

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline


# Settings shared by every hyper-parameter search in this cell.
SEARCH_CV = 3      # cross-validation folds per sampled candidate
SEARCH_SEED = 42   # seeds the candidate sampling and the models below


def run_random_search(name, estimator, param_distributions, n_iter):
    """Tune ``estimator`` behind the shared preprocessor and print its accuracy.

    Reads ``preprocessor``, ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    from the notebook namespace (they are created in the Question 1 cell).

    Parameters
    ----------
    name : str
        Label used in the printed report lines.
    estimator : sklearn estimator
        Unfitted classifier placed in the pipeline's ``"model"`` step.
    param_distributions : dict
        Search space; keys use the ``model__<param>`` pipeline syntax.
    n_iter : int
        Number of parameter settings sampled by the search.

    Returns
    -------
    tuple
        ``(pipeline, search)``: the unfitted pipeline that was searched and
        the fitted ``RandomizedSearchCV`` object.
    """
    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", estimator),
    ])
    search = RandomizedSearchCV(
        pipeline,
        param_distributions,
        n_iter=n_iter,
        cv=SEARCH_CV,
        n_jobs=-1,
        scoring="accuracy",
        random_state=SEARCH_SEED,
    )
    search.fit(X_train, y_train)

    # With the default refit=True, score() evaluates the best candidate
    # after it has been refit on the whole training set.
    print(f"Best {name} params:", search.best_params_)
    print(f"{name} Train acc:", search.score(X_train, y_train))
    print(f"{name} Test acc:", search.score(X_test, y_test))
    return pipeline, search


# Random Forest

rf_params = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [None, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10]
}

rf, rf_search = run_random_search(
    "RandomForest",
    RandomForestClassifier(random_state=SEARCH_SEED),
    rf_params,
    n_iter=5,
)


# LinearSVC (fast linear SVM)

svm_params = {
    "model__C": [0.1, 1, 10]
}

# n_iter equals the size of the grid, so every C value is evaluated.
svm, svm_search = run_random_search(
    "LinearSVC",
    LinearSVC(max_iter=5000, random_state=SEARCH_SEED),
    svm_params,
    n_iter=3,
)


# SGDClassifier (linear SVM approximation, very fast)

sgd_params = {
    "model__alpha": [1e-4, 1e-3, 1e-2],  # regularization strength
    "model__penalty": ["l2", "l1"]
}

# Only 3 of the 6 alpha/penalty combinations are sampled here.
sgd, sgd_search = run_random_search(
    "SGDClassifier",
    SGDClassifier(loss="hinge", max_iter=1000, tol=1e-3, random_state=SEARCH_SEED),
    sgd_params,
    n_iter=3,
)


Best RandomForest params: {'model__n_estimators': 100, 'model__min_samples_split': 5, 'model__max_depth': 30}
RandomForest Train acc: 0.9908948479825617
RandomForest Test acc: 0.9645808736717828
Best LinearSVC params: {'model__C': 10}
LinearSVC Train acc: 0.7881973615494653
LinearSVC Test acc: 0.7869403323948778
Best SGDClassifier params: {'model__penalty': 'l1', 'model__alpha': 0.0001}
SGDClassifier Train acc: 0.7877205331395745
SGDClassifier Test acc: 0.7854872400326945


## Question 3 — Compare additional classifiers on accuracy, precision, recall and F1

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Same seed as the train/test split and the Question 2 models, so the
# comparison table is reproducible from a fresh kernel.
MODEL_SEED = 42

# Candidate classifiers, all evaluated behind the shared preprocessor.
models = {
    "DecisionTree": DecisionTreeClassifier(random_state=MODEL_SEED),
    "RandomForest": RandomForestClassifier(random_state=MODEL_SEED),
    "AdaBoost": AdaBoostClassifier(random_state=MODEL_SEED),
    "GradientBoosting": GradientBoostingClassifier(random_state=MODEL_SEED),
    # `use_label_encoder` is omitted: XGBoost reports it as an unused parameter.
    "XGBoost": XGBClassifier(eval_metric="mlogloss", random_state=MODEL_SEED),
    "NaiveBayes": GaussianNB(),  # deterministic, takes no seed
    "MLP": MLPClassifier(max_iter=300, random_state=MODEL_SEED)
}

# Class-weighted averages. zero_division=0 scores a class that is never
# predicted as 0 (the same value the default produces) without emitting
# the undefined-metric warning.
weighted_kwargs = {"average": "weighted", "zero_division": 0}

results = []

for name, model in models.items():
    clf = Pipeline([
        ("preprocess", preprocessor),
        ("model", model)
    ])
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)

    # Train accuracy is kept next to test accuracy to expose overfitting;
    # the remaining metrics are computed on the held-out test split only.
    results.append({
        "Model": name,
        "Train Acc": accuracy_score(y_train, y_pred_train),
        "Test Acc": accuracy_score(y_test, y_pred_test),
        "Precision": precision_score(y_test, y_pred_test, **weighted_kwargs),
        "Recall": recall_score(y_test, y_pred_test, **weighted_kwargs),
        "F1 Score": f1_score(y_test, y_pred_test, **weighted_kwargs)
    })

# One row per model, built in a single step from the collected records.
results_df = pd.DataFrame(results)
print(results_df)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              Model  Train Acc  Test Acc  Precision    Recall  F1 Score
0      DecisionTree   0.997366  0.993370   0.993383  0.993370  0.993369
1      RandomForest   0.997343  0.970938   0.970949  0.970938  0.970928
2          AdaBoost   0.757748  0.762964   0.776884  0.762964  0.762925
3  GradientBoosting   0.947799  0.943693   0.943805  0.943693  0.943649
4           XGBoost   0.954679  0.944056   0.944127  0.944056  0.944052
5        NaiveBayes   0.179901  0.180728   0.703857  0.180728  0.213836
6               MLP   0.821189  0.820271   0.833841  0.820271  0.816331
