In [None]:
import pandas as pd

# Load the feature table (Data.csv) and the label table (Label.csv)
# from the CICD data directory.
data = pd.read_csv('../data/CICD/Data.csv')
labels = pd.read_csv('../data/CICD/Label.csv')


In [None]:
#Train/test split
from sklearn.model_selection import train_test_split

# Features are the whole data frame; the target is the 'Label' column.
X, y = data, labels['Label']

# Hold out 20% of the rows for testing. The split is seeded for
# reproducibility and stratified on the target so both parts keep the
# same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the shape of every piece of the split.
for caption, part in (
    ("Training set shape:", X_train),
    ("Testing set shape:", X_test),
    ("Training labels shape:", y_train),
    ("Testing labels shape:", y_test),
):
    print(caption, part.shape)

In [None]:
from sklearn.pipeline import Pipeline
import numpy as np

# Import needed additional libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from time import time

# Helper: fit a model, score it on the test split, and time both steps
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        The estimator (or pipeline) to evaluate. It is fitted in place.
    X_train, y_train
        Training features and targets passed to ``model.fit``.
    X_test, y_test
        Held-out features and targets passed to ``model.score``.

    Returns
    -------
    tuple
        ``(accuracy, train_time, inference_time)`` where ``accuracy`` is
        whatever ``model.score`` returns and both times are elapsed seconds.
    """
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() is wall-clock time and can jump when
    # the system clock is adjusted.
    from time import perf_counter

    start = perf_counter()
    model.fit(X_train, y_train)
    train_time = perf_counter() - start

    # Scoring runs prediction on the test split, so its duration is used
    # as the inference time.
    start = perf_counter()
    accuracy = model.score(X_test, y_test)
    inference_time = perf_counter() - start

    return accuracy, train_time, inference_time

# Create one pipeline per model type. Every pipeline standardises the
# features before the classifier, so the scaler is fitted on the training
# split only (inside Pipeline.fit) and the same transformation is re-applied
# to the test split when scoring. All models share the same preprocessing so
# they are compared on identical inputs.
pipelines = {
    'Decision Tree': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', DecisionTreeClassifier(max_depth=20, class_weight='balanced', random_state=42))
    ]),
    'Random Forest': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(n_estimators=100, max_depth=20, class_weight='balanced', random_state=42, n_jobs=-1))
    ]),
    'KNN': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', KNeighborsClassifier(n_neighbors=5, n_jobs=-1))
    ]),
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42, n_jobs=-1))
    ])
}

# Evaluate all models on the unscaled splits: scaling happens inside each
# pipeline, so the reported times include the preprocessing step.
results = {}
for name, pipeline in pipelines.items():
    accuracy, train_time, inference_time = evaluate_model(pipeline, X_train, X_test, y_train, y_test)
    results[name] = {
        'accuracy': accuracy,
        'training_time': train_time,
        'inference_time': inference_time
    }
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Training time: {train_time:.2f} seconds")
    print(f"Inference time: {inference_time:.2f} seconds")

# Find best model: the (name, metrics) entry with the highest accuracy
best_model = max(results.items(), key=lambda x: x[1]['accuracy'])
print(f"\nBest model: {best_model[0]}")
print(f"Accuracy: {best_model[1]['accuracy']:.4f}")
print(f"Training time: {best_model[1]['training_time']:.2f} seconds")

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

# Import additional models

# Add more models to the pipeline dictionary. Each pipeline standardises the
# features before the classifier, so the scaler is fitted on the training
# split only and re-applied to the test split when scoring.
pipelines.update({
    'Gradient Boosting': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', GradientBoostingClassifier(n_estimators=100, random_state=42))
    ]),
    'Linear SVC': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LinearSVC(max_iter=2000, random_state=42))
    ])
})

# Evaluate only the newly added models on the unscaled splits (scaling is
# done inside each pipeline) and record them alongside the earlier results.
for name in ['Gradient Boosting', 'Linear SVC']:
    accuracy, train_time, inference_time = evaluate_model(pipelines[name], X_train, X_test, y_train, y_test)
    results[name] = {
        'accuracy': accuracy,
        'training_time': train_time,
        'inference_time': inference_time
    }
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Training time: {train_time:.2f} seconds")
    print(f"Inference time: {inference_time:.2f} seconds")

# Find best model including new models: highest accuracy across all results
best_model = max(results.items(), key=lambda x: x[1]['accuracy'])
print(f"\nBest model overall: {best_model[0]}")
print(f"Accuracy: {best_model[1]['accuracy']:.4f}")
print(f"Training time: {best_model[1]['training_time']:.2f} seconds")