__Title:__ Case Study 5: SVM & SGD  
__Authors:__ Will Butler, Robert (Reuven) Derner 
__Date:__ 10/31/23 

## Business Understanding

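Firewall traffic

A cybersecurity client has supplied firewall traffic logs and wants to classify each internet connection request.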

Data Source:

The data was provided by the client, together with a metadata dictionary that defines the terms used.

Use SVM and SGD to attempt to model and predict the class of an internet connection request.


In [None]:
# Import libraries
import pandas as pd


In [None]:
# Import data from github (next phase)
# Reads log2.csv directly from the raw GitHub URL into a DataFrame; requires network access.
url = 'https://raw.githubusercontent.com/ReuvenDerner/MSDS_QuantifyingTheWorld/main/log2.csv'
data = pd.read_csv(url, encoding = "utf-8")

## Data Quality
Verify data quality: Explain any missing values, duplicate data, and outliers. Are those mistakes? How do you deal with these problems? Give justifications for your methods.

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

### Examine the data

In [None]:
# Preview the first rows to check column names and value formats.
data.head()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the columns pandas describes by default.
data.describe()

__Missing Values__  
The null-count checks in the cells below show that the dataset contains no missing values.

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Summarise only the columns that contain at least one null:
# how many values are missing, and what share of all rows that is.
has_nulls = data.isnull().any()
null_df = data.loc[:, has_nulls].isnull().sum().to_frame().reset_index()
null_df.columns = ['Feature', 'Value']
null_df['Percent'] = (null_df['Value'] / len(data) * 100).round(2)

null_df

__Duplicate Values__  
There are 0 duplicate values in the data set. No action was needed.

## Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many rows fall into each 'Action' class (the prediction target).
ax = sns.countplot(data=data, x='Action')
ax.set_title('Distribution of Target Class')
plt.show()


### Examine any Correlations 

In [None]:
# Correlate the numeric columns only. The target column 'Action' is categorical,
# and pandas 2.x raises on non-numeric columns instead of silently dropping them.
numeric_features = data.select_dtypes(include='number')
correlation_matrix = numeric_features.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Separate the target column ('Action') from the predictor columns.
y = data['Action']
X = data.drop(columns=['Action'])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=219)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only,
# then apply that same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier (C=1.0) on the training split.
svm_classifier = SVC(kernel='linear', C=1.0, decision_function_shape='ovr')
svm_classifier.fit(X_train, y_train)



In [None]:
from sklearn.linear_model import SGDClassifier

# Logistic-regression-style linear model trained with stochastic gradient descent.
# The loss is spelled 'log_loss' (the name also used in the hyperparameter grid later
# in this notebook); the older alias 'log' is rejected by current scikit-learn.
# random_state is fixed because SGD shuffles the data, so results vary run to run without it.
sgd_classifier = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=219)
sgd_classifier.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted SVM on the held-out test split.
svm_predictions = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_predictions)

print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:", classification_report(y_test, svm_predictions), sep="\n")


In [None]:
# Score the fitted SGD classifier on the held-out test split.
sgd_predictions = sgd_classifier.predict(X_test)
sgd_accuracy = accuracy_score(y_true=y_test, y_pred=sgd_predictions)

print("SGD Classifier Accuracy:", sgd_accuracy)
print("SGD Classifier Classification Report:", classification_report(y_test, sgd_predictions), sep="\n")


In [None]:
from collections import Counter

# Tally how many test rows each model assigned to every class.
svm_class_counts = Counter(svm_predictions)
sgd_class_counts = Counter(sgd_predictions)

print("SVM Predicted Class Counts:", svm_class_counts)
print("SGD Predicted Class Counts:", sgd_class_counts)


In [None]:
# Create and plot a confusion matrix for each model.
# confusion_matrix is imported here because no earlier cell imports it.
from sklearn.metrics import confusion_matrix

# These models were fitted on the unencoded 'Action' labels, so the fitted classifier's
# classes_ attribute supplies the tick labels. Passing the same array as `labels`
# pins the row/column order of both matrices to those tick labels.
class_labels = svm_classifier.classes_
svm_cm = confusion_matrix(y_test, svm_predictions, labels=class_labels)
sgd_cm = confusion_matrix(y_test, sgd_predictions, labels=class_labels)

fig, axes = plt.subplots(1, 2, figsize=(12, 6))

for ax, matrix, panel_title in (
    (axes[0], svm_cm, 'SVM Confusion Matrix'),
    (axes[1], sgd_cm, 'SGD Confusion Matrix'),
):
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

### Variant 2

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Read the log data from the project's GitHub repository.
url = "https://raw.githubusercontent.com/ReuvenDerner/MSDS_QuantifyingTheWorld/main/log2.csv"
data = pd.read_csv(url)

# Features are every column except the target.
X = data.drop(columns=["Action"])

# Encode the categorical target as integers; label_encoder.classes_ keeps the
# original class names for reports and plots.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data["Action"])

# 80/20 train/test split with a fixed seed.
# NOTE(review): unlike the earlier cells, the features are not standardized in this
# variant - confirm whether that is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Support Vector Machine (SVM) model for multiclass classification
# Linear kernel; every other SVC setting is left at its default.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)



In [None]:
# Stochastic Gradient Descent (SGD) model for multiclass classification
# Hinge loss with up to 1000 passes; random_state is fixed so the fit is repeatable.
sgd_model = SGDClassifier(loss='hinge', max_iter=1000, random_state=42)
sgd_model.fit(X_train, y_train)



In [None]:
# Predict on the test split and score each model in turn.
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)

sgd_predictions = sgd_model.predict(X_test)
sgd_accuracy = accuracy_score(y_test, sgd_predictions)



In [None]:
# Report accuracy and per-class metrics for both models, SVM first.
for heading, accuracy, predictions in (
    ("Support Vector Machine (SVM) Model:", svm_accuracy, svm_predictions),
    ("Stochastic Gradient Descent (SGD) Model:", sgd_accuracy, sgd_predictions),
):
    print(heading)
    print(f"Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions, target_names=label_encoder.classes_))


In [None]:

# Draw the two confusion matrices side by side, one panel per model.
svm_cm = confusion_matrix(y_test, svm_predictions)
sgd_cm = confusion_matrix(y_test, sgd_predictions)

plt.figure(figsize=(12, 6))

class_names = label_encoder.classes_
for position, (matrix, panel_title) in enumerate(
    [(svm_cm, 'SVM Confusion Matrix'), (sgd_cm, 'SGD Confusion Matrix')], start=1
):
    plt.subplot(1, 2, position)
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(panel_title)

plt.tight_layout()
plt.show()


In [None]:

from sklearn.model_selection import RandomizedSearchCV

# Carve a validation set (20% of the current training rows) out of the training split.
# NOTE: X_train / y_train are overwritten here, so re-running this cell alone shrinks
# the training set again; restart and run all to get back to the intended split.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=10)

# Define your SVM and SGD parameter distributions for RandomizedSearchCV
# (the SVM distribution is currently commented out)
#svm_param_dist = {
#    'C': [0.1, 1, 10],
#    'kernel': ['linear', 'rbf', 'poly'],
#}

# Candidate SGD settings: 3 alphas x 3 iteration caps x 3 losses = 27 combinations.
sgd_param_dist = {
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [1000, 2000, 3000],
    'loss' : ['hinge','log_loss','huber'],
}

# Helper that scores a fitted classifier on a held-out validation set
def evaluate_model(clf, X_val, y_val):
    """Return the accuracy of ``clf``'s predictions for ``X_val`` against ``y_val``.

    ``clf`` must already be fitted and expose ``predict``.
    """
    return accuracy_score(y_val, clf.predict(X_val))

# Perform RandomizedSearchCV for the SVM model (currently disabled)
#svm_search = RandomizedSearchCV(SVC(), param_distributions=svm_param_dist, n_iter=9, cv=3, random_state=10)
#svm_search.fit(X_train, y_train)

# Score the best SVM configuration on the held-out validation set (currently disabled)
#best_svm_model = svm_search.best_estimator_
#best_svm_val_accuracy = evaluate_model(best_svm_model, X_val, y_val)
#print("Best SVM Model:", best_svm_model)
#print("Validation Accuracy:", best_svm_val_accuracy)

# Perform RandomizedSearchCV for the SGD model.
# Every entry in sgd_param_dist is a list, so the search space is finite. n_iter is
# capped at the number of distinct combinations, because requesting more than exist
# only makes scikit-learn warn and fall back to that number.
n_candidates = int(np.prod([len(values) for values in sgd_param_dist.values()]))
sgd_search = RandomizedSearchCV(
    SGDClassifier(random_state=10),
    param_distributions=sgd_param_dist,
    n_iter=min(50, n_candidates),
    cv=3,
    random_state=10,
)
sgd_search.fit(X_train, y_train)

# Score the best configuration on the held-out validation set. This is a plain
# hold-out check of the refit estimator, not early stopping.
best_sgd_model = sgd_search.best_estimator_
best_sgd_val_accuracy = evaluate_model(best_sgd_model, X_val, y_val)
print("Best SGD Model:", best_sgd_model)
print("Validation Accuracy:", best_sgd_val_accuracy)

