In [1]:
# Turn on the Intel(R) Extension for Scikit-learn; the call prints a confirmation
# banner when the extension is enabled.
# NOTE(review): the patch presumably has to run before any `sklearn` import for the
# patched estimators to be picked up - confirm against the sklearnex documentation.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.svm import OneClassSVM

In [6]:
# Read the preprocessed dataset, discard the columns listed below, and cast every
# remaining column to float64 - all in one chain so the cell can be re-run safely.
data = (
    pd.read_csv('3804ICT_Preprocessed_Data_With_Productivity_Difference.csv')
    .drop(
        columns=[
            'date',
            'day',
            'targeted_productivity',
            'over_time',
            'idle_time',
            'idle_men',
            'no_of_workers',
            'actual_productivity',
            'productivity_difference',
        ]
    )
    .astype('float64')
)

# Preview the first rows of the feature table
data.head()

Unnamed: 0,day_sin,day_cos,dummy_department_finishing,dummy_department_sewing,dummy_team_1,dummy_team_2,dummy_team_3,dummy_team_4,dummy_team_5,dummy_team_6,...,dummy_no_of_style_change_no,dummy_no_of_style_change_yes,dummy_incentive_no,dummy_incentive_yes,norm_no_of_workers,norm_targeted_productivity,norm_over_time,norm_idle_time,norm_idle_men,norm_productivity_difference
0,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.09877,0.718836,0.750275,-0.057449,-0.112957,-0.844982
1,-0.433884,-0.900969,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,-1.198767,0.208064,-1.077232,-0.057449,-0.112957,-0.818587
2,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,-0.185148,0.718836,-0.270979,-0.057449,-0.112957,0.030535
3,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,-0.185148,0.718836,-0.270979,-0.057449,-0.112957,0.030535
4,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.96362,0.718836,-0.790564,-0.057449,-0.112957,0.031713


In [7]:

# Every column left in `data` is treated as a feature. If the CSV carried a target
# column it would have to be dropped first, and missing values must be handled
# before the data reaches the model.

# Fit scikit-learn's one-class SVM on the full table.
# nu (between 0 and 1) is an upper bound on the fraction of margin errors and a lower
# bound on the fraction of support vectors; it is commonly set to the expected
# proportion of outliers.
clf = OneClassSVM(nu=0.03, gamma=0.001, kernel="rbf")
clf.fit(data)

# Label the same rows the model was trained on
predictions = clf.predict(data)

# A prediction of 1 is reported as 'Normal'; anything else as 'Anomaly'
results = ['Anomaly' if label != 1 else 'Normal' for label in predictions]

# Single-column frame holding the labels
results_df = pd.DataFrame({'Classification': results})

# Original features and their classification side by side
final_df = pd.concat([data, results_df], axis='columns')

final_df.head()

Unnamed: 0,day_sin,day_cos,dummy_department_finishing,dummy_department_sewing,dummy_team_1,dummy_team_2,dummy_team_3,dummy_team_4,dummy_team_5,dummy_team_6,...,dummy_no_of_style_change_yes,dummy_incentive_no,dummy_incentive_yes,norm_no_of_workers,norm_targeted_productivity,norm_over_time,norm_idle_time,norm_idle_men,norm_productivity_difference,Classification
0,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.09877,0.718836,0.750275,-0.057449,-0.112957,-0.844982,Normal
1,-0.433884,-0.900969,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-1.198767,0.208064,-1.077232,-0.057449,-0.112957,-0.818587,Normal
2,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-0.185148,0.718836,-0.270979,-0.057449,-0.112957,0.030535,Normal
3,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-0.185148,0.718836,-0.270979,-0.057449,-0.112957,0.030535,Normal
4,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.96362,0.718836,-0.790564,-0.057449,-0.112957,0.031713,Normal


In [8]:
import numpy as np
from cvxopt import solvers, matrix

class OneClassSVMFromScratch:
    """One-class SVM with an RBF kernel, trained by solving the dual QP with cvxopt.

    The dual problem handed to the solver is

        minimise   1/2 * a^T K a + q^T a      (q = -1, see ``fit``)
        subject to 0 <= a_i <= 1 / (nu * n),  sum_i a_i = 1

    and a sample ``x`` is labelled ``1`` when ``sum_i a_i k(x_i, x) >= rho``,
    otherwise ``-1``.

    Parameters
    ----------
    kernel : str
        Kernel name. Only ``"rbf"`` is implemented; ``fit`` rejects anything else.
    gamma : float
        RBF width, used as ``exp(-gamma * ||x - y||^2)``.
    nu : float
        Must satisfy ``0 < nu <= 1``; sets the upper bound ``1 / (nu * n)`` on
        every dual coefficient.

    Attributes
    ----------
    alpha : numpy.ndarray or None
        Dual coefficients of the support vectors (``None`` until fitted).
    support_vectors : numpy.ndarray or None
        Training rows whose dual coefficient is above the tolerance.
    rho : float or None
        Decision offset.
    """

    # Dual coefficients closer than this to a box bound are treated as lying on it.
    _BOUND_TOL = 1e-5

    def __init__(self, kernel="rbf", gamma=0.001, nu=0.03):
        self.kernel = kernel
        self.gamma = gamma
        self.nu = nu
        self.alpha = None
        self.support_vectors = None
        self.rho = None

    def rbf_kernel(self, X1, X2):
        """Return ``exp(-gamma * ||X1 - X2||^2)`` for a single pair of samples."""
        diff = np.asarray(X1, dtype=float) - np.asarray(X2, dtype=float)
        return np.exp(-self.gamma * np.linalg.norm(diff) ** 2)

    def _rbf_gram(self, A, B):
        """Return the matrix of RBF kernel values between rows of A and rows of B."""
        # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, evaluated for all pairs at once.
        sq_a = np.einsum("ij,ij->i", A, A)
        sq_b = np.einsum("ij,ij->i", B, B)
        sq_dist = sq_a[:, None] + sq_b[None, :] - 2.0 * (A @ B.T)
        # Round-off can push tiny distances slightly below zero.
        np.maximum(sq_dist, 0.0, out=sq_dist)
        return np.exp(-self.gamma * sq_dist)

    def fit(self, X):
        """Solve the dual QP on ``X`` and store ``alpha``, ``support_vectors``, ``rho``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Raises
        ------
        ValueError
            If ``kernel`` is not ``"rbf"``, ``nu`` is outside ``(0, 1]``, or ``X``
            is not a non-empty 2-D array.
        """
        # Previously an unknown kernel silently left K all zeros.
        if self.kernel != "rbf":
            raise ValueError(
                "Unsupported kernel %r: only 'rbf' is implemented" % (self.kernel,)
            )
        # sum(alpha) = 1 with alpha_i <= 1/(nu*n) is only feasible for nu <= 1.
        if not 0 < self.nu <= 1:
            raise ValueError("nu must satisfy 0 < nu <= 1, got %r" % (self.nu,))

        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array of shape (n_samples, n_features)")

        n = X.shape[0]
        upper = 1.0 / (self.nu * n)

        # Kernel (Gram) matrix of the training data
        K = self._rbf_gram(X, X)

        P = matrix(K, tc='d')
        # Because sum(alpha) = 1, this linear term only shifts the objective by a
        # constant and does not move the minimiser.
        q = matrix(-np.ones((n, 1)), tc='d')
        # Box constraints: -alpha <= 0 stacked on alpha <= 1 / (nu * n)
        G = matrix(np.vstack((-np.eye(n), np.eye(n))), tc='d')
        h = matrix(np.hstack((np.zeros(n), np.full(n, upper))), tc='d')
        # Equality constraint: sum(alpha) = 1
        A = matrix(np.ones(n), (1, n), tc='d')
        b = matrix(1.0, tc='d')

        solution = solvers.qp(P, q, G, h, A, b)
        alpha = np.ravel(solution['x'])

        # Support vectors: coefficients clearly above zero. The tolerance shrinks
        # with the box so that very small upper bounds still yield support vectors.
        tol = min(self._BOUND_TOL, 0.1 * upper)
        sv = alpha > tol
        self.alpha = alpha[sv]
        self.support_vectors = X[sv]

        # Decision value sum_j alpha_j k(x_j, x_i) at every support vector x_i
        scores = K[np.ix_(sv, sv)] @ self.alpha

        # rho is defined by the support vectors strictly inside the box; those stuck
        # at the upper bound lie on the outlier side and would drag rho downward.
        on_margin = self.alpha < upper - tol
        if on_margin.any():
            self.rho = float(np.mean(scores[on_margin]))
        else:
            # No interior support vector found: fall back to averaging over all of them.
            self.rho = float(np.mean(scores))

    def predict(self, X):
        """Label each row of ``X`` as ``1`` (inlier) or ``-1`` (outlier).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            Integer array with one label per row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is non-empty but not 2-D.
        """
        if self.alpha is None or self.support_vectors is None or self.rho is None:
            raise RuntimeError("OneClassSVMFromScratch.predict called before fit")

        X = np.asarray(X, dtype=float)
        if X.size == 0:
            return np.array([], dtype=int)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")

        support = np.asarray(self.support_vectors, dtype=float)
        weights = np.asarray(self.alpha, dtype=float)
        scores = self._rbf_gram(X, support) @ weights
        return np.where(scores >= self.rho, 1, -1)

# Fit the from-scratch one-class SVM on the raw NumPy values of `data`
# (the class indexes rows positionally, so it is given an array, not a DataFrame).
clf = OneClassSVMFromScratch(kernel="rbf", gamma=0.001, nu=0.03)
clf.fit(data.values)

# Label the same rows the model was trained on
predictions = clf.predict(data.values)

# Build the table from THIS model's predictions. The previous version passed
# `results` / `results_df`, so the scikit-learn labels were shown and exported here.
results_2 = ['Normal' if prediction == 1 else 'Anomaly' for prediction in predictions]
results2_df = pd.DataFrame(results_2, columns=['Classification'])
final2_df = pd.concat([data, results2_df], axis=1)
final2_df.head()

     pcost       dcost       gap    pres   dres
 0: -6.0254e-01 -3.6997e+01  3e+03  5e+01  1e-15
 1: -5.4546e-01 -3.5309e+01  8e+01  8e-01  1e-15
 2: -5.2820e-01 -8.4684e+00  1e+01  6e-02  1e-15
 3: -5.2073e-01 -1.3312e+00  8e-01  1e-15  1e-15
 4: -5.3271e-01 -1.2810e+00  7e-01  8e-16  1e-15
 5: -5.2129e-01 -9.4650e-01  4e-01  1e-15  1e-15
 6: -5.2388e-01 -8.4525e-01  3e-01  1e-15  1e-15
 7: -5.3767e-01 -8.5121e-01  3e-01  2e-16  1e-15
 8: -5.3905e-01 -8.1216e-01  3e-01  3e-15  9e-16
 9: -5.4260e-01 -6.8050e-01  1e-01  2e-15  1e-15
10: -5.4176e-01 -6.6207e-01  1e-01  6e-16  1e-15
11: -5.4394e-01 -6.5454e-01  1e-01  2e-15  1e-15
12: -5.4640e-01 -6.3825e-01  9e-02  9e-16  1e-15
13: -5.4816e-01 -6.2978e-01  8e-02  7e-16  1e-15
14: -5.4872e-01 -5.9602e-01  5e-02  3e-16  1e-15
15: -5.4974e-01 -5.9404e-01  4e-02  1e-15  1e-15
16: -5.5136e-01 -5.9060e-01  4e-02  2e-15  1e-15
17: -5.5224e-01 -5.8388e-01  3e-02  3e-16  1e-15
18: -5.5309e-01 -5.7765e-01  2e-02  7e-16  1e-15
19: -5.5380e-01 -5.73

Unnamed: 0,day_sin,day_cos,dummy_department_finishing,dummy_department_sewing,dummy_team_1,dummy_team_2,dummy_team_3,dummy_team_4,dummy_team_5,dummy_team_6,...,dummy_no_of_style_change_yes,dummy_incentive_no,dummy_incentive_yes,norm_no_of_workers,norm_targeted_productivity,norm_over_time,norm_idle_time,norm_idle_men,norm_productivity_difference,Classification
0,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.09877,0.718836,0.750275,-0.057449,-0.112957,-0.844982,Normal
1,-0.433884,-0.900969,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-1.198767,0.208064,-1.077232,-0.057449,-0.112957,-0.818587,Normal
2,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-0.185148,0.718836,-0.270979,-0.057449,-0.112957,0.030535,Normal
3,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-0.185148,0.718836,-0.270979,-0.057449,-0.112957,0.030535,Normal
4,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.96362,0.718836,-0.790564,-0.057449,-0.112957,0.031713,Normal


In [9]:
# Write both labelled tables to disk; index=False keeps the row index out of the files.
final_df.to_csv(path_or_buf='OneClassSVM.csv', index=False)
final2_df.to_csv(path_or_buf='OneClassSVMFromScratch.csv', index=False)

In [2]:
# Read the WEKA classification output from CSV into a DataFrame and preview it
import numpy as np
weka_data = pd.read_csv(filepath_or_buffer='weka_svm.csv')
weka_data.head()

Unnamed: 0,inst#,actual,predicted,error,prediction
0,1,1:1,1:1,,1
1,2,1:1,1:1,,1
2,3,1:1,1:1,,1
3,4,1:1,1:1,,1
4,5,1:1,1:1,,1


In [3]:
# List the distinct raw labels present in the `prediction` column
weka_data.loc[:, 'prediction'].unique()

array(['1', '?'], dtype=object)

In [4]:
# Map the raw labels to readable classes: '1' -> 'Normal', '?' -> 'Anomaly',
# then show the distinct values that remain.
weka_data['prediction'] = weka_data['prediction'].replace({'1': 'Normal', '?': 'Anomaly'})
weka_data['prediction'].unique()

array(['Normal', 'Anomaly'], dtype=object)

In [7]:
# Put the WEKA labels next to the original features (aligned on the row index)
# and expose them under the column name 'Classification'.
final_df = pd.concat(
    [data, weka_data['prediction']],
    axis=1,
).rename(columns={'prediction': 'Classification'})

final_df.head()

Unnamed: 0,day_sin,day_cos,dummy_department_finishing,dummy_department_sewing,dummy_team_1,dummy_team_2,dummy_team_3,dummy_team_4,dummy_team_5,dummy_team_6,...,dummy_no_of_style_change_yes,dummy_incentive_no,dummy_incentive_yes,norm_no_of_workers,norm_targeted_productivity,norm_over_time,norm_idle_time,norm_idle_men,norm_productivity_difference,Classification
0,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.09877,0.718836,0.750275,-0.057449,-0.112957,-0.844982,Normal
1,-0.433884,-0.900969,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-1.198767,0.208064,-1.077232,-0.057449,-0.112957,-0.818587,Normal
2,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-0.185148,0.718836,-0.270979,-0.057449,-0.112957,0.030535,Normal
3,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-0.185148,0.718836,-0.270979,-0.057449,-0.112957,0.030535,Normal
4,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.96362,0.718836,-0.790564,-0.057449,-0.112957,0.031713,Normal


In [8]:
# Save the WEKA-labelled table as CSV, leaving the row index out of the file
final_df.to_csv(path_or_buf='OneClassSVMWEKA.csv', index=False)