In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used further down are hidden as well.
warnings.filterwarnings('ignore')

In [None]:
# Build the CSV path relative to the current working directory, then load it.
wafer_csv_path = os.path.join(os.getcwd(), "Data/wafer_23012020_041211.csv")
data = pd.read_csv(wafer_csv_path)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Frequency of each distinct value in the 'Unnamed: 0' column.
data['Unnamed: 0'].value_counts()

In [None]:
# Write the data without the 'Good/Bad' and 'Unnamed: 0' columns to Data/test2.csv.
# drop() already returns a new frame, so `data` itself is left untouched.
features_only = data.drop(columns=["Good/Bad", "Unnamed: 0"])
features_only.to_csv("Data/test2.csv", index=False)

In [None]:
def get_redundant_cols(df1: pd.DataFrame, missing_tresh=0.7):
    """Return the names of columns whose share of missing values exceeds a threshold.

    Args:
        df1: Frame to inspect.
        missing_tresh: Fraction of missing values (strictly) above which a
            column is reported.

    Returns:
        list: Column labels, in the frame's column order.
    """
    missing_share = df1.isna().sum().div(df1.shape[0])
    too_sparse = missing_share > missing_tresh
    return list(missing_share[too_sparse].index)

In [None]:
# Columns excluded from the feature frame: the mostly-missing columns plus
# the 'Unnamed: 0' column and the 'Good/Bad' label.
cols_to_drop = [*get_redundant_cols(data), 'Unnamed: 0', 'Good/Bad']
x = data.drop(columns=cols_to_drop)

In [None]:
# Label column taken straight from the loaded frame.
y = data['Good/Bad']

In [None]:
# Distinct label values present in the data.
y.unique()

In [None]:
# Map the label -1 to 0 and every other value to 1.
# The mapping reads from the source column rather than from `y` itself so the
# cell is idempotent: re-applying it to its own 0/1 output would yield all 1s.
y = np.where(data['Good/Bad'] == -1, 0, 1)

In [None]:
# Display the recoded label array.
y

In [None]:
# `df` is used by the cells that follow but is not assigned in any earlier
# cell, so a fresh kernel would stop here with a NameError. Bind it to a copy
# of the loaded frame (it is accessed with the same column names as `data`).
df = data.copy()
df.info()

In [None]:
# Summary (dtype, non-null count) of the 'Unnamed: 0' column.
df['Unnamed: 0'].info()

In [None]:
# Descriptive statistics of the frame's columns.
df.describe()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of fully duplicated rows.
# Selecting the duplicated rows and calling .sum() on them adds up their cell
# values column by column; counting the True flags is what reports duplicates.
df.duplicated().sum()

In [None]:
# Rows that contain at least one missing value.
# Indexing with the whole-frame boolean mask keeps every row and blanks out
# the non-missing cells, so it only ever displays NaN; reduce the mask to one
# flag per row instead.
df[df.isnull().any(axis=1)]

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# (rows, columns) of the working frame.
df.shape

In [None]:
# Scatter of the 'Sensor-1' values against the row index, coloured by the
# 'Good/Bad' label.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df.index, df['Sensor-1'], c=df['Good/Bad'], cmap='viridis')
ax.set_xlabel('Index')
ax.set_ylabel('Value')
ax.set_title('Anomaly Detection Results')
plt.show()


In [None]:
# Box plot of the frame's columns. The frame is passed by keyword so that it
# is always bound to seaborn's `data` argument, regardless of how a given
# seaborn release treats positional arguments.
sns.boxplot(data=df)

In [None]:
# Fraction of missing values in each column.
df.isna().sum().div(len(df))

In [None]:
# Show only the columns in which more than 90% of the values are missing.
# The mask is indexed by column name, so it must be applied on the column
# axis through .loc; df[mask] treats it as a row mask and cannot align it
# with the row index.
missing_ratio = df.isna().sum().div(df.shape[0])
df.loc[:, missing_ratio > 0.9]

In [None]:
# NOTE(review): a function with this name and the same body is already
# defined earlier in this notebook; this definition rebinds the name.
def get_redundant_cols(df1: pd.DataFrame, missing_tresh=0.7):
    """Return the names of columns whose share of missing values exceeds a threshold.

    Args:
        df1: Frame to inspect.
        missing_tresh: Fraction of missing values (strictly) above which a
            column is reported.

    Returns:
        list: Column labels, in the frame's column order.
    """
    missing_share = df1.isna().sum().div(df1.shape[0])
    too_sparse = missing_share > missing_tresh
    return list(missing_share[too_sparse].index)

In [None]:
# Columns of `df` with more than the default 70% of values missing.
l = get_redundant_cols(df)
print(l)

In [None]:
# Labels of the columns (other than 'Unnamed: 0') whose standard deviation is
# exactly zero. The standard deviation is computed once and reused as the mask.
column_std = df.drop(columns=['Unnamed: 0']).std()
column_std[column_std == 0].index

In [None]:
def col_0_std(df2: pd.DataFrame):
    """Return the names of the columns of ``df2`` whose standard deviation is zero.

    The 'Unnamed: 0' column is dropped before the standard deviation is
    computed, so ``df2`` must contain it.

    Args:
        df2: Frame to inspect.

    Returns:
        list: Labels of the zero-variance columns, in column order.
    """
    # Operate on the argument: the previous body ignored `df2` and always read
    # the notebook-level `df`, so the function could not be used on any other frame.
    column_std = df2.drop(columns=['Unnamed: 0']).std()
    return list(column_std[column_std == 0].index)

In [None]:
# Zero-standard-deviation columns of `df`, and how many there are.
l2 = col_0_std(df)
print(len(l2))

In [None]:
# Everything to remove from the feature frame: mostly-missing columns,
# zero-variance columns, and the 'Unnamed: 0' column.
all_col_to_delete = [*l, *l2, 'Unnamed: 0']

In [None]:
# Display the full list of columns scheduled for removal.
all_col_to_delete

In [None]:
# Number of columns scheduled for removal.
len(all_col_to_delete)

In [None]:
# Feature frame (everything not scheduled for removal) and label frame.
# `y` is a one-column DataFrame because of the double brackets.
# NOTE(review): 'Good/Bad' is not explicitly added to all_col_to_delete, so it
# appears to remain a column of `x` as well -- confirm this is intended.
x, y = df.drop(all_col_to_delete, axis=1), df[['Good/Bad']]

In [None]:
# Preview the first rows of the feature frame.
x.head()

In [None]:
# Display the label frame.
y

In [None]:
# Shapes of the feature frame and the label frame.
x.shape, y.shape

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

In [None]:
# Preprocessing pipeline: KNN imputation (5 neighbours) followed by robust scaling.
imputer = KNNImputer(n_neighbors=5)
pipeline = Pipeline(
    steps=[('imputer', imputer),
           ('scaler', RobustScaler())]
)

# Transforming

In [None]:
# Fit the imputer + scaler pipeline on `x` and transform it in one pass.
x_trans = pipeline.fit_transform(x)

In [None]:
# Shape of the transformed feature array.
x_trans.shape

In [None]:
from imblearn.combine import SMOTETomek

# Drop the last transformed column before resampling.
# NOTE(review): this assumes the last column of `x` is the 'Good/Bad' label
# that was left in the feature frame -- confirm the column order.
X = x_trans[:, :-1]

# Resample with SMOTETomek. The seed is fixed (same value as the other seeded
# steps in this notebook) so that the resampled set, and every score derived
# from it, is reproducible across runs.
resampler = SMOTETomek(sampling_strategy="auto", random_state=42)
X_res, y_res = resampler.fit_resample(X, y)

In [None]:
# Compare the combined (features + label) shape before and after resampling.
print("Before resampling, Shape of training instances: ", np.c_[X, y].shape)
print("After resampling, Shape of training instances: ", np.c_[X_res, y_res].shape)

In [None]:
print(np.unique(y_res))

# Count each class on a flattened array. If `y_res` comes back as a DataFrame
# (the labels were passed in as a one-column DataFrame), `y_res[y_res == -1]`
# masks cells without dropping rows, so len() of it is always the total row
# count rather than the class count.
y_res_flat = np.ravel(y_res)
n_negative = int((y_res_flat == -1).sum())
n_positive = int((y_res_flat == 1).sum())
print(f"Value Counts: \n-1: {n_negative}, 1: {n_positive}")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out one third of the resampled data as the test split (fixed seed).
# NOTE(review): the split is taken after resampling, so the test split is drawn
# from the resampled data rather than from the original rows -- confirm this
# is intended.
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=1 / 3, random_state=42)

print(f"train set: {X_train.shape, y_train.shape}")
print(f"test set: {X_test.shape, y_test.shape}")

In [None]:
# Prepared training and test sets.
# These are plain aliases of the split outputs -- no copy is made, so each
# *_prep name refers to the same object as the corresponding split variable.

X_prep = X_train
y_prep = y_train
X_test_prep = X_test
y_test_prep = y_test

print(X_prep.shape, y_prep.shape)
print(X_test_prep.shape, y_test_prep.shape)

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score

# Shortlisted base models: linear and RBF support-vector classifiers, a seeded
# random forest, and an XGBoost classifier with default settings.
svc_clf = SVC(kernel='linear')
svc_rbf_clf = SVC(kernel='rbf')
random_clf = RandomForestClassifier(random_state=42)
xgb_clf = XGBClassifier()

In [None]:
## A function to display Scores

def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` (e.g. the array
            returned by ``cross_val_score``).
    """
    summary = (
        ("Scores: ", scores),
        ("Mean: ", scores.mean()),
        ("Standard Deviation: ", scores.std()),
    )
    for label, value in summary:
        print(label, value)

In [None]:
## SVC Scores
# 10-fold cross-validated ROC AUC of the linear SVC on the training split.

svc_scores = cross_val_score(svc_clf, X_prep, y_prep, scoring='roc_auc', cv=10, verbose=2)

In [None]:
# Per-fold scores, mean and spread for the linear SVC.
display_scores(svc_scores)

In [None]:
## Performance on test set using cross-validation
# NOTE(review): cross_val_predict is given only the test split, so each fold's
# model is fitted on test-split rows; nothing fitted on the training split is
# evaluated here. The AUC below is computed from the returned predictions.

# Predictions using cross-validation
svc_preds = cross_val_predict(svc_clf, X_test_prep, y_test_prep, cv=5)

# AUC score
svc_auc = roc_auc_score(y_test_prep, svc_preds)
svc_auc

In [None]:
## SVC rbf Scores
# 10-fold cross-validated ROC AUC of the RBF-kernel SVC on the training split.

svc_rbf_scores = cross_val_score(svc_rbf_clf, X_prep, y_prep, scoring='roc_auc', cv=10, verbose=2)

In [None]:
# Per-fold scores, mean and spread for the RBF-kernel SVC.
display_scores(svc_rbf_scores)

In [None]:
## Performance on test set using cross-validation
# NOTE(review): cross_val_predict is given only the test split, so each fold's
# model is fitted on test-split rows; nothing fitted on the training split is
# evaluated here. The AUC below is computed from the returned predictions.

# Predictions using cross-validation
svc_rbf_preds = cross_val_predict(svc_rbf_clf, X_test_prep, y_test_prep, cv=5)

# AUC score
svc_rbf_auc = roc_auc_score(y_test_prep, svc_rbf_preds)
svc_rbf_auc

In [None]:
## Random Forest Scores
# 10-fold cross-validated ROC AUC of the random forest on the training split.

random_clf_scores = cross_val_score(random_clf, X_prep, y_prep, scoring='roc_auc', cv=10, verbose=2)

In [None]:
# Per-fold scores, mean and spread for the random forest.
display_scores(random_clf_scores)

In [None]:
## Performance on test set using cross-validation
# NOTE(review): cross_val_predict is given only the test split, so each fold's
# model is fitted on test-split rows; nothing fitted on the training split is
# evaluated here. The AUC below is computed from the returned predictions.

# Predictions using cross-validation
random_clf_preds = cross_val_predict(random_clf, X_test_prep, y_test_prep, cv=5)

# AUC score
random_clf_auc = roc_auc_score(y_test_prep, random_clf_preds)
random_clf_auc

In [None]:
# Training labels for XGBoost with -1 recoded as 0.
# Work on a copy: a plain assignment only aliases `y_prep` (itself an alias of
# `y_train`), so the in-place recode below would otherwise silently rewrite
# those labels too.
Y_xgb_prep = y_prep.copy()
Y_xgb_prep[Y_xgb_prep == -1] = 0
Y_xgb_prep

In [None]:
# 10-fold cross-validated ROC AUC of the XGBoost classifier on the training
# split, using the recoded labels. The resulting scores are stored in `xgb`.
xgb = cross_val_score(xgb_clf, X_prep, Y_xgb_prep, scoring='roc_auc', cv=10, verbose=2)

In [None]:
# Test labels with -1 recoded as 0.
# NOTE(review): `Y_xgb_test` is the same object as `y_test_prep` (no copy is
# taken), so the assignment below also changes `y_test_prep` in place.
Y_xgb_test = y_test_prep
Y_xgb_test[Y_xgb_test == -1] = 0
Y_xgb_test

In [None]:
# Cross-validated predictions of the XGBoost classifier on the test split,
# using the labels in which -1 has been recoded as 0.
xgb_clf_preds = cross_val_predict(xgb_clf, X_test_prep, Y_xgb_test, cv=5)

# AUC score -- computed from the XGBoost predictions. The random-forest
# predictions were scored here before, so the reported value did not belong
# to this model.
xgb_clf_auc = roc_auc_score(Y_xgb_test, xgb_clf_preds)
xgb_clf_auc

In [None]:

import os
from datetime import datetime

# Log file name built from the current local time, formatted as
# day_month_year_hour_minute_second with a ".log" suffix.
LOG_FILE = f"{datetime.now().strftime('%d_%m_%Y_%H_%M_%S')}.log"


In [None]:
# <cwd>/Logs/<LOG_FILE>; note that the last path component is the log file name itself.
logs_path = os.path.join(os.getcwd(), 'Logs', LOG_FILE)

In [None]:
# NOTE(review): `logs_path` already ends with LOG_FILE, so this appends the
# file name a second time (<cwd>/Logs/<LOG_FILE>/<LOG_FILE>). The result is
# only displayed, not stored -- confirm whether that nesting is intended.
os.path.join(logs_path, LOG_FILE)