In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, accuracy_score
import seaborn as seab
#from sklearn import preprocessing
#from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
#from interpret.blackbox import LimeTabular
#from interpret import show

In [4]:
# Load the raw camera-recognition log from the Excel workbook.
data = pd.read_excel(io='eines.xlsx')
# Preview the first ten rows to check the columns.
data.head(n=10)

Unnamed: 0,_time,SkidNr,Line,Camera,Recognized,Quality1,Quality2,Quality3,Quality4,Quality_Min
0,2022-06-27T14:30:04.000+0200,1322,3,6,OK,0.88,0.93,0.91,0.92,0.88
1,2022-06-27T14:29:58.000+0200,365,3,4,OK,0.97,0.93,0.97,0.89,0.89
2,2022-06-27T14:29:30.000+0200,329,2,4,OK,0.95,0.91,0.94,0.96,0.91
3,2022-06-27T14:29:27.000+0200,104,1,2,OK,0.96,0.84,0.96,0.93,0.84
4,2022-06-27T14:29:23.000+0200,576,2,6,OK,0.98,0.91,0.94,0.95,0.91
5,2022-06-27T14:29:02.000+0200,427,2,1,OK,0.91,0.88,0.87,0.85,0.85
6,2022-06-27T14:28:55.000+0200,506,1,3,OK,0.95,0.9,0.96,0.96,0.9
7,2022-06-27T14:28:47.000+0200,766,2,2,OK,0.95,0.91,0.93,0.93,0.91
8,2022-06-27T14:28:43.000+0200,1465,1,5,OK,0.86,0.92,0.95,0.9,0.86
9,2022-06-27T14:28:33.000+0200,189,3,2,OK,0.94,0.83,0.93,0.93,0.83


In [5]:
# Parse the `_time` column to datetime once and reuse the parsed timestamps
# below instead of re-running pd.to_datetime for every derived column.
timestamps = pd.to_datetime(data['_time'])
data['_time'] = timestamps

# Split the timestamp into separate Date and Time columns and place them
# directly at positions 1 and 2 (right after the first column).
data.insert(1, 'Date', timestamps.dt.date)
data.insert(2, 'Time', timestamps.dt.time)

# Remove the combined column; `pop` returns the removed (parsed) series,
# which is what this cell displays as its output.
data.pop('_time')

0        2022-06-27 14:30:04+02:00
1        2022-06-27 14:29:58+02:00
2        2022-06-27 14:29:30+02:00
3        2022-06-27 14:29:27+02:00
4        2022-06-27 14:29:23+02:00
                    ...           
211258   2022-06-03 05:08:34+02:00
211259   2022-06-03 05:08:22+02:00
211260   2022-06-03 05:08:06+02:00
211261   2022-06-03 05:07:44+02:00
211262   2022-06-03 05:07:38+02:00
Name: _time, Length: 211263, dtype: datetime64[ns, pytz.FixedOffset(120)]

In [6]:
# Binary encoding of data['Recognized']: 'OK' -> 1, anything else -> 0.
# The dtype guard makes the cell safe to re-run: once the column is numeric,
# running the mapping again would compare 0/1 against 'OK' and silently
# turn every row into 0.
recognized = data['Recognized']
if not pd.api.types.is_numeric_dtype(recognized):
    data['Recognized'] = recognized.eq('OK').astype('int64')
data.head(10)

Unnamed: 0,Date,Time,SkidNr,Line,Camera,Recognized,Quality1,Quality2,Quality3,Quality4,Quality_Min
0,2022-06-27,14:30:04,1322,3,6,1,0.88,0.93,0.91,0.92,0.88
1,2022-06-27,14:29:58,365,3,4,1,0.97,0.93,0.97,0.89,0.89
2,2022-06-27,14:29:30,329,2,4,1,0.95,0.91,0.94,0.96,0.91
3,2022-06-27,14:29:27,104,1,2,1,0.96,0.84,0.96,0.93,0.84
4,2022-06-27,14:29:23,576,2,6,1,0.98,0.91,0.94,0.95,0.91
5,2022-06-27,14:29:02,427,2,1,1,0.91,0.88,0.87,0.85,0.85
6,2022-06-27,14:28:55,506,1,3,1,0.95,0.9,0.96,0.96,0.9
7,2022-06-27,14:28:47,766,2,2,1,0.95,0.91,0.93,0.93,0.91
8,2022-06-27,14:28:43,1465,1,5,1,0.86,0.92,0.95,0.9,0.86
9,2022-06-27,14:28:33,189,3,2,1,0.94,0.83,0.93,0.93,0.83


In [8]:
# Hold out 20% of the rows for testing; the remaining 80% form the training set.
# The feature list is spelled out explicitly and leaves out the 'Recognized' target.
features = [
    'Date', 'Time', 'SkidNr', 'Line', 'Camera',
    'Quality1', 'Quality2', 'Quality3', 'Quality4', 'Quality_Min',
]
X = data[features]
y = data[['Recognized']]
# Fixed random_state so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Isolation Forest fitted on the 'Recognized' column only.
# NOTE(review): the single input feature is the 0/1 'Recognized' flag itself,
# so the forest can only separate rows by that flag - confirm this is intended
# rather than fitting on the Quality* measurements.
random_state = np.random.RandomState(42)  # seeded generator for reproducible trees

model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.2,
    random_state=random_state,
)
model.fit(data[['Recognized']])

# Show the full hyper-parameter set the model was built with.
print(model.get_params())



{'bootstrap': False, 'contamination': 0.2, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 100, 'n_jobs': None, 'random_state': RandomState(MT19937) at 0x7F9670F52440, 'verbose': 0, 'warm_start': False}


In [26]:
# Score every row with the fitted model, using the same 'Recognized' column it was trained on.
recognized_only = data[['Recognized']]

# 'Score' holds the raw decision-function value for each row.
data['Score'] = model.decision_function(recognized_only)
# 'Anomaly_score' holds the predicted label; anomalies are assigned -1.
data['Anomaly_score'] = model.predict(recognized_only)

# Preview the first ten rows flagged as anomalies.
data.loc[data['Anomaly_score'].eq(-1)].head(10)

Unnamed: 0,Date,Time,SkidNr,Line,Camera,Recognized,Quality1,Quality2,Quality3,Quality4,Quality_Min,Score,Anomaly_score
107,2022-06-27,13:37:39,1580,1,1,0,0.79,0.79,0.64,0.94,0.64,-0.07742,-1
328,2022-06-27,12:10:41,8634,2,6,0,0.6,0.97,0.94,0.95,0.6,-0.07742,-1
3990,2022-06-28,07:08:19,1097,1,1,0,0.83,0.93,0.87,0.65,0.65,-0.07742,-1
5215,2022-06-27,22:27:00,826,2,1,0,0.92,0.92,0.85,0.54,0.54,-0.07742,-1
5980,2022-06-27,16:35:41,1485,1,1,0,0.77,0.88,0.64,0.86,0.64,-0.07742,-1
6607,2022-06-29,09:48:49,897,2,1,0,0.91,0.92,0.89,0.66,0.66,-0.07742,-1
7104,2022-06-29,06:40:24,1357,2,1,0,0.85,0.85,0.69,0.86,0.69,-0.07742,-1
7751,2022-06-29,01:20:20,1326,1,1,0,0.75,0.68,0.78,0.86,0.68,-0.07742,-1
8518,2022-06-28,19:07:02,1633,1,1,0,0.84,0.9,0.68,0.82,0.68,-0.07742,-1
8592,2022-06-28,18:35:50,1309,1,1,0,0.76,0.68,0.93,0.87,0.68,-0.07742,-1


In [30]:
# Report how many rows carry the anomaly label (-1) relative to the dataset size.
rows = len(data)
anomalies = int((data['Anomaly_score'] == -1).sum())
print("Number of anomalies in dataset", anomalies, "out of", rows, "rows.")

Number of anomalies in dataset 290 out of 211263 rows.


In [62]:
# Restrict the dataset to the rows labelled as anomalies (-1) for the breakdowns that follow.
dataA = data.loc[data['Anomaly_score'].eq(-1)]
dataA.head(n=10)

Unnamed: 0,Date,Time,SkidNr,Line,Camera,Recognized,Quality1,Quality2,Quality3,Quality4,Quality_Min,Score,Anomaly_score
107,2022-06-27,13:37:39,1580,1,1,0,0.79,0.79,0.64,0.94,0.64,-0.07742,-1
328,2022-06-27,12:10:41,8634,2,6,0,0.6,0.97,0.94,0.95,0.6,-0.07742,-1
3990,2022-06-28,07:08:19,1097,1,1,0,0.83,0.93,0.87,0.65,0.65,-0.07742,-1
5215,2022-06-27,22:27:00,826,2,1,0,0.92,0.92,0.85,0.54,0.54,-0.07742,-1
5980,2022-06-27,16:35:41,1485,1,1,0,0.77,0.88,0.64,0.86,0.64,-0.07742,-1
6607,2022-06-29,09:48:49,897,2,1,0,0.91,0.92,0.89,0.66,0.66,-0.07742,-1
7104,2022-06-29,06:40:24,1357,2,1,0,0.85,0.85,0.69,0.86,0.69,-0.07742,-1
7751,2022-06-29,01:20:20,1326,1,1,0,0.75,0.68,0.78,0.86,0.68,-0.07742,-1
8518,2022-06-28,19:07:02,1633,1,1,0,0.84,0.9,0.68,0.82,0.68,-0.07742,-1
8592,2022-06-28,18:35:50,1309,1,1,0,0.76,0.68,0.93,0.87,0.68,-0.07742,-1


In [66]:
# How many anomalies per Line, ordered by line number.
# Uses Series.value_counts, since the top-level pd.value_counts helper is deprecated.
dataA['Line'].value_counts().sort_index(ascending=True).to_frame()

Unnamed: 0,Line
1,72
2,184
3,34


In [67]:
# How many anomalies per Camera, ordered by camera number.
# Uses Series.value_counts, since the top-level pd.value_counts helper is deprecated.
dataA['Camera'].value_counts().sort_index(ascending=True).to_frame()

Unnamed: 0,Camera
1,211
2,5
3,8
4,7
5,28
6,31


In [59]:
#data[['Line', 'Anomaly_score']].value_counts().sort_index(ascending= True).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Line,Anomaly_score,Unnamed: 2_level_1
1,-1,72
1,1,61392
2,-1,184
2,1,74979
3,-1,34
3,1,74602
