In [13]:
import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings(action='ignore')

In [14]:
data = pd.read_csv('uci-secom.csv')

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 592 entries, Time to Pass/Fail
dtypes: float64(590), int64(1), object(1)
memory usage: 7.1+ MB


### Preprocessing Data

In [24]:
def preprocess_input(df): 
    df = df.copy()
    
    #drop the time columns from axis 1 
    df = df.drop('Time', axis=1 )
    
    # drop the isna columns 
    missing_value_columns = df.columns[df.isna().mean()>= 0.25] 
    df = df.drop(missing_value_columns ,axis=1 )
    
    # fill the missing value in here 
    for column in df.columns:
            df[column] = df[column].fillna(df[column].mean())
            
    # replace -1 and 1 to pass and fail in here 
    df['Pass/Fail'] = df['Pass/Fail'].replace({-1:'Pass', 1:'Fail'})
    
    # remove the single variable in here 
    single_value_columns = [
            '5','13','42','49','52','69','97','141','149','178','179','186','189','190',
'191','192','193','194','226','229','230','231','232','233','234','235','236',
'237','240','241','242','243','256','257','258','259','260','261','262','263',
'264','265','266','276','284','313','314','315','322','325','326','327','328',
'329','330','364','369','370','371','372','373','374','375','378','379','380',
'381','394','395','396','397','398','399','400','401','402','403','404','414',
'422','449','450','451','458','461','462','463','464','465','466','481','498',
'501','502','503','504','505','506','507','508','509','512','513','514','515',
'528','529','530','531','532','533','534','535','536','537','538' ]
    
    df = df.drop(single_value_columns, axis=1 )
    
    # split the df into X and y in here 
    y = df['Pass/Fail']
    X = df.drop('Pass/Fail' , axis=1)
    
    # and continue use the train-test slpit in here that use 70% of the data 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state= 1)
    
    # we scale the value in here 
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    # 一开始return df 然后现在换 return X_train, X_test, y_train, y_test
    # return X, y
    return X_train, X_test, y_train, y_test 
    

In [25]:
x = preprocess_input(data)

In [18]:
x

(             0         1         2         3         4         6         7  \
 390  -0.349832 -0.784979 -0.702110 -0.837070 -0.062886  0.550891  0.079727   
 635   0.033698  0.353357  0.015921 -1.183386 -0.051551  0.845131 -0.117085   
 78    0.246294  0.059202 -0.586681  0.429161 -0.052493 -0.356155  0.263419   
 733   0.315914 -1.701055  0.354622  0.405983 -0.058342  0.791001  0.027244   
 1262 -0.448795  0.573974 -0.926520 -0.573652 -0.059723  0.447072  0.355265   
 ...        ...       ...       ...       ...       ...       ...       ...   
 715   0.760898 -0.466310  0.543717 -1.204002 -0.054619  0.995094 -0.340139   
 905  -0.920769  0.379892  0.543337 -0.652297 -0.061495 -0.202822 -0.195810   
 1096  0.659998 -0.060836  2.263423  1.479871 -0.047401 -1.023442 -0.051481   
 235   0.423734 -0.074230 -0.101032 -0.341214 -0.062369 -0.213299 -0.222052   
 1061  1.873705  1.582037  1.059743  1.401125 -0.053581 -1.198609  0.184694   
 
              8         9        10  ...       576

# Most important check the objective have missing value or not ?

In [19]:
x['Pass/Fail'].isna().sum()

TypeError: tuple indices must be integers or slices, not str

In [22]:
# let see the missing value in each columns 
x.isna().sum()

AttributeError: 'tuple' object has no attribute 'isna'

In [23]:
# this guy check the missing value if the mean is greater than 25% he will remove the columns
x.isna().mean() >= 0.25

AttributeError: 'tuple' object has no attribute 'isna'

In [24]:
#thus he highlight the cloums that are true isna
x.columns[x.isna().mean()>= 0.25] 

# we go back to drop these columns 


Index([], dtype='object')

In [54]:
# we droped the mean that are greater than 0.25 thus we continue to fil in the mean of na in the remain columns


In [58]:
# after all the na had been fill, we make the objective value to be more clear using replace function 

In [60]:
# then split df into X and y and use train test slpit 

In [6]:
X_train, X_test, y_train, y_test = preprocess_input(data) 

In [7]:
X_train 

Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
390,-0.349832,-0.784979,-0.702110,-0.837070,-0.062886,0.550891,0.079727,1.660382,0.575205,-2.431983,...,-0.229337,-0.101179,-1.421245,-0.104493,0.090875,-0.091661,-0.896661,-0.895327,-0.972313,-0.222332
635,0.033698,0.353357,0.015921,-1.183386,-0.051551,0.845131,-0.117085,-1.250633,0.022325,0.591865,...,-0.236675,0.059216,-0.063064,-0.367069,-0.378555,-0.351676,0.896361,0.747328,0.485680,-0.307801
78,0.246294,0.059202,-0.586681,0.429161,-0.052493,-0.356155,0.263419,-0.036013,-0.166304,-0.468738,...,-0.208856,0.044560,-1.155514,-0.185286,-0.065602,-0.176462,-0.488429,0.120526,0.054910,0.135286
733,0.315914,-1.701055,0.354622,0.405983,-0.058342,0.791001,0.027244,0.827113,3.098939,-0.107682,...,-0.214130,-0.328386,-1.037412,0.232143,0.122170,0.233756,0.952392,-0.884520,-0.773496,-0.787101
1262,-0.448795,0.573974,-0.926520,-0.573652,-0.059723,0.447072,0.355265,-1.193634,-1.285073,-1.168285,...,-0.195474,-0.249656,-1.509822,-0.037166,-0.159488,-0.028790,-0.632511,1.730761,1.811128,1.406800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,0.760898,-0.466310,0.543717,-1.204002,-0.054619,0.995094,-0.340139,-0.701000,-1.239541,0.275941,...,-0.219221,-0.656602,-1.096463,0.009963,-0.034307,0.017244,-1.152807,0.477155,0.386272,1.968328
905,-0.920769,0.379892,0.543337,-0.652297,-0.061495,-0.202822,-0.195810,0.057628,0.139406,0.501601,...,-0.213599,-0.647856,1.413220,-0.225682,-0.284669,-0.223534,-0.040173,-0.549505,-0.574678,-0.484059
1096,0.659998,-0.060836,2.263423,1.479871,-0.047401,-1.023442,-0.051481,-0.994138,0.399585,-0.615418,...,-0.216786,-0.198393,0.616026,-0.272811,-0.128193,-0.268303,-0.296319,-0.938555,-0.972313,-0.591078
235,0.423734,-0.074230,-0.101032,-0.341214,-0.062369,-0.213299,-0.222052,-0.521861,-2.176185,-0.175380,...,-0.267046,-0.597438,-1.568874,-0.205484,-0.003011,-0.191936,-2.001291,-0.787258,-0.773496,1.767280


In [8]:
y_train

390     Pass
635     Pass
78      Pass
733     Pass
1262    Pass
        ... 
715     Pass
905     Pass
1096    Pass
235     Fail
1061    Pass
Name: Pass/Fail, Length: 1096, dtype: object

### Need to scale the value of the training sets as they are too big diffenrent in between the number 

In [66]:
# the number like 3000 and 3 in the test set have too big gap that will make the ML model to know perfrom well


In [9]:
X_train, X_test, y_train, y_test = preprocess_input(data) 

In [10]:
# The data had been scale to same scale in between -1 and 1 
X_train

Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
390,-0.349832,-0.784979,-0.702110,-0.837070,-0.062886,0.550891,0.079727,1.660382,0.575205,-2.431983,...,-0.229337,-0.101179,-1.421245,-0.104493,0.090875,-0.091661,-0.896661,-0.895327,-0.972313,-0.222332
635,0.033698,0.353357,0.015921,-1.183386,-0.051551,0.845131,-0.117085,-1.250633,0.022325,0.591865,...,-0.236675,0.059216,-0.063064,-0.367069,-0.378555,-0.351676,0.896361,0.747328,0.485680,-0.307801
78,0.246294,0.059202,-0.586681,0.429161,-0.052493,-0.356155,0.263419,-0.036013,-0.166304,-0.468738,...,-0.208856,0.044560,-1.155514,-0.185286,-0.065602,-0.176462,-0.488429,0.120526,0.054910,0.135286
733,0.315914,-1.701055,0.354622,0.405983,-0.058342,0.791001,0.027244,0.827113,3.098939,-0.107682,...,-0.214130,-0.328386,-1.037412,0.232143,0.122170,0.233756,0.952392,-0.884520,-0.773496,-0.787101
1262,-0.448795,0.573974,-0.926520,-0.573652,-0.059723,0.447072,0.355265,-1.193634,-1.285073,-1.168285,...,-0.195474,-0.249656,-1.509822,-0.037166,-0.159488,-0.028790,-0.632511,1.730761,1.811128,1.406800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,0.760898,-0.466310,0.543717,-1.204002,-0.054619,0.995094,-0.340139,-0.701000,-1.239541,0.275941,...,-0.219221,-0.656602,-1.096463,0.009963,-0.034307,0.017244,-1.152807,0.477155,0.386272,1.968328
905,-0.920769,0.379892,0.543337,-0.652297,-0.061495,-0.202822,-0.195810,0.057628,0.139406,0.501601,...,-0.213599,-0.647856,1.413220,-0.225682,-0.284669,-0.223534,-0.040173,-0.549505,-0.574678,-0.484059
1096,0.659998,-0.060836,2.263423,1.479871,-0.047401,-1.023442,-0.051481,-0.994138,0.399585,-0.615418,...,-0.216786,-0.198393,0.616026,-0.272811,-0.128193,-0.268303,-0.296319,-0.938555,-0.972313,-0.591078
235,0.423734,-0.074230,-0.101032,-0.341214,-0.062369,-0.213299,-0.222052,-0.521861,-2.176185,-0.175380,...,-0.267046,-0.597438,-1.568874,-0.205484,-0.003011,-0.191936,-2.001291,-0.787258,-0.773496,1.767280


In [11]:
# the 5 become 0 bcs it have single value and we want to remove it 
# the reason of have single value bcs me fit the Na wor 
# thus we need to find it before the test 

# we return x and y 
X, y = preprocess_input(data)

ValueError: too many values to unpack (expected 2)

### Need to memorize it 

In [12]:
# then use pandas series to count the single value 
(pd.Series({column: len(X[column].unique()) for column in X.columns}) ==1).sum()

NameError: name 'X' is not defined

In [79]:
# there are 116 colunms that have single value 
#@1 so to know which are the 116 -right now we only know no5 is one of the 116 single value- 
pd.Series({column: len(X[column].unique()) for column in X.columns})

0      1521
1      1505
2       508
3       519
4       504
       ... 
585    1503
586     323
587     261
588     121
589     612
Length: 558, dtype: int64

In [80]:
#@2 and we take the true false of the 116
pd.Series({column: len(X[column].unique()) for column in X.columns}) ==1

0      False
1      False
2      False
3      False
4      False
       ...  
585    False
586    False
587    False
588    False
589    False
Length: 558, dtype: bool

In [82]:
### we mix @1 and @2 tgt to show the result of it 
pd.Series({column: len(X[column].unique()) for column in X.columns})[pd.Series({column: len(X[column].unique()) for column in X.columns}) ==1].index

Index(['5', '13', '42', '49', '52', '69', '97', '141', '149', '178',
       ...
       '529', '530', '531', '532', '533', '534', '535', '536', '537', '538'],
      dtype='object', length=116)

In [87]:
# before we remove it we should define it we need to define it 
single_value_columns = pd.Series({column: len(X[column].unique()) for column in X.columns})[pd.Series({column: len(X[column].unique()) for column in X.columns}) ==1].index


In [89]:
list(single_value_columns)

['5',
 '13',
 '42',
 '49',
 '52',
 '69',
 '97',
 '141',
 '149',
 '178',
 '179',
 '186',
 '189',
 '190',
 '191',
 '192',
 '193',
 '194',
 '226',
 '229',
 '230',
 '231',
 '232',
 '233',
 '234',
 '235',
 '236',
 '237',
 '240',
 '241',
 '242',
 '243',
 '256',
 '257',
 '258',
 '259',
 '260',
 '261',
 '262',
 '263',
 '264',
 '265',
 '266',
 '276',
 '284',
 '313',
 '314',
 '315',
 '322',
 '325',
 '326',
 '327',
 '328',
 '329',
 '330',
 '364',
 '369',
 '370',
 '371',
 '372',
 '373',
 '374',
 '375',
 '378',
 '379',
 '380',
 '381',
 '394',
 '395',
 '396',
 '397',
 '398',
 '399',
 '400',
 '401',
 '402',
 '403',
 '404',
 '414',
 '422',
 '449',
 '450',
 '451',
 '458',
 '461',
 '462',
 '463',
 '464',
 '465',
 '466',
 '481',
 '498',
 '501',
 '502',
 '503',
 '504',
 '505',
 '506',
 '507',
 '508',
 '509',
 '512',
 '513',
 '514',
 '515',
 '528',
 '529',
 '530',
 '531',
 '532',
 '533',
 '534',
 '535',
 '536',
 '537',
 '538']

In [91]:
# This number need to remove before apply it to the test_split function
'5','13','42','49','52','69','97','141','149','178','179','186','189','190',
'191','192','193','194','226','229','230','231','232','233','234','235','236',
'237','240','241','242','243','256','257','258','259','260','261','262','263',
'264','265','266','276','284','313','314','315','322','325','326','327','328',
'329','330','364','369','370','371','372','373','374','375','378','379','380',
'381','394','395','396','397','398','399','400','401','402','403','404','414',
'422','449','450','451','458','461','462','463','464','465','466','481','498',
'501','502','503','504','505','506','507','508','509','512','513','514','515',
'528','529','530','531','532','533','534','535','536','537','538'

('528', '529', '530', '531', '532', '533', '534', '535', '536', '537', '538')

In [26]:
# check again the single value still exsits or not  
pd.Series({column: len(X_train[column].unique()) for column in X_train.columns})[pd.Series({column: len(X_train[column].unique()) for column in X_train.columns}) ==1]

Series([], dtype: int64)

## Examming Class Imbalance 

In [28]:
y.train.value.counts()

NameError: name 'y' is not defined

# Final Steps of the model 

In [27]:
# Training a model (Imbalanced Class-
# this can look back the video in supply chain 
def evaluate_model(model, X_test, y_test):
    
    acc = model.score(X_test, y_test)
    print("Accuracy: {:.2f}%".format(acc * 100))
    
    y_pred = model.predict(X_test)
    
    cm = confusion_matrix(y_test, y_pred, labels=['PASS', 'FAIL'])
    clr = classification_report(y_test, y_pred, labels=['PASS', 'FAIL'])
    
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='g', vmin=0, cmap='Blues', cbar=False)
    plt.xticks(ticks=[0.5, 1.5], labels=["PASS", "FAIL"])
    plt.yticks(ticks=[0.5, 1.5], labels=["PASS", "FAIL"])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()
    
    print("Classification Report:\n----------------------\n", clr)

In [101]:
# using logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

evaluate_model(model, X_test, y_test)


Accuracy: 88.96%


ValueError: At least one label specified must be in y_true