In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from scipy.stats import shapiro
# imputer
from sklearn.impute import SimpleImputer, KNNImputer




url1 ='https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data'
names = ["feature" + str(x) for x in range(1, 591)]
df1 = pd.read_csv(url1,sep=" ", names=names, na_values = "NaN",header=None)
df1.head()

url2 ='https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data'
df2 = pd.read_csv(url2,sep=" ",names = ["Result","Date"])

#df2.columns =['Pass/Fail','Date']
df2.head()




#Convertion of Date into Datetime from Object(String) data types
df2['Date'] = pd.to_datetime(df2['Date'])
df2.dtypes



#Joinig TWO df1 and df2 Dataframe naming SECOM
Secom = pd.concat([df1,df2],axis = 1)
print(Secom)

Secom = Secom.drop(['Date'],axis=1)
                   
# establish target and features of the manufacturing data
# set the target to the encoded manufacturing outcome column
y = Secom[['Result']]
# set the features as the rest of the dataset after dropping the features that are no
x = Secom.drop(['Result'], axis=1)

# getting the shapes of new data sets x and y
print("shape of x:", x.shape)
print("shape of y:", y.shape)

#Splitting data


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1,stratify = y)



# getting the counts
print("shape of x_train: ", x_train.shape)
print("shape of x_test: ", x_test.shape)
print("shape of y_train: ", y_train.shape)
print("shape of y_test: ", y_test.shape)




#Removing features having Missing ratio more than 50%


def percentna(dataframe, threshold):
    columns = dataframe.columns[(dataframe.isnull().sum()/len(dataframe))>threshold]
    return columns.tolist()

 

na_columns = percentna(x_train, 0.5)
len(na_columns)
x_train_dn = x_train.drop(na_columns, axis=1)
x_train_dn.shape





#Low Variance Filter
from sklearn.feature_selection import VarianceThreshold
var_thres=VarianceThreshold(threshold=0)
var_thres.fit(x_train_dn)

 

constant_columns = [column for column in x_train_dn.columns
                    if column not in x_train_dn.columns[var_thres.get_support()]]


print(len(constant_columns))

x_train_lv = x_train_dn.drop(constant_columns,axis=1)

      feature1  feature2   feature3   feature4  feature5  feature6  feature7  \
0      3030.93   2564.00  2187.7333  1411.1265    1.3602     100.0   97.6133   
1      3095.78   2465.14  2230.4222  1463.6606    0.8294     100.0  102.3433   
2      2932.61   2559.94  2186.4111  1698.0172    1.5102     100.0   95.4878   
3      2988.72   2479.90  2199.0333   909.7926    1.3204     100.0  104.2367   
4      3032.24   2502.87  2233.3667  1326.5200    1.5334     100.0  100.3967   
...        ...       ...        ...        ...       ...       ...       ...   
1562   2899.41   2464.36  2179.7333  3085.3781    1.4843     100.0   82.2467   
1563   3052.31   2522.55  2198.5667  1124.6595    0.8763     100.0   98.4689   
1564   2978.81   2379.78  2206.3000  1110.4967    0.8236     100.0   99.4122   
1565   2894.92   2532.01  2177.0333  1183.7287    1.5726     100.0   98.7978   
1566   2944.92   2450.76  2195.4444  2914.1792    1.5978     100.0   85.1011   

      feature8  feature9  feature10  ..

#### Outliers QuantileTransformer

In [2]:
def IQR_outliers(data,limit=1.5):
    numColumns = data.select_dtypes(include=np.number).columns.tolist(); # extract list of numeric columns
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3-Q1;
    outliers=((data[numColumns] < (Q1 - limit*IQR)) | (data[numColumns] > (Q3 + limit*IQR))).sum()*100/data.shape[0]
    return outliers 
from sklearn.preprocessing import QuantileTransformer
x_train_lv = x_train_lv.copy()
quantile_transformer = QuantileTransformer(output_distribution='normal', random_state= 42)
df_outliers = pd.DataFrame(quantile_transformer.fit_transform(x_train_lv),columns=x_train_lv.columns)
outliers = IQR_outliers(df_outliers)

In [3]:
outliers

feature1      0.912409
feature2      0.912409
feature3      0.912409
feature4      0.729927
feature5      0.821168
                ...   
feature586    0.821168
feature587    0.821168
feature588    0.821168
feature589    0.912409
feature590    2.372263
Length: 450, dtype: float64

#### Mean imputation

In [4]:
numColumns = df_outliers.select_dtypes(include=np.number).columns.tolist();

# initialize imputer. use strategy='mean' for mean imputation
imputer = SimpleImputer(strategy='mean')# fit the imputer on X_train. we pass only numeric columns with NA's here.
imputer.fit(df_outliers[numColumns])# transform the data using the fitted imputer
X_train_mean_impute = imputer.transform(df_outliers[numColumns])
X_test_mean_impute = imputer.transform(x_test[numColumns])# put the output into DataFrame. remember to pass columns used in fit/transform
X_train_mean_impute = pd.DataFrame(X_train_mean_impute, columns=numColumns)
X_test_mean_impute = pd.DataFrame(X_test_mean_impute, columns=numColumns)

#### feature reduction Boruta

In [5]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
boruta_selector.fit(np.array(X_train_mean_impute), np.array(y_train)) 

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	450
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	450
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	450
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	450
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	450
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	450
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	450
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	14
Rejected: 	436
Iteration: 	9 / 100
Confirmed: 	1
Tentative: 	13
Rejected: 	436
Iteration: 	10 / 100
Confirmed: 	1
Tentative: 	13
Rejected: 	436
Iteration: 	11 / 100
Confirmed: 	1
Tentative: 	13
Rejected: 	436
Iteration: 	12 / 100
Confirmed: 	1
Tentative: 	13
Rejected: 	436
Iteration: 	13 / 100
Confirmed: 	1
Tentative: 	13
Rejected: 	436
Iteration: 	14 / 100
Confirmed: 	1
Tentative: 	13
Rejected: 	436
Iteration: 	15 / 100
Confirmed: 	1
Tentative: 	13
Rejected: 	436
Iteration: 	16 / 100
Confirmed: 	4
Tentat

BorutaPy(estimator=RandomForestClassifier(max_depth=5, n_estimators=105,
                                          random_state=RandomState(MT19937) at 0x1DE9A7ECA40),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x1DE9A7ECA40, verbose=2)

In [6]:
# Ranking of Boruta

print("Ranking: ",boruta_selector.ranking_)          
print("No. of significant features: ", boruta_selector.n_features_) 

Ranking:  [128 288 195 252 207 414 406 394 147 202 116 230 153 388  36 275 239 286
 280   1   5 326 331  22  18  20 244 313 175 363 261  20 327 331 271 189
   1 256  13 117 317 246 122 363 178 124 365  68 339 284  78 324 215 370
   1 384  18  26  16   1   1 141 135 147  67   9 190 243 434 125 109 100
 268 263 212 103 259 121 422 310 387 188 132 206 417 319 237 423 411 350
 400  84 253 427  30  56  80 359 209 309 219 226 270  65 221 339  35 374
  97 158  40 311  52  41 374 217 235 376 386 396 293  43 268 385 359 256
 316 147 354 236 394 418 346 328 227 167  93 212 295  51   1 145 319  27
  33 172 349 352  38  11   8 105 232  80 209  32 299 304 258 390 342 390
 366 349 420 371 228 157 194 333  57 173 140  51 199 126  71 222 233  35
 291   6 434 344 212 434  24  82  58 319  47 204 133 185 272 321  76 127
 263 264  59 196 180 275 106 105  78  47 200 218 313 421 346 324   1 302
 223 299 283 291 286 368 409 359 405 273 336 347 164 153 161 352  69   3
 182 221  92  63 167 324 369  12   8  49 

In [7]:
selected_rf_features = pd.DataFrame({'Feature':list(X_train_mean_impute.columns),
                                       'Ranking':boruta_selector.ranking_})
selected_rf_features.sort_values(by='Ranking').head(30)

Unnamed: 0,Feature,Ranking
60,feature66,1
281,feature349,1
140,feature154,1
36,feature39,1
426,feature563,1
403,feature540,1
360,feature461,1
214,feature268,1
330,feature427,1
59,feature65,1


In [8]:
# Using the BorutaPy object to transform the features in the dataset.

X_Filtered_train = boruta_selector.transform(np.array(X_train_mean_impute))

X_Filtered_test = boruta_selector.transform(np.array(X_test_mean_impute)) 


### Accuracy check

In [9]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
#from sklearn.metrics import scorer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV

classifiers = [['RandomForest :',RandomForestClassifier()]]

for name,classifier in classifiers:
    clf=classifier.fit(X_Filtered_train,y_train)
    y_pred=classifier.predict(X_Filtered_test)
    print(f'\n {name} \n')
    print(f'Training Score for {name}  {clf.score(X_Filtered_train,y_train) * 100:.2f}' )
    print(f'Testing Score for {name} {clf.score(X_Filtered_test,y_test) * 100:.2f}' )
    print(f'Classification report  \n {classification_report(y_test,y_pred)}' )
    print(f'Confusion matrix  \n {confusion_matrix(y_test,y_pred)}' )
    print(f'ROC AUC  : {roc_auc_score(y_test,y_pred)}' )


 RandomForest : 

Training Score for RandomForest :  100.00
Testing Score for RandomForest : 91.72
Classification report  
               precision    recall  f1-score   support

          -1       0.93      0.98      0.96       440
           1       0.00      0.00      0.00        31

    accuracy                           0.92       471
   macro avg       0.47      0.49      0.48       471
weighted avg       0.87      0.92      0.89       471

Confusion matrix  
 [[432   8]
 [ 31   0]]
ROC AUC  : 0.4909090909090909
