In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import seaborn as sns
from pandas_profiling import ProfileReport
import warnings
warnings.filterwarnings("ignore")
import missingno as msno
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from scipy.stats import shapiro
# imputer
from sklearn.impute import SimpleImputer, KNNImputer




# Load the SECOM sensor readings. The file is read without a header row
# (header=None), so synthetic column names feature1..feature590 are supplied.
url1 = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data'
names = [f"feature{x}" for x in range(1, 591)]
df1 = pd.read_csv(url1, sep=" ", names=names, na_values="NaN", header=None)
df1.head()

# Load the labels file; its two space-separated columns are named
# Result and Date here.
url2 = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data'
df2 = pd.read_csv(url2, sep=" ", names=["Result", "Date"])

df2.head()




# Convert the Date column from object (string) to datetime.
# NOTE(review): the SECOM label timestamps appear to be written day-first
# (dd/mm/yyyy hh:mm:ss) - confirm against the raw file. Without dayfirst=True
# an ambiguous value such as 01/08/2008 can be parsed month-first, giving
# dates that are inconsistent with unambiguous ones such as 19/07/2008.
df2['Date'] = pd.to_datetime(df2['Date'], dayfirst=True)
df2.dtypes



# Join the df1 (features) and df2 (labels) frames column-wise into Secom.
Secom = pd.concat([df1, df2], axis=1)
print(Secom)

# Date is removed before the target/feature split below.
Secom = Secom.drop(columns=['Date'])

# Establish target and features of the manufacturing data.
# Target: the Result column, kept as a one-column DataFrame.
y = Secom[['Result']]
# Features: everything that remains once the Result column is dropped.
x = Secom.drop(columns=['Result'])

# Report the shapes of the new data sets x and y.
print("shape of x:", x.shape)
print("shape of y:", y.shape)

# Split the data: 30% of the rows are held out for testing, with a fixed
# random_state and stratification on y.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Report the size of each partition.
print("shape of x_train: ", x_train.shape)
print("shape of x_test: ", x_test.shape)
print("shape of y_train: ", y_train.shape)
print("shape of y_test: ", y_test.shape)




#Removing features having Missing ratio more than 50%


def percentna(dataframe, threshold):
    """Return the names of columns whose missing-value ratio exceeds ``threshold``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are inspected.
    threshold : float
        Ratio (missing rows / total rows) that a column must strictly exceed
        to be reported.

    Returns
    -------
    list
        Labels of the columns above the threshold, in column order.
    """
    missing_ratio = dataframe.isnull().sum() / len(dataframe)
    above_threshold = missing_ratio > threshold
    sparse_columns = dataframe.columns[above_threshold]
    return sparse_columns.tolist()

 

# Columns that are more than half missing in the training split are dropped.
na_columns = percentna(x_train, 0.5)
len(na_columns)
x_train_dn = x_train.drop(columns=na_columns)
x_train_dn.shape





# Low variance filter: a VarianceThreshold with threshold=0 is fitted on the
# training frame to find the columns it would not retain.
from sklearn.feature_selection import VarianceThreshold
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(x_train_dn)

# get_support() is True for retained columns, so its negation selects the
# columns that are to be removed.
constant_columns = x_train_dn.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

x_train_lv = x_train_dn.drop(columns=constant_columns)

In [None]:
def outliers(feat):
    """Cap values lying more than three standard deviations from the mean.

    Values above ``mean + 3*std`` are replaced by that upper limit and values
    below ``mean - 3*std`` by that lower limit; everything else (including
    NaN, which fails both comparisons) is passed through unchanged.

    Parameters
    ----------
    feat : array-like exposing ``mean()`` and ``std()``
        The feature to cap, e.g. one DataFrame column.

    Returns
    -------
    numpy.ndarray
        The capped values, as produced by ``np.where``.
    """
    center = feat.mean()
    spread = 3 * feat.std()
    upper_limit = center + spread
    lower_limit = center - spread

    # Apply the lower cap first, then let the upper cap take precedence.
    floored = np.where(feat < lower_limit, lower_limit, feat)
    return np.where(feat > upper_limit, upper_limit, floored)

# Cap outliers column by column on a copy, so x_train_lv itself is not modified.
x_train_outliers_imputation = x_train_lv.copy()
for column in x_train_outliers_imputation.columns:
    x_train_outliers_imputation[column] = outliers(x_train_outliers_imputation[column])

In [None]:
# Hot deck imputation (LOCF - last observation carried forward).
numColumns = x_train_outliers_imputation.select_dtypes(include=np.number).columns.tolist()
x_train_Hot_deck1 = x_train_outliers_imputation.copy()
# Restrict the test split to the columns that survived the training-side
# filters. Copying x_test as-is would leave train and test with different
# feature sets, and a transformer fitted on train could not be applied to test.
x_test_Hot_deck1 = x_test[x_train_outliers_imputation.columns].copy()

# ffill() carries the previous row's value forward (the non-deprecated form of
# fillna(method='ffill')). It cannot fill gaps in the leading rows, so bfill()
# follows to fill those from the next observed value.
x_train_Hot_deck1[numColumns] = x_train_Hot_deck1[numColumns].ffill().bfill()
x_test_Hot_deck1[numColumns] = x_test_Hot_deck1[numColumns].ffill().bfill()

# Remaining missing ratio per column (a column can still hold NaN only if it
# has no observed value at all in that split).
x_train_Hot_deck1.isnull().mean().sort_values(ascending =False)
x_test_Hot_deck1.isnull().mean().sort_values(ascending =False)

In [None]:
#BORUTA
#REFERENCE: https://github.com/bnsreenu/python_for_microscopists/blob/master/198_Boruta_feature_selection_breast_cancer.py
#pip install boruta

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from boruta import BorutaPy
import xgboost as xgb

# Standardize the data for BORUTA: the scaler is fitted on the training frame
# only and then reused, unchanged, on the test frame.
sc = StandardScaler()
x_train_std = sc.fit_transform(x_train_Hot_deck1)
x_test_std = sc.transform(x_test_Hot_deck1)

# Training labels as a flat 1-D array. A dedicated name is used so that the
# full target frame `y` built before the train/test split is not overwritten.
y_train_arr = y_train.values.ravel()

# Random forest used by Boruta: all cores, class weights balanced against the
# label frequencies, shallow trees.
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# Define the Boruta feature selection method and find all relevant features.
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
feat_selector.fit(x_train_std, y_train_arr)

# Without any confirmed feature the filtered matrices would have zero columns.
if not feat_selector.support_.any():
    raise ValueError("Boruta did not confirm any feature; nothing to train the classifier on.")

# Filter the training matrix down to the selected features.
X_filtered = feat_selector.transform(x_train_std)

# Feature names must come from the frame that was actually scaled: ranking_
# and support_ have one entry per column of x_train_Hot_deck1, in that order.
# Secom.columns still holds the dropped columns and the target, so zipping
# against it mislabels the ranks.
feature_names = np.array(x_train_Hot_deck1.columns)
feature_ranks = list(zip(feature_names,
                         feat_selector.ranking_,
                         feat_selector.support_))

# print the results
for feat in feature_ranks:
    print('Feature: {:<30} Rank: {}, Keep: {}'.format(feat[0], feat[1], feat[2]))

# Recent xgboost releases reject class labels that are not 0..n_classes-1, so
# the labels are encoded for training and predictions are decoded back to the
# original label values before scoring.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_arr)

# Fit the XGBoost model on the selected features of the training data.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_filtered, y_train_encoded)

# Apply the same feature selection to the test data, then predict.
X_test_filtered = feat_selector.transform(x_test_std)
prediction_xgb = label_encoder.inverse_transform(xgb_model.predict(X_test_filtered))

# Overall accuracy, computed in the original label space.
y_test_arr = y_test.values.ravel()
print ("Accuracy = ", metrics.accuracy_score(y_test_arr, prediction_xgb))

# Confusion matrix - per-class view of the predictions; fmt='d' prints the
# cell counts as plain integers.
cm = confusion_matrix(y_test_arr, prediction_xgb)
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
#BORUTA on the unfiltered feature set (x_train / x_test straight from the split)
#REFERENCE: https://github.com/bnsreenu/python_for_microscopists/blob/master/198_Boruta_feature_selection_breast_cancer.py
#pip install boruta

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from boruta import BorutaPy
import xgboost as xgb

# The raw split still contains missing values, which StandardScaler passes
# through and Boruta's input validation rejects. Fill them with the training
# medians (statistics taken from train only), and drop columns that have no
# observed training value at all, since no median exists for them.
train_medians = x_train.median()
usable_columns = train_medians.dropna().index
x_train_raw_imputed = x_train[usable_columns].fillna(train_medians)
x_test_raw_imputed = x_test[usable_columns].fillna(train_medians)

# Standardize the data for BORUTA: the scaler is fitted on the training frame
# only and then reused, unchanged, on the test frame.
sc = StandardScaler()
x_train_std = sc.fit_transform(x_train_raw_imputed)
x_test_std = sc.transform(x_test_raw_imputed)

# Training labels as a flat 1-D array. A dedicated name is used so that the
# full target frame `y` built before the train/test split is not overwritten.
y_train_arr = y_train.values.ravel()

# Random forest used by Boruta: all cores, class weights balanced against the
# label frequencies, shallow trees.
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# Define the Boruta feature selection method and find all relevant features.
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
feat_selector.fit(x_train_std, y_train_arr)

# Without any confirmed feature the filtered matrices would have zero columns.
if not feat_selector.support_.any():
    raise ValueError("Boruta did not confirm any feature; nothing to train the classifier on.")

# Filter the training matrix down to the selected features.
X_filtered = feat_selector.transform(x_train_std)

# Take the names from the frame that was actually scaled, so that each
# ranking_/support_ entry is paired with its own column.
feature_names = np.array(x_train_raw_imputed.columns)
feature_ranks = list(zip(feature_names,
                         feat_selector.ranking_,
                         feat_selector.support_))

# print the results
for feat in feature_ranks:
    print('Feature: {:<30} Rank: {}, Keep: {}'.format(feat[0], feat[1], feat[2]))

# Recent xgboost releases reject class labels that are not 0..n_classes-1, so
# the labels are encoded for training and predictions are decoded back to the
# original label values before scoring.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_arr)

# Fit the XGBoost model on the selected features of the training data.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_filtered, y_train_encoded)

# Apply the same feature selection to the test data, then predict.
X_test_filtered = feat_selector.transform(x_test_std)
prediction_xgb = label_encoder.inverse_transform(xgb_model.predict(X_test_filtered))

# Overall accuracy, computed in the original label space.
y_test_arr = y_test.values.ravel()
print ("Accuracy = ", metrics.accuracy_score(y_test_arr, prediction_xgb))

# Confusion matrix - per-class view of the predictions; fmt='d' prints the
# cell counts as plain integers.
cm = confusion_matrix(y_test_arr, prediction_xgb)
sns.heatmap(cm, annot=True, fmt='d')