In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import seaborn as sns
from pandas_profiling import ProfileReport
import warnings
warnings.filterwarnings("ignore")
import missingno as msno
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from scipy.stats import shapiro
# imputer
from sklearn.impute import SimpleImputer, KNNImputer




# Sensor readings: space-separated file without a header row, so the 590
# columns are given synthetic names feature1 .. feature590.
url1 = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data'
names = ["feature" + str(x) for x in range(1, 591)]
df1 = pd.read_csv(url1, sep=" ", names=names, na_values="NaN", header=None)
df1.head()

# Labels file: one outcome code and one timestamp string per row.
url2 = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data'
df2 = pd.read_csv(url2, sep=" ", names=["Result", "Date"])
df2.head()

# Convert Date from object (string) to datetime.
# NOTE(review): the SECOM timestamps appear to be written day-first
# (e.g. "19/07/2008 11:55:00") - confirm against the raw file. Without
# dayfirst=True, ambiguous values such as 01/08/2008 can be read month-first.
df2['Date'] = pd.to_datetime(df2['Date'], dayfirst=True)
df2.dtypes



# Join df1 (features) and df2 (labels) column-wise into a single frame, Secom.
Secom = pd.concat([df1, df2], axis=1)
print(Secom)

# 'Date' is a column, so it must be dropped along axis=1. The original call
# was also missing its closing parenthesis, which made the cell a syntax error.
Secom = Secom.drop(['Date'], axis=1)

# establish target and features of the manufacturing data
# set the target to the manufacturing outcome column (kept as a one-column frame)
y = Secom[['Result']]
# set the features as the rest of the dataset after dropping the target column
x = Secom.drop(['Result'], axis=1)

# getting the shapes of new data sets x and y
print("shape of x:", x.shape)
print("shape of y:", y.shape)

#Splitting data


# Hold out 30% of the rows for testing, stratified on the label, with a fixed
# seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Report the size of every split.
for split_name, split in (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"shape of {split_name}: ", split.shape)




# Remove features whose missing-value ratio exceeds 50%


def percentna(dataframe, threshold):
    """Return the names of columns whose share of missing values exceeds ``threshold``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    threshold : float
        Missing-value fraction (0-1); the comparison is strict (``>``).

    Returns
    -------
    list
        Column labels, in their original order, with a missing ratio above
        ``threshold``.
    """
    missing_ratio = dataframe.isnull().sum() / len(dataframe)
    too_sparse = missing_ratio > threshold
    selected = dataframe.columns[too_sparse]
    return selected.tolist()

 

# Columns that are more than 50% missing in the training split are dropped;
# the list is derived from x_train only.
na_columns = percentna(x_train, threshold=0.5)
len(na_columns)
x_train_dn = x_train.drop(columns=na_columns)
x_train_dn.shape





# Low Variance Filter
from sklearn.feature_selection import VarianceThreshold

# threshold=0 flags only features whose variance is exactly zero (constants).
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(x_train_dn)

# get_support() is a boolean mask of the features that are KEPT; inverting it
# selects the constant columns in one pass. The original recomputed the mask
# and ran a membership test for every column, with the same result and order.
constant_columns = x_train_dn.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

x_train_lv = x_train_dn.drop(constant_columns, axis=1)

## Outlier Method 1 - By Imputation

In [None]:
def outliers(feat):
    """Cap values lying more than three standard deviations from the mean.

    Values above ``mean + 3*std`` are replaced by that upper bound and values
    below ``mean - 3*std`` by that lower bound; everything else (including
    NaN, for which both comparisons are False) is passed through unchanged.

    Parameters
    ----------
    feat : pandas.Series
        A single numeric feature.

    Returns
    -------
    numpy.ndarray
        The capped values, in the same order as ``feat``.
    """
    centre = feat.mean()
    spread = 3 * feat.std()
    upper_limit = centre + spread
    lower_limit = centre - spread

    # Apply the lower cap first, then overlay the upper cap on that result.
    floor_applied = np.where(feat < lower_limit, lower_limit, feat)
    return np.where(feat > upper_limit, upper_limit, floor_applied)

# Cap every feature of the low-variance-filtered training frame at
# mean +/- 3 std, working on a copy so x_train_lv itself is left untouched.
x_train_outliers_imputation = x_train_lv.copy()
for column in x_train_outliers_imputation.columns:
    capped = outliers(x_train_outliers_imputation[column])
    x_train_outliers_imputation[column] = capped

In [None]:
# Summary statistics of the capped training features.
x_train_outliers_imputation.describe()

## Outlier Method 2 - By QuantileTransformer

In [None]:
def IQR_outliers(data, limit=1.5):
    """Percentage of rows per numeric column that fall outside the IQR fences.

    A value is counted as an outlier when it is below ``Q1 - limit*IQR`` or
    above ``Q3 + limit*IQR`` of its own column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; non-numeric columns are ignored.
    limit : float, default 1.5
        Multiplier applied to the IQR to build the fences.

    Returns
    -------
    pandas.Series
        Outlier percentage (0-100) indexed by numeric column name. The
        denominator is the total row count, so NaN cells count as
        non-outliers.
    """
    numColumns = data.select_dtypes(include=np.number).columns.tolist()
    numeric = data[numColumns]

    # Quantiles are taken on the numeric subset only: calling quantile() on a
    # frame that still holds object columns raises on recent pandas versions.
    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)
    IQR = Q3 - Q1

    below = numeric < (Q1 - limit * IQR)
    above = numeric > (Q3 + limit * IQR)
    return (below | above).sum() * 100 / data.shape[0]
from sklearn.preprocessing import QuantileTransformer

# Rebind x_train_lv to a fresh copy of itself before fitting.
x_train_lv = x_train_lv.copy()

# Fit a quantile transformer (normal output, fixed seed) on the training
# features and rebuild a frame with the original column names. The result
# gets a fresh default index rather than x_train_lv's index.
quantile_transformer = QuantileTransformer(
    random_state=42,
    output_distribution='normal',
)
df_outliers = pd.DataFrame(
    quantile_transformer.fit_transform(x_train_lv),
    columns=x_train_lv.columns,
)

# NOTE(review): this rebinds the name `outliers`, which is also the name of
# the 3-sigma capping function defined earlier in the notebook; re-running
# the capping cell after this one will fail until that function is redefined.
outliers = IQR_outliers(df_outliers)

In [None]:
# One box plot per transformed feature on a 120 x 5 grid of subplots; the
# trailing semicolon suppresses the axes repr in the cell output.
df_outliers.plot(
    kind='box',
    subplots=True,
    layout=(120, 5),
    figsize=(15, 150),
    fontsize=10,
);

## Missing Value Imputation Method 1- Mean 

#### 1. Using Outlier Imputation dataframe

In [None]:
# Numeric columns of the capped training frame; the same list selects the
# matching columns from x_test.
numColumns = x_train_outliers_imputation.select_dtypes(include=np.number).columns.tolist()

# Mean imputation: the column means are learned from the training frame only.
imputer = SimpleImputer(strategy='mean')
imputer.fit(x_train_outliers_imputation[numColumns])

# Apply the fitted imputer to both splits and wrap the resulting arrays back
# into DataFrames, using the columns that were passed to fit/transform.
X_train_mean_impute = pd.DataFrame(
    imputer.transform(x_train_outliers_imputation[numColumns]),
    columns=numColumns,
)
X_test_mean_impute = pd.DataFrame(
    imputer.transform(x_test[numColumns]),
    columns=numColumns,
)



#### Accuracy Check for Missing Value Imputation step

In [None]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
#from sklearn.metrics import scorer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV


# Models to evaluate as [display name, estimator] pairs. The forest is seeded
# so repeated runs of the notebook report the same scores.
classifiers = [['RandomForest :', RandomForestClassifier(random_state=42)]]

# y_train / y_test are one-column DataFrames; scikit-learn expects 1-D targets.
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

for name, classifier in classifiers:
    clf = classifier.fit(X_train_mean_impute, y_train_1d)
    y_pred = clf.predict(X_test_mean_impute)
    # ROC AUC is a ranking metric and needs continuous scores, not hard
    # labels. Column 1 of predict_proba belongs to clf.classes_[1], the larger
    # label, which roc_auc_score treats as the positive class.
    y_score = clf.predict_proba(X_test_mean_impute)[:, 1]
    print(f'\n {name} \n')
    print(f'Training Score for {name}  {clf.score(X_train_mean_impute,y_train_1d) * 100:.2f}' )
    print(f'Testing Score for {name} {clf.score(X_test_mean_impute,y_test_1d) * 100:.2f}' )
    print(f'Classification report  \n {classification_report(y_test_1d,y_pred)}' )
    print(f'Confusion matrix  \n {confusion_matrix(y_test_1d,y_pred)}' )
    print(f'ROC AUC  : {roc_auc_score(y_test_1d,y_score)}' )

 




#### 2. Using Outlier Transformer dataframe

In [None]:
numColumns = df_outliers.select_dtypes(include=np.number).columns.tolist()

# df_outliers is the quantile-transformed training frame, so the test split
# has to go through the SAME fitted transformer before imputation. Otherwise
# train and test features would be on different scales.
x_test_quantile = pd.DataFrame(
    quantile_transformer.transform(x_test[numColumns]),
    columns=numColumns,
)

# Mean imputation: the column means are learned from the training frame only.
imputer = SimpleImputer(strategy='mean')
imputer.fit(df_outliers[numColumns])

# Transform both splits and put the output back into DataFrames, using the
# columns that were passed to fit/transform.
X_train_mean_impute = pd.DataFrame(
    imputer.transform(df_outliers[numColumns]),
    columns=numColumns,
)
X_test_mean_impute = pd.DataFrame(
    imputer.transform(x_test_quantile),
    columns=numColumns,
)

In [None]:
## Missing Value Imputation Method 2- KNN Imputer

In [None]:
#### 1.By Using Outlier Imputation

In [None]:
# initialize imputer
imputer = KNNImputer()

# fit the imputer on the capped training frame. pass only numeric columns.
imputer.fit(x_train_outliers_imputation[numColumns])

# transform both splits using the fitted imputer
X_train_knn_impute1 = imputer.transform(x_train_outliers_imputation[numColumns])
X_test_knn_impute1 = imputer.transform(x_test[numColumns])

# put the output into DataFrames, using the columns passed to fit/transform.
# The original re-ran imputer.transform on the test split here, which repeated
# the KNN transform and left the test result as a bare ndarray.
X_train_knn_impute1 = pd.DataFrame(X_train_knn_impute1, columns=numColumns)
X_test_knn_impute1 = pd.DataFrame(X_test_knn_impute1, columns=numColumns)

In [None]:
#### 2.By Using Outlier Transformer

In [None]:
# initialize imputer
imputer = KNNImputer()

# fit the imputer on the quantile-transformed training frame. pass only numeric columns.
imputer.fit(df_outliers[numColumns])

# The training frame was quantile-transformed, so the test split must pass
# through the same fitted transformer before it is imputed.
x_test_quantile = pd.DataFrame(
    quantile_transformer.transform(x_test[numColumns]),
    columns=numColumns,
)

# transform both splits using the fitted imputer
X_train_knn_impute2 = imputer.transform(df_outliers[numColumns])
X_test_knn_impute2 = imputer.transform(x_test_quantile)

# put the output into DataFrames, using the columns passed to fit/transform.
# The original wrapped the un-imputed df_outliers here (so the NaNs survived)
# and re-ran transform for the test split instead of wrapping its result.
X_train_knn_impute2 = pd.DataFrame(X_train_knn_impute2, columns=numColumns)
X_test_knn_impute2 = pd.DataFrame(X_test_knn_impute2, columns=numColumns)

In [None]:
## Missing Value Imputation Method 3- Hotdeck 

In [None]:
#### 1.By Using Outlier Imputation

In [None]:
# Hot deck (LOCF - last observation carried forward)

x_train_Hot_deck1 = x_train_outliers_imputation.copy()

# Work out which numeric columns contain NaN BEFORE filling. The original
# cell used num_cols_with_na before assigning it and never defined num_cols,
# so it failed on a fresh kernel.
num_cols = x_train_Hot_deck1.select_dtypes(include=np.number).columns
num_cols_with_na = num_cols[(x_train_Hot_deck1[num_cols].isnull().mean() > 0).to_numpy()]
print(f"*** numerical columns that have NaN's ({len(num_cols_with_na)}): \n{num_cols_with_na}\n\n")

# Forward-fill in row order. A NaN with no earlier observation above it has
# nothing to carry forward and stays NaN.
x_train_Hot_deck1[num_cols_with_na] = x_train_Hot_deck1[num_cols_with_na].ffill()

# Remaining share of missing values per column, worst first.
x_train_Hot_deck1.isnull().mean().sort_values(ascending=False)

In [None]:
#### 2.By Using Outlier Transformer

In [None]:
# Hot deck (LOCF - last observation carried forward)

x_train_Hot_deck2 = df_outliers.copy()

# Work out which numeric columns contain NaN BEFORE filling, from this frame
# itself, so the cell does not depend on variables left over from other cells.
num_cols = x_train_Hot_deck2.select_dtypes(include=np.number).columns
num_cols_with_na = num_cols[(x_train_Hot_deck2[num_cols].isnull().mean() > 0).to_numpy()]
print(f"*** numerical columns that have NaN's ({len(num_cols_with_na)}): \n{num_cols_with_na}\n\n")

# Forward-fill in row order. A NaN with no earlier observation above it has
# nothing to carry forward and stays NaN.
x_train_Hot_deck2[num_cols_with_na] = x_train_Hot_deck2[num_cols_with_na].ffill()

# Remaining share of missing values per column, worst first.
x_train_Hot_deck2.isnull().mean().sort_values(ascending=False)

In [None]:
## Missing Value Imputation Method 4- Latest Fill

In [None]:
#### 1.By Using Outlier Imputation

In [None]:
# latest information available: forward-fill first, then back-fill whatever
# is still missing at the top of each column. ffill()/bfill() return new
# frames, so the source frame is not modified; they replace the deprecated
# fillna(method=...) spelling.
x_train_LastFill1 = x_train_outliers_imputation.ffill().bfill()

# Ten columns with the most remaining NaNs.
x_train_LastFill1.isna().sum().sort_values(ascending=False).head(10)

In [None]:
#### 2.By Using Outlier Transformer

In [None]:
# latest information available: forward-fill first, then back-fill whatever
# is still missing at the top of each column. ffill()/bfill() return new
# frames, so df_outliers is not modified; they replace the deprecated
# fillna(method=...) spelling.
x_train_LastFill2 = df_outliers.ffill().bfill()

# Ten columns with the most remaining NaNs.
x_train_LastFill2.isna().sum().sort_values(ascending=False).head(10)

In [None]:
## Missing Value Imputation Method 5- MICE

In [None]:
#### 1.By Using Outlier Imputation

In [None]:



from impyute.imputation.cs import mice

# Run impyute's `mice` on the raw NumPy values of the outlier-capped training frame.
imputed_training1=mice(x_train_outliers_imputation.values)

# NaN propagates through a sum, so a NaN total means at least one NaN is left.
array_sum = np.sum(imputed_training1) #https://www.adamsmith.haus/python/answers/how-to-check-for-nan-elements-in-a-numpy-array-in-python
Trainset1 = np.isnan(array_sum)

Trainset1 # True if the imputed array still contains any NaN, False otherwise.

In [None]:
#### 2.By Using Outlier Transformer

In [None]:
from impyute.imputation.cs import mice

# Run impyute's `mice` on the raw NumPy values of the quantile-transformed training frame.
imputed_training2=mice(df_outliers.values)

# NaN propagates through a sum, so a NaN total means at least one NaN is left.
array_sum = np.sum(imputed_training2) #https://www.adamsmith.haus/python/answers/how-to-check-for-nan-elements-in-a-numpy-array-in-python
Trainset2 = np.isnan(array_sum)

Trainset2 # True if the imputed array still contains any NaN, False otherwise.