# Broken-machine challenge - quite challenging

# Broken Machine dataset is used with following main steps:
* Fill missing values with mode
* Find correlation between features
* Undersample data set as we've got 900,000 rows of data with almost 70%-30% distribution of labels
* Use scaling (StandardScaler)
* Run RandomizedSearchCV to select the initial parameters
* Plot learning curves over multiple iterations
* Plot validation curves over multiple iterations

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import time
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [None]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, f1_score, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb

In [None]:
import warnings
warnings.filterwarnings('ignore')
from joblib import dump, load


##EDA part

In [None]:
file_path = '../input/the-broken-machine/'
model_path = '../input/the-broken-machine/'
# file_path = './the-broken-machine/'

In [None]:
# Read the feature table and the label table from the dataset folder,
# report both shapes (one per line), then preview the first feature rows.
xtrain = pd.read_csv(file_path + 'xtrain.csv')
ytrain = pd.read_csv(file_path + 'ytrain.csv')
print(xtrain.shape, ytrain.shape, sep='\n')
xtrain.head()

In [None]:
ytrain.head()

In [None]:
ytrain.info()

In [None]:
xtrain.info()

In [None]:
# Percentage of label entries equal to 1, computed per column of ytrain.
# Given this balance, an accuracy below roughly 70% would be meaningless.
positive_share = (ytrain == 1).sum() / len(ytrain) * 100
print("1 ratio is：", positive_share)

In [None]:
# Frequency of each label value. The top-level pd.value_counts helper is
# deprecated in recent pandas, so go through Series.value_counts instead.
pd.Series(ytrain.values.flatten()).value_counts()

In [None]:
# Percentage of samples labelled 1 among the samples labelled 0 or 1.
# Count the labels once (the same tally used to be recomputed three times)
# and use Series.value_counts rather than the deprecated pd.value_counts.
label_counts = pd.Series(ytrain.values.flatten()).value_counts()
label_counts[1] / (label_counts[0] + label_counts[1]) * 100

In [None]:
#check data
pd.set_option('display.max_columns', None)
xtrain.describe()

In [None]:
#Check missing data
# Percentage of missing entries for every feature.
all_data_na = (xtrain.isnull().sum() / len(xtrain)) * 100
# Keep only the features that actually have gaps, the 30 worst first.
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
f, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=all_data_na.index, y=all_data_na)
# The rotation has to be a number ('vertical'/'horizontal' are the only
# accepted strings); a numeric string such as '90' is rejected by newer
# matplotlib releases. Applied after plotting so the bar tick labels exist.
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)

In [None]:
%%time
#EDA NA processing,lgb doesn't need na processing
# Fill every missing entry with the mode of its own column.
# DataFrame.mode() returns the modes of each column sorted ascending, so its
# first row holds the same value the per-column `mode()[0]` lookup produced.
# Doing it in one vectorised call also avoids the IndexError/KeyError that the
# per-column lookup raised for a column consisting only of missing values
# (such a column is simply left unfilled).
column_modes = xtrain.mode().iloc[0]
xtrain = xtrain.fillna(column_modes)
xtrain.describe()

In [None]:
xtrain.isnull().sum()

In [None]:
xtrain.head()

In [None]:
# EDA skew
xtrain.skew(axis=0).sort_values(ascending=False)
#Found 37 numerical anomalies

In [None]:
xtrain['37'].hist()

In [None]:
# Handling No. 37: every value above 100 is replaced by 200, all other
# values are kept unchanged.
feature_37 = xtrain['37']
xtrain['37'] = feature_37.mask(feature_37 > 100, 200)

In [None]:
#EDA No. 37
from scipy import stats
from scipy.stats import norm, skew #for some statistics
def check_skewness(col):
    """Plot one ``xtrain`` column against a fitted normal and print the fit.

    Draws a seaborn distribution plot of the column with a normal curve
    fitted on top, then prints the mean (mu) and standard deviation (sigma)
    returned by ``scipy.stats.norm.fit`` for the same values.

    Parameters
    ----------
    col : str or one-element list/tuple of str
        Label of the column to inspect in the module-level ``xtrain`` frame.
        A one-element list is still accepted so older calls keep working.

    Raises
    ------
    ValueError
        If a list/tuple with a number of labels other than one is given.
    """
    # Selecting with a list would yield a DataFrame; reduce it to the single
    # label so the plot and the fit both receive a Series.
    if isinstance(col, (list, tuple)):
        if len(col) != 1:
            raise ValueError("check_skewness expects exactly one column label")
        col = col[0]
    values = xtrain[col]

    sns.distplot(values, fit=norm)
    # The Q-Q plot (stats.probplot) that used to follow was disabled because
    # it could not be displayed, so the extra figure it was drawn on is no
    # longer created (it only produced an empty plot).

    # Get the fitted parameters used by the function
    (mu, sigma) = norm.fit(values)
    print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))


check_skewness('37')

In [None]:
# check unique value
# nunique(dropna=False) counts a missing value as one distinct value, which
# matches the length of Series.unique().
for column, n_distinct in xtrain.nunique(dropna=False).items():
    print(column,": ",n_distinct)

In [None]:
#Feature distribution
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
h = .2  # step size in the mesh

# Work on the first 1000 rows only; columns at positions 33 and 36 are the
# two features put on the x and y axes.
head_rows = xtrain.iloc[0:1000]
feature_x = head_rows.iloc[:, 33]
feature_y = head_rows.iloc[:, 36]

# Pad the observed ranges by half a unit on each side.
x_min, x_max = feature_x.min() - .5, feature_x.max() + .5
y_min, y_max = feature_y.min() - .5, feature_y.max() + .5
# The mesh is only used to derive the axis limits below.
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# just plot the dataset first
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
point_colors = ytrain.iloc[0:1000, 0].tolist()  # first ytrain column picks the colour
ax = plt.subplot()
ax.scatter(feature_x, feature_y, c=point_colors, cmap=cm_bright,
           edgecolors='k')
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
# Hide the tick marks on both axes.
ax.set_xticks(())
ax.set_yticks(())

In [None]:
xtrain['1'].hist()

In [None]:
#corelation
corrmat = xtrain.corr()
corrmat

In [None]:
# For each feature, count how many correlation coefficients exceed 0.01.
(corrmat > 0.01).sum()
# No clear correlation

In [None]:
# plt.figure(figsize=(10,10))
# g = sns.heatmap(train_data[top_corr_features].corr(),annot=True,cmap="RdYlGn")

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.check_cv.html#sklearn.model_selection.check_cv
# from sklearn.model_selection import check_cv
# cv = check_cv(3, xtrain, ytrain, classifier=True)

No clear pattern in the scatter plot.

In [None]:
# Attach the ytrain column(s) to the features and draw a reproducible
# 17,000-row random sample to keep the under-sampling step tractable.
xy = xtrain.join(ytrain)
train_sample = xy.sample(n=17000, random_state=0)
# Show how the values of column 'x' are distributed in the sample. This used
# to be a bare mid-cell expression, whose result was never displayed.
print(train_sample['x'].value_counts())
# Last column is the target, everything before it is a feature.
X = train_sample.iloc[:, :-1]
y = train_sample.iloc[:,-1]
from imblearn.under_sampling import NearMiss
ns=NearMiss()
# fit_resample is the current imbalanced-learn API; the older fit_sample
# alias has been removed from the library.
X_train_ns,y_train_ns=ns.fit_resample(X,y)
X_train_ns.shape

In [None]:
# Label frequencies after under-sampling. Series.value_counts replaces the
# deprecated top-level pd.value_counts helper.
pd.Series(y_train_ns.values.flatten()).value_counts()

## Training part

In [None]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
# Leave the last 1000 resampled rows out, then split the rest 80/20.
# NOTE(review): the left-out rows are not used anywhere in the visible
# notebook; if the resampler returns rows grouped by class this slice may
# change the class balance - confirm this is intended.
X_kept = X_train_ns[0:-1000]
y_kept = y_train_ns[0:-1000]
X_train, X_test, y_train, y_test = train_test_split(
    X_kept, y_kept, test_size=0.2, random_state=3
)
# gc.collect()
print(X_train.shape, X_test.shape, y_train.shape, sep='\n')

In [None]:
%%time
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# very same transform to both splits (column names are carried over).
ss = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(ss.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(ss.transform(X_test), columns=X_test.columns)
# we have now fit and transform the data into a scaler for accurate reading and results.

In [None]:
X_train.head()

In [None]:
X_train_scaled

# Algos code start here

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import validation_curve
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
# !pip install pydotplus
# import pydotplus
from IPython.display import Image
from sklearn.model_selection import learning_curve 
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

Doing a random search as the first step. Source: https://gist.github.com/otaviomguerra/51df7a4cff28f92de7105f12a0724115

In [None]:
%%time

from scipy.stats import randint as sp_randint
from random import uniform

# Candidate architectures: two hidden layers with 5 to 9 units each.
# Previously a single pair was drawn once with sp_randint.rvs(5, 10, 1) while
# building the dict (the third positional argument is `loc`, not a size, and
# the draw was unseeded), so the search space held exactly one combination
# and RandomizedSearchCV had nothing to search over.
hidden_layer_candidates = [
    (first, second) for first in range(5, 10) for second in range(5, 10)
]

# Only the combination below is searched; the alternatives that used to be
# listed in comments (other activations, the sgd/adam solvers, a sampled
# alpha, a constant learning rate) are left out.
parameter_space = {
    'hidden_layer_sizes': hidden_layer_candidates,
    'solver': ['lbfgs'],
    'alpha': [1e-5],
    # NOTE(review): scikit-learn documents learning_rate as only used by the
    # 'sgd' solver, so this entry presumably has no effect with 'lbfgs'.
    'learning_rate': ['adaptive'],
}
mlp_cv = MLPClassifier(random_state=0,max_iter=10000)
# n_iter candidates are sampled from the space; random_state makes the
# sampled candidates reproducible between runs.
mlp_cv = RandomizedSearchCV(mlp_cv, parameter_space, n_iter=10, cv=3,
                            n_jobs=-1, random_state=0)
mlp_cv.fit(X_train_scaled,y_train)
print("Tuned Parameters: {}".format(mlp_cv.best_params_))
print("Best score is {}".format(mlp_cv.best_score_))

In [None]:
%%time
prediction = mlp_cv.predict(X_test_scaled)

In [None]:
%%time

print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

Source:https://www.geeksforgeeks.org/using-learning-curves-ml/
# Learning curve: iteration 1

In [None]:
# Fractions of the training data used for the learning curve: 1% to 100%
# in 100 evenly spaced steps.
train_sizes=np.linspace(0.01, 1.0, 100)
def plot_learning_curve(estimator):
    """Plot accuracy and fit time of ``estimator`` against training-set size.

    Runs ``learning_curve`` with 10-fold cross-validation and accuracy scoring
    on the module-level ``X_train_scaled`` / ``y_train`` for the fractions in
    the module-level ``train_sizes``. The left panel shows the mean training
    and cross-validation accuracy, the right panel the mean fit time; shaded
    bands mark one standard deviation across the folds.

    Parameters
    ----------
    estimator : estimator object
        Unfitted or fitted estimator; it is passed as-is to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, as before.
    """
    sizes, training_scores, testing_scores, fit_times, _ = learning_curve(
        estimator, X_train_scaled, y_train, cv=10, scoring='accuracy',
        n_jobs=-1, return_times=True, train_sizes=train_sizes)

    # Mean and standard deviation across the folds (axis 1) for each size.
    mean_training = np.mean(training_scores, axis=1)
    std_training = np.std(training_scores, axis=1)
    mean_testing = np.mean(testing_scores, axis=1)
    std_testing = np.std(testing_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    _, axes = plt.subplots(1, 2, figsize=(20, 5))

    # Left panel: dashed blue line is the training score, green line the
    # cross-validation score. The spreads were computed before but never
    # shown; they are drawn as translucent bands.
    axes[0].fill_between(sizes, mean_training - std_training,
                         mean_training + std_training, alpha=0.1, color="b")
    axes[0].fill_between(sizes, mean_testing - std_testing,
                         mean_testing + std_testing, alpha=0.1, color="g")
    axes[0].plot(sizes, mean_training, '--', color="b", label="Training score")
    axes[0].plot(sizes, mean_testing, color="g", label="Cross-validation score")
    axes[0].set_title("LEARNING CURVE FOR MLP Classifier")
    axes[0].set_xlabel("Training Set Size")
    axes[0].set_ylabel("accuracy")
    axes[0].legend(loc="best")

    # Right panel: how the fit time grows with the amount of training data.
    axes[1].grid()
    axes[1].fill_between(sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].plot(sizes, fit_times_mean, 'o-')
    axes[1].set_xlabel("Training Set Size")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Performance of the model")

    return plt

In [None]:
%%time
plot_learning_curve(mlp_cv.best_estimator_)

Trying validation curves for individual hyperparameters. Source: https://datascience.stackexchange.com/questions/26918/validation-curve-unlike-sklearn-sample

In [None]:
def plot_validation_curve(param, param_range,estimator, param_range_label=None):
    """Plot training and cross-validation accuracy while varying one parameter.

    Runs ``validation_curve`` with 10-fold cross-validation and accuracy
    scoring on the module-level ``X_train_scaled`` / ``y_train`` and shows the
    mean scores, with shaded bands of one standard deviation across the folds.

    Parameters
    ----------
    param : str
        Name of the estimator parameter to vary; also used as the x-axis label.
    param_range : sequence
        Values tried for ``param``.
    estimator : estimator object
        Estimator passed as-is to ``validation_curve``.
    param_range_label : sequence of str, optional
        Tick labels, one per entry of ``param_range``. When given, the scores
        are plotted at evenly spaced positions 0..n-1 instead of at the
        parameter values themselves (needed for non-numeric values such as
        tuples).

    Raises
    ------
    ValueError
        If ``param_range_label`` is given with a different length than
        ``param_range``.
    """
    # Identity check: `!= None` is evaluated element-wise for array labels
    # and would then fail inside the `if`.
    use_labels = param_range_label is not None
    if use_labels and len(param_range_label) != len(param_range):
        raise ValueError(
            "param_range_label must contain one label per entry of param_range")

    train_scores, test_scores = validation_curve(
        estimator, X_train_scaled, y_train, param_name=param, cv=10,
        param_range=param_range, n_jobs=-1, scoring="accuracy")

    # Mean and standard deviation across the folds for each parameter value.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title("Validation Curve with MLPClassifier")
    plt.xlabel(param)
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)

    # With labels, plot against positions; otherwise against the values.
    x_positions = np.arange(len(param_range)) if use_labels else param_range

    plt.plot(x_positions, train_scores_mean, label="Training score", color="r")
    plt.plot(x_positions, test_scores_mean, label="Cross-validation score", color="g")
    # The spreads were computed before but never shown.
    plt.fill_between(x_positions, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(x_positions, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    if use_labels:
        plt.xticks(x_positions, param_range_label)

    plt.legend(loc="best")
    plt.show()

In [None]:
%%time
# Validation curve for the MLP `alpha` parameter: values 0.0, 0.1, ..., 0.9
# are tried on the best estimator found by the search.
param_range = np.arange(0, 1, .1)
# param_range_label = ['1','1.1e-5','1.1e-5','1.1e-5','1.1e-5','1.1e-5','1.1e-5','1.1e-5','1.1e-5']
param_name="alpha"
plot_validation_curve(param_name,param_range,mlp_cv.best_estimator_)

Findings: no impact

In [None]:
%%time
# Iteration 1: retrain with alpha overridden to 0.8.
# NOTE: the override is written into mlp_cv.best_params_ itself, so it is
# still present when later cells read that dict again.
mlp_cv.best_params_.update(alpha=0.8)
mlp_iter1 = MLPClassifier(random_state=0).set_params(**mlp_cv.best_params_)
model = mlp_iter1.fit(X_train_scaled, y_train)  # fit() returns the estimator
prediction = model.predict(X_test_scaled)
# Evaluate on the held-out split.
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

Setting alpha to 0 gives 66% accuracy in 3.6 s, while alpha 0.8 gives 65% in 2.63 s.

# iteration 2

In [None]:
%%time
# param_range = [(10,),(20,),(30,),(40,),(50,)]
# param_range = [(100,100),(200,200),(300,300),(400,400),(500,500),(600,600)]
# param_range_label = ['(100,100)','(200,200)','(300,300)','(400,400)','(500,500)','(600,600)']
# param_range = [(10,10),(20,20),(30,30),(40,40),(50,50),(60,60),(70,70),(80,80),(90,90),(100,100)]
# param_range_label = ['(10,10)','(20,20)','(30,30)','(40,40)','(50,50)','(60,60)','(70,70)','(80,80)','(90,90)','(100,100)']
# Validation curve over two-layer architectures with equal widths, from 10 to
# 50 units per layer. The tuples cannot serve as x values, so matching text
# labels are passed and the scores are plotted at evenly spaced positions.
param_range = [(10,10),(20,20),(30,30),(40,40),(50,50)]
param_range_label = ['(10,10)','(20,20)','(30,30)','(40,40)','(50,50)']
param_name="hidden_layer_sizes"
plot_validation_curve(param_name,param_range,mlp_cv.best_estimator_,param_range_label)

In [None]:
%%time
# Iteration 2: retrain with two hidden layers of 11 units each.
# NOTE: the override is written into mlp_cv.best_params_ itself, on top of
# whatever earlier cells already stored there.
mlp_cv.best_params_.update(hidden_layer_sizes=(11, 11))
mlp_iter2 = MLPClassifier(random_state=0).set_params(**mlp_cv.best_params_)
model = mlp_iter2.fit(X_train_scaled, y_train)  # fit() returns the estimator
prediction = model.predict(X_test_scaled)
# Evaluate on the held-out split.
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

Setting the hidden layers to (11, 11) takes about 2.91 s with 66% accuracy; setting them to (50, 50) takes about 6.73 s with 65% accuracy.

In [None]:
%%time
plot_learning_curve(mlp_iter2)

# Iteration 3

In [None]:
%%time
# Validation curve for `max_iter`: values 10, 30, ..., 290 are tried on the
# iteration-2 model.
param_range = np.arange(10, 300, 20)
param_name="max_iter"
plot_validation_curve(param_name,param_range,mlp_iter2)

In [None]:
%%time
# Iteration 3: configure a fresh classifier with max_iter set to 200.
# NOTE: the override is written into mlp_cv.best_params_ itself, on top of
# whatever earlier cells already stored there.
mlp_cv.best_params_.update(max_iter=200)
mlp_iter3 = MLPClassifier(random_state=0)
mlp_iter3.set_params(**mlp_cv.best_params_)


In [None]:
%%time
model = mlp_iter3.fit(X_train_scaled, y_train)


In [None]:
%%time
prediction = mlp_iter3.predict(X_test_scaled)

In [None]:

print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

Setting max_iter to 200 takes about 2.7 s with an accuracy of just 66%; setting it to 70 takes 1.03 s with 65% accuracy. Additionally, setting it to 1000 takes 13.3 s with just 65% accuracy.

In [None]:
%%time
plot_learning_curve(mlp_iter3)