# Broken Machine challenge - quite challenging

# The Broken Machine dataset is processed with the following main steps:
* Fill missing values with the mode
* Check the correlation between features
* Undersample the data set, as we've got 900,000 rows of data with an almost 70%-30% distribution of labels
* Apply scaling (StandardScaler)
* Run RandomizedSearchCV to select the initial parameters
* Plot learning curves over multiple iterations
* Plot validation curves over multiple iterations

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import time
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [None]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, f1_score, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb

In [None]:
import warnings
warnings.filterwarnings('ignore')
from joblib import dump, load


##EDA part

In [None]:
file_path = '../input/the-broken-machine/'
model_path = '../input/the-broken-machine/'
# file_path = './the-broken-machine/'

In [None]:
# Load the feature table and the label table from the input directory,
# report both shapes, and preview the first feature rows.
xtrain = pd.read_csv(f"{file_path}xtrain.csv")
ytrain = pd.read_csv(f"{file_path}ytrain.csv")
print(xtrain.shape, ytrain.shape, sep="\n")
xtrain.head()

In [None]:
ytrain.head()

In [None]:
ytrain.info()

In [None]:
xtrain.info()

In [None]:
print("1 ratio is：",ytrain[ytrain==1].count()/len(ytrain)*100)
# With roughly a 70/30 label split, an accuracy below ~70% is meaningless (no better than always predicting the majority class)

In [None]:
pd.value_counts(ytrain.values.flatten())

In [None]:
# Percentage of label-1 rows among the rows labelled 0 or 1.
# The labels are tallied once and reused (the original recounted them three
# times), and Series.value_counts replaces the deprecated pd.value_counts.
label_counts = pd.Series(ytrain.values.flatten()).value_counts()
label_counts[1] / (label_counts[0] + label_counts[1]) * 100

In [None]:
#check data
pd.set_option('display.max_columns', None)
xtrain.describe()

In [None]:
# Check missing data: percentage of missing values per feature, keeping at
# most the 30 worst features and dropping features with nothing missing.
all_data_na = (xtrain.isnull().sum() / len(xtrain)) * 100
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})

f, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=all_data_na.index, y=all_data_na)
# Rotate the tick labels after the bars exist, and pass a number: the string
# '90' is not a valid rotation ('vertical', 'horizontal' or a number).
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)

In [None]:
%%time
# Impute missing values column by column with that column's most frequent
# value. (Original author's note: LightGBM would not need NA handling.)
for column_name in xtrain.columns:
    most_frequent = xtrain[column_name].mode()[0]
    xtrain[column_name] = xtrain[column_name].fillna(most_frequent)
xtrain.describe()

In [None]:
xtrain.isnull().sum()

In [None]:
xtrain.head()

In [None]:
# EDA skew
xtrain.skew(axis=0).sort_values(ascending=False)
# Feature '37' stands out as a numerical anomaly (much more skewed than the others)

In [None]:
xtrain['37'].hist()

In [None]:
# Handling No. 37: every value above 100 is replaced by the constant 200.
# NOTE(review): values between 100 and 200 are raised to 200 - confirm this
# bucket value is intended rather than a cap at 100.
xtrain['37'] = xtrain['37'].mask(xtrain['37'] > 100, 200)

In [None]:
#EDA No. 37
from scipy import stats
from scipy.stats import norm, skew #for some statistics
def check_skewness(col):
    """Plot the distribution of ``xtrain[col]`` with a fitted normal curve.

    Also prints the mean (mu) and standard deviation (sigma) of the normal
    distribution fitted to the same values.

    Args:
        col: A column label of ``xtrain``, or a list of labels whose values
            are pooled into a single sample.
    """
    # Flatten to 1-D so a plain label and a one-element list behave the same.
    values = np.ravel(xtrain[col].to_numpy())
    sns.distplot(values, fit=norm)
    # Get the fitted parameters used by the function
    (mu, sigma) = norm.fit(values)
    print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))


check_skewness('37')

In [None]:
# check unique value
for i in xtrain.columns:
    print(i,": ",len(xtrain[i].unique()))

In [None]:
#Feature distribution
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Scatter plot of two features (positional columns 33 and 36) for the first
# 1000 rows, coloured by the label in the first column of ytrain.
h = .2  # step size in the mesh

# Value range of each plotted feature, padded by 0.5 on both sides.
x_min, x_max = xtrain.iloc[0:1000, 33].min() - .5, xtrain.iloc[0:1000, 33].max() + .5
y_min, y_max = xtrain.iloc[0:1000, 36].min() - .5, xtrain.iloc[0:1000, 36].max() + .5
# The mesh is only used below to derive the axis limits.
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
# just plot the dataset first
cm = plt.cm.RdBu  # not used later in this cell
cm_bright = ListedColormap(['#FF0000', '#0000FF'])  # two colours, one per label value
ax = plt.subplot()
ax.scatter(xtrain.iloc[0:1000, 33], xtrain.iloc[0:1000, 36], c=list(ytrain.iloc[0:1000,0]),cmap=cm_bright,
           edgecolors='k')
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
# Hide the tick marks on both axes.
ax.set_xticks(())
ax.set_yticks(())

In [None]:
xtrain['1'].hist()

In [None]:
#corelation
corrmat = xtrain.corr()
corrmat

In [None]:
# For every column, count the correlation-matrix entries greater than 0.01.
# NOTE(review): only positive correlations pass this filter, and each
# feature's correlation with itself is counted too - consider
# corrmat.abs() and excluding the diagonal before drawing conclusions.
corrmat[corrmat>0.01].count()
# No clear correlation

In [None]:
# plt.figure(figsize=(10,10))
# g = sns.heatmap(train_data[top_corr_features].corr(),annot=True,cmap="RdYlGn")

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.check_cv.html#sklearn.model_selection.check_cv
# from sklearn.model_selection import check_cv
# cv = check_cv(3, xtrain, ytrain, classifier=True)

No clear pattern is visible in the scatter plot.

In [None]:
# Join features and labels, draw a reproducible 17,000-row sample, and
# balance the classes by under-sampling with NearMiss.
from imblearn.under_sampling import NearMiss

xy = xtrain.join(ytrain)
train_sample = xy.sample(n=17000, random_state=0)
# Label distribution before under-sampling. It is printed explicitly because
# a bare expression in the middle of a cell displays nothing.
print(train_sample['x'].value_counts())
# The last column of the joined frame is the label; the rest are features.
X = train_sample.iloc[:, :-1]
y = train_sample.iloc[:, -1]
ns = NearMiss()
# fit_resample is the supported resampling entry point; the older
# fit_sample alias has been removed from imbalanced-learn.
X_train_ns, y_train_ns = ns.fit_resample(X, y)
X_train_ns.shape

In [None]:
pd.value_counts(y_train_ns.values.flatten())

## Training part

In [None]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
# Split the under-sampled data 80/20 with a fixed seed. The slice [0:-1000]
# leaves the last 1000 resampled rows out of both the train and test parts.
# NOTE(review): if the resampler returns rows grouped by class, dropping the
# tail removes rows of a single class and undoes part of the balancing -
# confirm, and consider passing stratify= to the split.
X_train, X_test, y_train, y_test=train_test_split(X_train_ns[0:-1000],y_train_ns[0:-1000], test_size=0.2, random_state=3)
# gc.collect()
# Report the shapes of the resulting partitions.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

In [None]:
%%time
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions. The results are wrapped back
# into DataFrames that keep the original column names.
ss = StandardScaler()
train_matrix = ss.fit_transform(X_train)
test_matrix = ss.transform(X_test)
X_train_scaled = pd.DataFrame(train_matrix, columns=X_train.columns)
X_test_scaled = pd.DataFrame(test_matrix, columns=X_test.columns)

In [None]:
X_train.head()

In [None]:
X_train_scaled

# Algos code start here

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import validation_curve
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
# !pip install pydotplus
# import pydotplus
from IPython.display import Image
from sklearn.model_selection import learning_curve 
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from random import uniform

In [None]:
%%time
# Baseline: XGBoost with default hyperparameters (fixed seed, 'error' as the
# evaluation metric), trained on the scaled training data and used to
# predict the scaled test data.
xgb = XGBClassifier(random_state=0,eval_metric='error')
model = xgb.fit(X_train_scaled, y_train)  # fit() result is kept as `model` for plot_tree below
prediction = xgb.predict(X_test_scaled)

In [None]:
from xgboost import plot_tree

# Draw a tree of the baseline model.
plot_tree(model)
# Save BEFORE showing: once plt.show() has displayed the figure, a later
# savefig() writes a new, empty figure instead of the tree.
plt.savefig("before_pruning.png",  dpi=800)
plt.show()

In [None]:
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

# Pruning

In [None]:
%%time
# "Pruned" variant: the same classifier limited to 10 trees of depth 2,
# trained and evaluated on the same scaled train/test data as the baseline.
xgb = XGBClassifier(max_depth=2,n_estimators=10,random_state=0, eval_metric='error')
model = xgb.fit(X_train_scaled, y_train)
prediction = xgb.predict(X_test_scaled)

In [None]:
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
# Draw a tree of the depth-limited model and write it to disk. The figure is
# saved without an intervening plt.show() call.
plot_tree(model)
# plt.show()
plt.savefig("after_pruning.png",  dpi=800)

First step: a randomized search over the hyperparameters. Source: https://gist.github.com/otaviomguerra/51df7a4cff28f92de7105f12a0724115

In [None]:
from scipy.stats import uniform as sp_uniform

# Search space for the randomized search. Every entry is a distribution
# that RandomizedSearchCV samples afresh for each candidate.
param_dist = {"max_depth": randint(1, 12),
              # scipy's uniform(loc, scale) covers [loc, loc + scale], i.e.
              # [0.1, 0.5]. The `uniform` imported from `random` returned ONE
              # float up front, so eta was fixed and never actually searched.
              "eta": sp_uniform(0.1, 0.4),
              "gamma": randint(0, 10),
              "n_estimators": randint(1, 100)}
tree = XGBClassifier(random_state=0,eval_metric='error')
# random_state makes the sampled candidates, and therefore best_params_,
# reproducible between runs.
tree_cv = RandomizedSearchCV(tree, param_dist, cv=3, random_state=0)
tree_cv.fit(X_train_scaled,y_train)
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

In [None]:
%%time
prediction = tree_cv.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

Source:https://www.geeksforgeeks.org/using-learning-curves-ml/
# Learning curve: iteration 1

In [None]:
# Fractions of the training data used for the learning curve (1% .. 100%).
train_sizes = np.linspace(0.01, 1.0, 100)


def plot_learning_curve(estimator, title="LEARNING CURVE FOR XGBoost Classifier"):
    """Plot the accuracy learning curve and the fit times of ``estimator``.

    Runs 3-fold cross-validated ``learning_curve`` on the module-level
    ``X_train_scaled`` / ``y_train`` for every fraction in ``train_sizes``.
    The left panel shows mean training and cross-validation accuracy, the
    right panel the mean fit time, both against the training-set size.
    Shaded bands mark one standard deviation across the folds.

    Args:
        estimator: Unfitted or fitted scikit-learn compatible classifier.
        title: Title of the accuracy panel.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can show or save the figure.
    """
    sizes, training_scores, testing_scores, fit_times, _ = learning_curve(
        estimator, X_train_scaled, y_train, cv=3, scoring='accuracy',
        verbose=10, n_jobs=-1, return_times=True, train_sizes=train_sizes)

    # Mean and standard deviation across the CV folds (axis 1).
    mean_training = np.mean(training_scores, axis=1)
    std_training = np.std(training_scores, axis=1)
    mean_testing = np.mean(testing_scores, axis=1)
    std_testing = np.std(testing_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    _, axes = plt.subplots(1, 2, figsize=(20, 5))

    # Left panel: dotted blue line is the training score, green line the
    # cross-validation score; the bands show the fold-to-fold spread.
    score_ax = axes[0]
    score_ax.fill_between(sizes, mean_training - std_training,
                          mean_training + std_training, alpha=0.1, color="b")
    score_ax.fill_between(sizes, mean_testing - std_testing,
                          mean_testing + std_testing, alpha=0.1, color="g")
    score_ax.plot(sizes, mean_training, '--', color="b", label="Training score")
    score_ax.plot(sizes, mean_testing, color="g", label="Cross-validation score")
    score_ax.set_title(title)
    score_ax.set_xlabel("Training Set Size")
    score_ax.set_ylabel("accuracy")
    score_ax.legend(loc="best")

    # Right panel: how the fit time grows with the training-set size.
    time_ax = axes[1]
    time_ax.grid()
    time_ax.fill_between(sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    time_ax.plot(sizes, fit_times_mean, 'o-')
    time_ax.set_xlabel("Training Set Size")
    time_ax.set_ylabel("fit_times")
    time_ax.set_title("Performance of the model")

    return plt

In [None]:
%%time
plot_learning_curve(tree_cv.best_estimator_)

Validation curves for individual hyperparameters (starting with gamma). Source: https://datascience.stackexchange.com/questions/26918/validation-curve-unlike-sklearn-sample

In [None]:
def plot_validation_curve(param, param_range,estimator):
    """Plot training vs. cross-validation accuracy over one hyperparameter.

    Runs 3-fold ``validation_curve`` on the module-level ``X_train_scaled`` /
    ``y_train`` and shows the mean scores for every value in ``param_range``.
    Shaded bands mark one standard deviation across the folds.

    Args:
        param: Name of the estimator parameter to vary.
        param_range: Sequence of values to evaluate (list or array).
        estimator: scikit-learn compatible classifier providing ``param``.
    """
    train_scores, test_scores = validation_curve(
        estimator, X_train_scaled, y_train, param_name=param,
        param_range=param_range, cv=3, n_jobs=-1, scoring="accuracy")

    # Mean and standard deviation across the CV folds (axis 1).
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title("Validation Curve with XGBoost")
    plt.xlabel(param)
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(param_range, train_scores_mean, label="Training score",
             color="r")
    plt.plot(param_range, test_scores_mean, label="Cross-validation score",
             color="g")

    plt.legend(loc="best")
    # The former post-plot reassignment of param_range was dead code and made
    # the function fail for plain lists (no .max()); it has been removed.
    plt.show()

In [None]:
%%time
# Validation curve for gamma over 0, 2, ..., 40, starting from the best
# estimator found by the randomized search.
param_range = np.arange(0, 41, 2)
param_name="gamma"
plot_validation_curve(param_name,param_range,tree_cv.best_estimator_)

Findings:

In [None]:
%%time
# Iteration 1: manual gamma sweep (1, 10, 100, 0). Each cell overrides gamma
# in the search's best_params_, builds a fresh classifier from those
# parameters, fits it and reports the confusion matrix and classification
# report on the test split.
# The override edits tree_cv.best_params_ in place, so it persists into every
# later cell; the last value assigned in this sweep (gamma = 0) is what the
# following iterations inherit.
# NOTE(review): every candidate is scored on the same X_test_scaled / y_test,
# so choosing values from these reports tunes on the test set - consider a
# separate validation split.
tree_cv.best_params_['gamma'] = 1
xg_iter1=XGBClassifier(random_state=0,eval_metric='error')
xg_iter1.set_params(**tree_cv.best_params_)
model = xg_iter1.fit(X_train_scaled, y_train)
prediction = xg_iter1.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
# gamma = 10
tree_cv.best_params_['gamma'] = 10
xg_iter1=XGBClassifier(random_state=0,eval_metric='error')
xg_iter1.set_params(**tree_cv.best_params_)
model = xg_iter1.fit(X_train_scaled, y_train)
prediction = xg_iter1.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
# gamma = 100
tree_cv.best_params_['gamma'] = 100
xg_iter1=XGBClassifier(random_state=0,eval_metric='error')
xg_iter1.set_params(**tree_cv.best_params_)
model = xg_iter1.fit(X_train_scaled, y_train)
prediction = xg_iter1.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
# gamma = 0; this is the setting carried forward as xg_iter1.
tree_cv.best_params_['gamma'] = 0
xg_iter1=XGBClassifier(random_state=0,eval_metric='error')
xg_iter1.set_params(**tree_cv.best_params_)
model = xg_iter1.fit(X_train_scaled, y_train)
prediction = xg_iter1.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
# Learning curve of the last xg_iter1 built above (gamma = 0).
plot_learning_curve(xg_iter1)

# iter 2

In [None]:
%%time
# Iteration 2: validation curve for max_depth over 1..15, based on xg_iter1.
param_range = np.arange(1, 16, 1)
param_name="max_depth"
plot_validation_curve(param_name,param_range,xg_iter1)

In [None]:
%%time
# Manual max_depth sweep (20, 9, 2). As before, the override is written into
# tree_cv.best_params_ in place and therefore persists into later cells.
# max_depth = 20 lies outside the 1..15 range plotted above.
tree_cv.best_params_['max_depth'] = 20
xg_iter2=XGBClassifier(random_state=0,eval_metric='error')
xg_iter2.set_params(**tree_cv.best_params_)
model = xg_iter2.fit(X_train_scaled, y_train)
prediction = xg_iter2.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
# max_depth = 9
tree_cv.best_params_['max_depth'] = 9
xg_iter2=XGBClassifier(random_state=0,eval_metric='error')
xg_iter2.set_params(**tree_cv.best_params_)
model = xg_iter2.fit(X_train_scaled, y_train)
prediction = xg_iter2.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
# max_depth = 2; this is the setting carried forward as xg_iter2.
tree_cv.best_params_['max_depth'] = 2
xg_iter2=XGBClassifier(random_state=0,eval_metric='error')
xg_iter2.set_params(**tree_cv.best_params_)
model = xg_iter2.fit(X_train_scaled, y_train)
prediction = xg_iter2.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

# iter 3

In [None]:
%%time
# Learning curve of the last xg_iter2 built in iteration 2 (max_depth = 2).
plot_learning_curve(xg_iter2)

In [None]:
%%time
# Iteration 3: validation curve for n_estimators over 1, 11, ..., 91.
param_range = np.arange(1, 100, 10)
param_name="n_estimators"
plot_validation_curve(param_name,param_range,xg_iter2)

In [None]:
%%time
# n_estimators = 20. Building, fitting and predicting are split into separate
# cells so that %%time reports each step on its own.
tree_cv.best_params_['n_estimators'] = 20
xg_iter3=XGBClassifier(random_state=0,eval_metric='error')
xg_iter3.set_params(**tree_cv.best_params_)




In [None]:
%%time
model = xg_iter3.fit(X_train_scaled, y_train)

In [None]:
%%time

prediction = xg_iter3.predict(X_test_scaled)

In [None]:
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
plot_learning_curve(xg_iter3)

With n_estimators = 100 the wall time is about 4 seconds; with 20 it is just 1 second, and the result with 100 is not better!

In [None]:
%%time
# n_estimators sweep continued (40, 60, 100). Each value follows the same
# pattern: override the parameter in tree_cv.best_params_ (in place), build a
# fresh classifier, then fit, predict, report and plot in separate timed cells.
tree_cv.best_params_['n_estimators'] = 40
xg_iter3=XGBClassifier(random_state=0,eval_metric='error')
xg_iter3.set_params(**tree_cv.best_params_)




In [None]:
%%time

model = xg_iter3.fit(X_train_scaled, y_train)

In [None]:
%%time

prediction = xg_iter3.predict(X_test_scaled)

In [None]:
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
plot_learning_curve(xg_iter3)

In [None]:
%%time
# n_estimators = 60
tree_cv.best_params_['n_estimators'] = 60
xg_iter3=XGBClassifier(random_state=0,eval_metric='error')
xg_iter3.set_params(**tree_cv.best_params_)




In [None]:
%%time

model = xg_iter3.fit(X_train_scaled, y_train)

In [None]:
%%time

prediction = xg_iter3.predict(X_test_scaled)

In [None]:
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
plot_learning_curve(xg_iter3)

In [None]:
%%time
# n_estimators = 100; this is the setting carried forward as xg_iter3.
tree_cv.best_params_['n_estimators'] = 100
xg_iter3=XGBClassifier(random_state=0,eval_metric='error')
xg_iter3.set_params(**tree_cv.best_params_)




In [None]:
%%time
model = xg_iter3.fit(X_train_scaled, y_train)


In [None]:
%%time

prediction = xg_iter3.predict(X_test_scaled)

In [None]:
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
plot_learning_curve(xg_iter3)

# Iter 4

In [None]:
%%time
# Learning curve of xg_iter3 again; the estimator has not been rebuilt since
# the previous learning-curve cell.
plot_learning_curve(xg_iter3)

In [None]:
%%time
# Iteration 4: validation curve for eta over 0.0, 0.1, ..., 0.9.
# NOTE(review): the range starts at eta = 0 - confirm that value is meant to
# be part of the comparison.
param_range = np.arange(0, 1, .1)
param_name="eta"
plot_validation_curve(param_name,param_range,xg_iter3)

In [None]:
# Manual eta sweep (0.3, 0.7); the override is written into
# tree_cv.best_params_ in place, as in the earlier iterations.
tree_cv.best_params_['eta'] = 0.3
xg_iter4=XGBClassifier(random_state=0,eval_metric='error')
xg_iter4.set_params(**tree_cv.best_params_)
model = xg_iter4.fit(X_train_scaled, y_train)
prediction = xg_iter4.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
# eta = 0.7; this is the setting carried forward as xg_iter4.
tree_cv.best_params_['eta'] = 0.7
xg_iter4=XGBClassifier(random_state=0,eval_metric='error')
xg_iter4.set_params(**tree_cv.best_params_)
model = xg_iter4.fit(X_train_scaled, y_train)
prediction = xg_iter4.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

# Iter 5

In [None]:
%%time
# Learning curve of the last xg_iter4 built in iteration 4 (eta = 0.7).
plot_learning_curve(xg_iter4)

In [None]:
%%time
# Iteration 5: validation curve for min_child_weight over 0..9.
param_range = np.arange(0, 10, 1)
param_name="min_child_weight"
plot_validation_curve(param_name,param_range,xg_iter4)

In [None]:
%%time
# Manual min_child_weight sweep (5, 9, 3); the override is written into
# tree_cv.best_params_ in place, as in the earlier iterations.
tree_cv.best_params_['min_child_weight'] = 5
xg_iter5=XGBClassifier(random_state=0,eval_metric='error')
xg_iter5.set_params(**tree_cv.best_params_)
model = xg_iter5.fit(X_train_scaled, y_train)
prediction = xg_iter5.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
# min_child_weight = 9
tree_cv.best_params_['min_child_weight'] = 9
xg_iter5=XGBClassifier(random_state=0,eval_metric='error')
xg_iter5.set_params(**tree_cv.best_params_)
model = xg_iter5.fit(X_train_scaled, y_train)
prediction = xg_iter5.predict(X_test_scaled)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
# min_child_weight = 3: the final model. Build, fit and predict are timed in
# separate cells.
tree_cv.best_params_['min_child_weight'] = 3
xg_iter5=XGBClassifier(random_state=0,eval_metric='error')
xg_iter5.set_params(**tree_cv.best_params_)


In [None]:
%%time
model = xg_iter5.fit(X_train_scaled, y_train)


In [None]:
%%time
prediction = xg_iter5.predict(X_test_scaled)

In [None]:

print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

In [None]:
%%time
plot_learning_curve(xg_iter5)