In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

In [3]:
train.head()

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [4]:
train.Type.unique()

array(['L', 'M', 'H'], dtype=object)

In [5]:
from sklearn import preprocessing

ids = test['id']
test = test.drop(['id', 'Product ID'], axis=1)

y_train = train[['Machine failure']]
X_train = train.drop(['id','Machine failure', 'Product ID'], axis=1)

# one hot encode type
types = pd.get_dummies(train['Type'])
X_train = X_train.drop('Type',axis = 1)
X_train = X_train.join(types)

types = pd.get_dummies(test['Type'])
test = test.drop('Type',axis = 1)
test = test.join(types)

# split into validation and training set before normalization
X_valid = X_train.sample(frac=.2, replace=False, random_state=1)
X_train.drop(X_valid.index)
y_valid = y_train.sample(frac=.2, replace=False, random_state=1)
X_train.drop(y_valid.index)

#normalize training data, validation data and testing data
min_max_scaler = preprocessing.MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X_train.values)
X_train = pd.DataFrame(X_scaled, columns=X_train.columns)

min_max_scaler = preprocessing.MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X_valid.values)
X_valid = pd.DataFrame(X_scaled, columns=X_valid.columns)

min_max_scaler = preprocessing.MinMaxScaler()
X_scaled_test = min_max_scaler.fit_transform(test.values)
test = pd.DataFrame(X_scaled_test, columns=test.columns)



In [6]:
X_train.head()
X_train.shape

(136429, 13)

In [7]:
test.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF,H,L,M
0,0.769231,0.716049,0.192666,0.46978,0.237154,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.703297,0.654321,0.317229,0.343407,0.067194,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.659341,0.580247,0.2078,0.465659,0.379447,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.527473,0.481481,0.181024,0.601648,0.019763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.89011,0.814815,0.201979,0.51511,0.450593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
#begin deep learning classification
from tensorflow import keras
from tensorflow.keras import layers

model = keras.Sequential([
    layers.Dense(32, activation='relu', input_shape=[13]),
    layers.Dense(32, activation='relu'),    
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=512,
    epochs=100,
    callbacks=[early_stopping],
    verbose=0, # hide the output because we have so many epochs
)

In [9]:
history_df = pd.DataFrame(history.history)
history_df.head()

Unnamed: 0,loss,binary_accuracy,val_loss,val_binary_accuracy
0,0.105391,0.988573,0.026639,0.995859
1,0.024691,0.996167,0.025213,0.995895
2,0.023963,0.996181,0.024265,0.995895
3,0.023159,0.996181,0.023647,0.995895
4,0.022734,0.996181,0.022975,0.995895


In [10]:
predictions = (model.predict(test) > 0.5).astype("int32")



In [11]:
predictions

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [12]:
submit = pd.DataFrame()
submit['id'] = ids
submit['Machine failure'] = predictions
submit.head(10)

Unnamed: 0,id,Machine failure
0,136429,0
1,136430,0
2,136431,0
3,136432,0
4,136433,0
5,136434,0
6,136435,0
7,136436,0
8,136437,0
9,136438,0


In [13]:
submit.value_counts('Machine failure')

Machine failure
0    89849
1     1105
dtype: int64

In [14]:
submit.to_csv('submission.csv', index=False)

In [15]:
from sklearn.ensemble import RandomForestClassifier 
rfc = RandomForestClassifier()

# Fit the model to your training data.
rfc.fit(X_train, np.ravel(y_train))
predictions = rfc.predict(test)

submit2 = pd.DataFrame()
submit2['id'] = ids
submit2['Machine failure'] = predictions
submit2.head(10)

Unnamed: 0,id,Machine failure
0,136429,0
1,136430,0
2,136431,0
3,136432,0
4,136433,0
5,136434,0
6,136435,0
7,136436,0
8,136437,0
9,136438,0


In [19]:
submit2.value_counts('Machine failure')
submit2.to_csv('submission2.csv', index=False)

Machine failure
0    89824
1     1130
dtype: int64

In [17]:
from sklearn.linear_model import LogisticRegressionCV

LR = LogisticRegressionCV(fit_intercept=True)

# Fit the model to your training data.
LR.fit(X_train, np.ravel(y_train))
predictions = LR.predict(test)

submit3 = pd.DataFrame()
submit3['id'] = ids
submit3['Machine failure'] = predictions
submit3.head(10)

Unnamed: 0,id,Machine failure
0,136429,0
1,136430,0
2,136431,0
3,136432,0
4,136433,0
5,136434,0
6,136435,0
7,136436,0
8,136437,0
9,136438,0


In [18]:
submit3.value_counts('Machine failure')

Machine failure
0    89850
1     1104
dtype: int64