In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans

In [10]:
df_train = pd.read_csv('../input/train_data/train_data.csv', sep=',')
df_test = pd.read_csv('../input/test_data/test_data.csv', sep=',')

In [11]:
col = ['op_setting_1', 'op_setting_2',
       'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6',
       'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11',
       'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_17',
       'sensor_18', 'sensor_20', 'sensor_21']
mean = df_train.mean()
std = df_train.std()

def preprocess(df, norm=True, rul=False):
    X = pd.DataFrame()
    X['engine_no'] = df['engine_no']
    X['time_in_cycles'] = df['time_in_cycles']
    if norm:
        for c in col:
            X[c] = (df[c] - mean[c])/std[c]
    else:
        for c in col:
            X[c] = df[c]
    if 'RUL' in df.columns:
        if rul:
            X['RUL'] = df['RUL']
        y = (df['RUL']<100).astype(int)
    else:
        y=None
    return X, y

In [79]:
def get_cor(X):
    if 'RUL' in X.columns:
        X = X.drop(columns=['RUL'])
    l = []
    eng_un = X['engine_no'].unique()
    for i in eng_un:
        d =  X[X['engine_no']==i]
        #print(d.corr()['time_in_cycles'])
        cor = [i] + d.corr()['time_in_cycles'].tolist()[2:]
        l.append(cor)
    c = 'engine_no', 'op_setting_1', 'op_setting_2', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_17', 'sensor_18', 'sensor_20', 'sensor_21'
    df_cor = pd.DataFrame(l, columns=c)
    df_cor=df_cor.fillna(0)
    return df_cor



In [106]:
def train_kmeans(df_cor):
    df = df_cor.drop(columns=['engine_no'])
    kmeans = KMeans(n_clusters=3)
    y_pred = kmeans.fit_predict(df)
    return kmeans, y_pred

def train_pca(df_cor, c):
    df = df_cor.drop(columns=['engine_no'])
    pca = PCA()
    cor_pca = pca.fit_transform(df)

    expl = pca.explained_variance_ratio_
    plt.bar(range(len(expl)), expl)
    plt.show()
    
    plt.figure(figsize=(8, 8))
    plt.scatter(cor_pca[:,0], cor_pca[:,1], c=c)
    return pca

def train_forests(X, y):
    model = RandomForestClassifier()
    model.fit(X, y)
    return model

def pred_kmeans(df_cor, kmeans):
    df = df_cor.drop(columns=['engine_no'])
    y_pred = kmeans.predict(df)
    return y_pred

In [81]:
def type_to_X(X, df_cor, y_pred):
    df_cor['Failure']=y_pred
    fail_type=df_cor[['engine_no', 'Failure']]
    dataset = pd.merge(X, fail_type, how='left', on='engine_no')
    return dataset

## Main

In [129]:
X, y = preprocess(df_train, rul=True, norm=True)
df_cor = get_cor(X)
kmeans, y_pred = train_kmeans(df_cor)
#pca = train_pca(df_cor, c=y_pred)

X = type_to_X(X, df_cor, y_pred)
models=[]
for i in [0, 1, 2]:
    x = X.drop(columns=["Failure", 'RUL'])
    y = X['Failure']
    m = train_forests(x[X['Failure']==i], y[X['Failure']==i])
    models.append(m)

x[X['Failure']==0]



Unnamed: 0,engine_no,time_in_cycles,op_setting_1,op_setting_2,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,...,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_18,sensor_20,sensor_21
0,0,1,0.471650,0.570738,-0.765963,-1.424744,-1.781385,-1.591976,-0.666971,-0.838765,...,-1.503873,-2.192440,-1.059968,-2.889927,-2.647883,2.435204,-1.763259,-2.517341,-0.991557,-1.011323
1,0,2,1.076672,1.172198,-1.196595,-0.986876,-0.868334,-0.969966,-1.035035,-0.997053,...,-0.870530,-0.709809,-0.944738,0.346818,-0.196357,0.263382,-0.860657,-0.356136,-0.946224,-0.936969
2,0,3,0.471523,0.574815,-0.765963,-1.434632,-1.701673,-1.568058,-0.666971,-0.837213,...,-1.503873,-2.253730,-1.061551,-2.889747,-2.531168,2.462347,-1.731023,-2.517341,-1.003531,-1.005693
3,0,4,1.500033,1.170567,-1.342550,-1.128830,-0.943646,-1.064904,-1.403100,-1.350875,...,-0.940901,-0.709809,-1.267528,0.346188,0.051214,0.384061,-0.957364,-0.433322,-1.309739,-1.298334
4,0,5,1.076515,1.168664,-1.196595,-0.992290,-0.896936,-1.007530,-1.035035,-0.997053,...,-0.870530,-0.706891,-0.948697,0.346728,-0.133596,0.364502,-0.860657,-0.356136,-0.940237,-0.931966
5,0,6,-1.041347,-1.108892,1.079181,1.059806,0.999485,1.067597,1.107711,1.115014,...,1.029501,0.892483,1.116976,0.345828,0.712686,-0.871159,1.041254,0.801652,1.112550,1.119048
6,0,7,1.499785,1.169752,-1.342550,-1.125299,-0.977579,-1.015894,-1.403100,-1.352427,...,-0.940901,-0.666031,-1.267893,0.347447,0.082842,0.359446,-0.989600,-0.433322,-1.300330,-1.312047
7,0,8,1.500033,1.168664,-1.342550,-1.129536,-1.001611,-1.125358,-1.403100,-1.350875,...,-0.940901,-0.686461,-1.269781,0.347267,0.086563,0.374082,-1.054071,-0.433322,-1.312305,-1.315497
8,0,9,1.499797,1.168664,-1.342550,-1.139424,-0.949570,-1.049423,-1.403100,-1.352427,...,-0.940901,-0.677705,-1.268563,0.346728,0.069943,0.355987,-1.021836,-0.433322,-1.314871,-1.290308
9,0,10,-0.435920,-0.434866,0.105492,0.167590,0.192886,0.277062,0.146523,0.165282,...,0.748015,0.285421,0.204760,0.345828,0.689367,-0.588023,0.267596,0.317486,0.229852,0.226498


In [130]:
X_test, y_test = preprocess(df_test, rul=True, norm=True)
df_cor_test = get_cor(X_test)

In [131]:
pred_test = pred_kmeans(df_cor_test, kmeans)

In [149]:
dataset = type_to_X(X_test, df_cor_test, pred_test)
dataset= dataset.fillna(0)
dataset

Unnamed: 0,engine_no,time_in_cycles,op_setting_1,op_setting_2,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,...,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_18,sensor_20,sensor_21,Failure
0,0,1,1.499966,1.168664,-1.342550,-1.130007,-1.057630,-1.000707,-1.403100,-1.352427,...,-0.666031,-1.265640,0.344119,-0.138558,0.338557,-1.021836,-0.433322,-1.304607,-1.307201,0
1,0,2,1.499864,1.168664,-1.342550,-1.142484,-0.974025,-1.061455,-1.403100,-1.352427,...,-0.668949,-1.269781,0.344029,-0.138062,0.386190,-0.957364,-0.433322,-1.316581,-1.309781,0
2,0,3,-1.041214,-1.109163,1.079181,1.059100,0.987553,1.023357,1.107711,1.115014,...,0.939180,1.114783,0.345378,0.499100,-0.892448,1.009019,0.801652,1.103997,1.109412,0
3,0,4,1.500045,1.168664,-1.342550,-1.122238,-0.953462,-0.993444,-1.403100,-1.350875,...,-0.695216,-1.264605,0.343399,-0.085719,0.439677,-0.989600,-0.433322,-1.315726,-1.294157,0
4,0,5,-0.435998,-0.433235,0.105492,0.173240,0.250597,0.363636,0.146523,0.165282,...,0.302933,0.199644,0.345918,0.406570,-0.595208,0.299831,0.317486,0.242681,0.232785,0
5,0,6,-0.436422,-0.431061,0.105492,0.172298,0.338517,0.349036,0.146523,0.165282,...,0.335037,0.202445,0.345468,0.400617,-0.585761,0.235360,0.317486,0.228141,0.227011,0
6,0,7,-0.436464,-0.434866,0.105492,0.172298,0.332340,0.392176,0.146523,0.165282,...,0.326281,0.201775,0.345918,0.411408,-0.581636,0.235360,0.317486,0.234128,0.232072,0
7,0,8,0.471196,0.570738,-0.765963,-1.423096,-1.754561,-1.583172,-0.666971,-0.838765,...,-2.212870,-1.060821,-2.892086,-2.724660,2.439196,-1.763259,-2.517341,-0.978727,-0.992920,0
8,0,9,-0.436016,-0.434050,0.105492,0.170415,0.249243,0.302447,0.146523,0.165282,...,0.291258,0.202567,0.346098,0.380399,-0.519900,0.203124,0.317486,0.226430,0.233169,0
9,0,10,0.168752,0.788166,0.175840,0.236095,0.113597,-0.067399,-0.127768,-0.120258,...,-0.003517,-0.142696,0.345108,-0.423463,0.221471,0.138653,0.352571,-0.117412,-0.104518,0


In [150]:
def predict(df):
    x = df.drop(columns=['Failure'])
    df['Pred_0'] = models[0].predict(x)
    df['Pred_1'] = models[1].predict(x)
    df['Pred_2'] = models[2].predict(x)
    
    def chose(l):
        i = l['Failure']
        return int(l['Pred_' + str(int(i))])
    
    df['WillFail'] = df.apply(chose, axis=1)
    return df
    
predict(dataset)

Unnamed: 0,engine_no,time_in_cycles,op_setting_1,op_setting_2,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,...,sensor_15,sensor_17,sensor_18,sensor_20,sensor_21,Failure,Pred_0,Pred_1,Pred_2,WillFail
0,0,1,1.499966,1.168664,-1.342550,-1.130007,-1.057630,-1.000707,-1.403100,-1.352427,...,0.338557,-1.021836,-0.433322,-1.304607,-1.307201,0,0,1,2,0
1,0,2,1.499864,1.168664,-1.342550,-1.142484,-0.974025,-1.061455,-1.403100,-1.352427,...,0.386190,-0.957364,-0.433322,-1.316581,-1.309781,0,0,1,2,0
2,0,3,-1.041214,-1.109163,1.079181,1.059100,0.987553,1.023357,1.107711,1.115014,...,-0.892448,1.009019,0.801652,1.103997,1.109412,0,0,1,2,0
3,0,4,1.500045,1.168664,-1.342550,-1.122238,-0.953462,-0.993444,-1.403100,-1.350875,...,0.439677,-0.989600,-0.433322,-1.315726,-1.294157,0,0,1,2,0
4,0,5,-0.435998,-0.433235,0.105492,0.173240,0.250597,0.363636,0.146523,0.165282,...,-0.595208,0.299831,0.317486,0.242681,0.232785,0,0,1,2,0
5,0,6,-0.436422,-0.431061,0.105492,0.172298,0.338517,0.349036,0.146523,0.165282,...,-0.585761,0.235360,0.317486,0.228141,0.227011,0,0,1,2,0
6,0,7,-0.436464,-0.434866,0.105492,0.172298,0.332340,0.392176,0.146523,0.165282,...,-0.581636,0.235360,0.317486,0.234128,0.232072,0,0,1,2,0
7,0,8,0.471196,0.570738,-0.765963,-1.423096,-1.754561,-1.583172,-0.666971,-0.838765,...,2.439196,-1.763259,-2.517341,-0.978727,-0.992920,0,0,1,2,0
8,0,9,-0.436016,-0.434050,0.105492,0.170415,0.249243,0.302447,0.146523,0.165282,...,-0.519900,0.203124,0.317486,0.226430,0.233169,0,0,1,2,0
9,0,10,0.168752,0.788166,0.175840,0.236095,0.113597,-0.067399,-0.127768,-0.120258,...,0.221471,0.138653,0.352571,-0.117412,-0.104518,0,0,1,2,0


In [154]:
sub = pd.DataFrame()

df_test = df_test.sort_values(['engine_no', 'time_in_cycles'])

# On prend la dernière prédiction du RUL
df_result = dataset.groupby('engine_no').last().reset_index()[['engine_no', 'Failure']]
df_result.columns=['engine_no', 'result']
print(df_result)

df_result.to_csv('sub.csv', index=False)

# On convertit en binaire (RUL > 100 ?)
#df_result['result'] = df_result['pred_tree'].map(lambda x: 0 if x > 100 else 1)


     engine_no  result
0            0       0
1            1       0
2            2       0
3            3       0
4            4       0
5            5       0
6            6       0
7            7       0
8            8       0
9            9       0
10          10       0
11          11       0
12          12       0
13          13       0
14          14       0
15          15       0
16          16       0
17          17       0
18          18       0
19          19       0
20          20       0
21          21       0
22          22       0
23          23       0
24          24       0
25          25       0
26          26       0
27          27       0
28          28       0
29          29       0
..         ...     ...
677        677       0
678        678       2
679        679       0
680        680       2
681        681       0
682        682       0
683        683       2
684        684       1
685        685       2
686        686       1
687        687       0
688        