In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans

In [10]:
df_train = pd.read_csv('../input/train_data/train_data.csv', sep=',')
df_test = pd.read_csv('../input/test_data/test_data.csv', sep=',')

In [11]:
col = ['op_setting_1', 'op_setting_2',
       'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6',
       'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11',
       'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_17',
       'sensor_18', 'sensor_20', 'sensor_21']
mean = df_train.mean()
std = df_train.std()

def preprocess(df, norm=True, rul=False):
    X = pd.DataFrame()
    X['engine_no'] = df['engine_no']
    X['time_in_cycles'] = df['time_in_cycles']
    if norm:
        for c in col:
            X[c] = (df[c] - mean[c])/std[c]
    else:
        for c in col:
            X[c] = df[c]
    if 'RUL' in df.columns:
        if rul:
            X['RUL'] = df['RUL']
        y = (df['RUL']<100).astype(int)
    else:
        y=None
    return X, y

In [79]:
def get_cor(X):
    if 'RUL' in X.columns:
        X = X.drop(columns=['RUL'])
    l = []
    eng_un = X['engine_no'].unique()
    for i in eng_un:
        d =  X[X['engine_no']==i]
        #print(d.corr()['time_in_cycles'])
        cor = [i] + d.corr()['time_in_cycles'].tolist()[2:]
        l.append(cor)
    c = 'engine_no', 'op_setting_1', 'op_setting_2', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_17', 'sensor_18', 'sensor_20', 'sensor_21'
    df_cor = pd.DataFrame(l, columns=c)
    df_cor=df_cor.fillna(0)
    return df_cor



In [106]:
def train_kmeans(df_cor):
    df = df_cor.drop(columns=['engine_no'])
    kmeans = KMeans(n_clusters=3)
    y_pred = kmeans.fit_predict(df)
    return kmeans, y_pred

def train_pca(df_cor, c):
    df = df_cor.drop(columns=['engine_no'])
    pca = PCA()
    cor_pca = pca.fit_transform(df)

    expl = pca.explained_variance_ratio_
    plt.bar(range(len(expl)), expl)
    plt.show()
    
    plt.figure(figsize=(8, 8))
    plt.scatter(cor_pca[:,0], cor_pca[:,1], c=c)
    return pca

def train_forests(X, y):
    model = RandomForestClassifier()
    model.fit(X, y)
    return model

def pred_kmeans(df_cor, kmeans):
    df = df_cor.drop(columns=['engine_no'])
    y_pred = kmeans.predict(df)
    return y_pred

In [81]:
def type_to_X(X, df_cor, y_pred):
    df_cor['Failure']=y_pred
    fail_type=df_cor[['engine_no', 'Failure']]
    dataset = pd.merge(X, fail_type, how='left', on='engine_no')
    return dataset

## Main

In [107]:
X, y = preprocess(df_train, rul=True, norm=True)
df_cor = get_cor(X)
kmeans, y_pred = train_kmeans(df_cor)
#pca = train_pca(df_cor, c=y_pred)

X = type_to_X(X, df_cor, y_pred)
m0 = train_forests(X[X['Failure']==0], y)



ValueError: Number of labels=160359 does not match number of samples=115008

In [92]:
X_test, y_test = preprocess(df_test, rul=True, norm=True)
df_cor_test = get_cor(X_test)

In [93]:
pred_test = pred_kmeans(df_cor_test, kmeans)

In [97]:
dataset = type_to_X(X, df_cor_test, pred_test)
dataset= dataset.fillna(0)
dataset

Unnamed: 0,engine_no,time_in_cycles,op_setting_1,op_setting_2,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_18,sensor_20,sensor_21,RUL,Failure
0,0,1,0.471650,0.570738,-0.765963,-1.424744,-1.781385,-1.591976,-0.666971,-0.838765,...,-1.059968,-2.889927,-2.647883,2.435204,-1.763259,-2.517341,-0.991557,-1.011323,339,0.0
1,0,2,1.076672,1.172198,-1.196595,-0.986876,-0.868334,-0.969966,-1.035035,-0.997053,...,-0.944738,0.346818,-0.196357,0.263382,-0.860657,-0.356136,-0.946224,-0.936969,338,0.0
2,0,3,0.471523,0.574815,-0.765963,-1.434632,-1.701673,-1.568058,-0.666971,-0.837213,...,-1.061551,-2.889747,-2.531168,2.462347,-1.731023,-2.517341,-1.003531,-1.005693,337,0.0
3,0,4,1.500033,1.170567,-1.342550,-1.128830,-0.943646,-1.064904,-1.403100,-1.350875,...,-1.267528,0.346188,0.051214,0.384061,-0.957364,-0.433322,-1.309739,-1.298334,336,0.0
4,0,5,1.076515,1.168664,-1.196595,-0.992290,-0.896936,-1.007530,-1.035035,-0.997053,...,-0.948697,0.346728,-0.133596,0.364502,-0.860657,-0.356136,-0.940237,-0.931966,335,0.0
5,0,6,-1.041347,-1.108892,1.079181,1.059806,0.999485,1.067597,1.107711,1.115014,...,1.116976,0.345828,0.712686,-0.871159,1.041254,0.801652,1.112550,1.119048,334,0.0
6,0,7,1.499785,1.169752,-1.342550,-1.125299,-0.977579,-1.015894,-1.403100,-1.352427,...,-1.267893,0.347447,0.082842,0.359446,-0.989600,-0.433322,-1.300330,-1.312047,333,0.0
7,0,8,1.500033,1.168664,-1.342550,-1.129536,-1.001611,-1.125358,-1.403100,-1.350875,...,-1.269781,0.347267,0.086563,0.374082,-1.054071,-0.433322,-1.312305,-1.315497,332,0.0
8,0,9,1.499797,1.168664,-1.342550,-1.139424,-0.949570,-1.049423,-1.403100,-1.352427,...,-1.268563,0.346728,0.069943,0.355987,-1.021836,-0.433322,-1.314871,-1.290308,331,0.0
9,0,10,-0.435920,-0.434866,0.105492,0.167590,0.192886,0.277062,0.146523,0.165282,...,0.204760,0.345828,0.689367,-0.588023,0.267596,0.317486,0.229852,0.226498,330,0.0


In [100]:
def predict(df):
    df['Pred_0'] = m0.predict(df)
    df['Pred_1'] = m1.predict(df)
    df['Pred_2'] = m2.predict(df)
    
    def chose(l):
        i = l['Failure']
        return l['Pred_' + str(i)]
    
    df['WillFail'] = df.map(chose)
    return df
    
predict(dataset)

NameError: name 'm0' is not defined

In [99]:
sub = pd.DataFrame()

df_test = df_test.sort_values(['engine_no', 'time_in_cycles'])

# On prend la dernière prédiction du RUL
df_result = dataset.groupby('engine_no').last().reset_index()[['engine_no', 'Failure']]
print(df_result)

# On convertit en binaire (RUL > 100 ?)
#df_result['result'] = df_result['pred_tree'].map(lambda x: 0 if x > 100 else 1)


     engine_no  Failure
0            0      0.0
1            1      0.0
2            2      0.0
3            3      0.0
4            4      0.0
5            5      0.0
6            6      0.0
7            7      0.0
8            8      0.0
9            9      0.0
10          10      0.0
11          11      0.0
12          12      0.0
13          13      0.0
14          14      0.0
15          15      0.0
16          16      0.0
17          17      0.0
18          18      0.0
19          19      0.0
20          20      0.0
21          21      0.0
22          22      0.0
23          23      0.0
24          24      0.0
25          25      0.0
26          26      0.0
27          27      0.0
28          28      0.0
29          29      0.0
..         ...      ...
679        679      0.0
680        680      2.0
681        681      0.0
682        682      0.0
683        683      2.0
684        684      1.0
685        685      2.0
686        686      1.0
687        687      0.0
688        688  