# <font color=blue><b>Business Understanding of Data</b></font>

* We are looking into predictive maintenance of a pump using its sensor readings.
* The machine status feature gives us an understanding of the maintenance state of the pump from April 2018 to August 2018.

# <font color=blue><b>Inventory of Resources</b></font>

I suggest using anaconda to create a Python 3.6 environment and installing the python packages:

* Pandas          (pip install pandas)
* Numpy           (pip install NumPy)
* Scikit-learn    (pip install scikit-learn)
* Matplotlib      (pip install matplotlib)
* Bayes optimizer (pip install bayesian-optimization)
* XGBoost         (pip install xgboost)
* LightGBM        (pip install lightgbm)

# <font color=blue><b>Classification problem</b></font>

In [3]:
# Libraries used in this analysis



import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix,recall_score,precision_score
from sklearn.metrics import mean_squared_error
%matplotlib inline


import warnings
warnings.filterwarnings('ignore')

# <font color=blue><b>Data Ingestion</b></font> 

<h3><font color=purple><b><u>Loading Data</u></b></font></h3>

In [4]:
# Read the raw pump sensor export; the "timestamp" column is parsed into datetimes.
org_data = pd.read_csv(
    "../input/pump-sensor/pump_sensor.csv",
    parse_dates=["timestamp"],
)
org_data.head(2)

In [5]:
# Keep org_data as loaded and do all further processing on an independent copy.
data=org_data.copy()
data.head(2)

In [6]:
# Time frame of the given data: first sample, last sample, and the span between them.

first_ts = data["timestamp"].min()
last_ts = data["timestamp"].max()
print(first_ts)
print(last_ts)
print(last_ts - first_ts)

# <font color=blue><b>Data Preprocessing</b></font> 

<h3><font color=Purple><b><u>Describe Data</u></b></font> </h3>

In [7]:
data.shape

In [None]:
#Three types of status of a machine

data['machine_status'].unique()

In [8]:
data["machine_status"].value_counts()

In [None]:
# Share of rows per machine_status, formatted as percentage strings rounded to 3 decimals.
machinestatus_percent = data["machine_status"].value_counts(normalize=True).mul(100).round(3).astype(str) + '%'
machinestatus_percent

In [None]:
data.columns

In [8]:
# Remove the "Unnamed: 0" column in place.
# NOTE(review): not idempotent - running this cell a second time raises KeyError
# because the column is already gone.
data.drop("Unnamed: 0",axis="columns",inplace=True)

In [None]:
# Rows recorded while the pump was in the BROKEN / RECOVERING state
status = data['machine_status']
broken = data[status == 'BROKEN']
recovering = data[status == 'RECOVERING']
# Every second column among positions 1..6 of the frame
sensors_to_plot = data.columns[1:7:2]
# Marker layers drawn under the full series: (rows, colour, marker size, legend label).
# BROKEN is a large red X, RECOVERING a smaller orange X; the full series is the blue line.
marker_layers = (
    (broken, 'red', 12, 'broken'),
    (recovering, 'orange', 6, 'recovering'),
)
for name in sensors_to_plot:
    plt.figure(figsize=(18,3))
    for frame, colour, size, label in marker_layers:
        plt.plot(frame['timestamp'], frame[name], linestyle='none', marker='X',
                 color=colour, markersize=size, label=label)
    plt.plot(data['timestamp'], data[name], color='blue', label='working')
    plt.title(name)
    plt.legend()
    plt.show()

In [None]:
# Mean of every sensor per machine_status class, drawn as one line per class.
g = data.groupby('machine_status')  # one group per class label
# numeric_only=True keeps the datetime `timestamp` column out of the result, so the
# positional slices below address sensor columns on every pandas version.
d_m = g.mean(numeric_only=True)
x = np.arange(0, 52)
fig = plt.figure(num=None, figsize=(16, 6))
ax = fig.add_subplot(1, 1, 1)
ax.set_xticks(x)
# Each class is drawn in two segments (positions 1..14 and 16..51) so that position 15
# (the empty sensor_15, per the original author's note) shows up as a gap.
# NOTE(review): position 0 is not plotted by either segment - confirm this is intended.
# `color=` is used instead of `C=`: upper-case property names are rejected by current matplotlib.
for status, colour in (('BROKEN', 'b'), ('NORMAL', 'g'), ('RECOVERING', 'r')):
    means = d_m.loc[status].values
    plt.plot(x[1:15], means[1:15], label=status, color=colour)
    plt.plot(x[16:], means[16:], color=colour)
plt.legend()
plt.xlabel('sensor number(sensor_0,sensor_1...)')
plt.ylabel('mean of sensor')
plt.title('mean of sensors among class labels')
plt.grid()
plt.show()

In [None]:

# Absolute difference between the BROKEN and NORMAL class means of every column of d_m.
# Rows are addressed by label; the original positional [0]/[1] lookups on a
# string-indexed Series rely on a deprecated pandas fallback.
diff = {
    col: abs(d_m.loc['BROKEN', col] - d_m.loc['NORMAL', col])
    for col in d_m.columns
}
deltas = list(diff.values())

fig = plt.figure(num=None, figsize=(16, 6), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(1, 1, 1)
ax.set_xticks(x)
# Two segments, split at position 16; `color=` replaces the invalid `C=` keyword.
plt.plot(x[0:16], deltas[0:16], color='b')
plt.plot(x[16:], deltas[16:], color='b')
plt.grid()
plt.xlabel('sensor number(sensor_0,sensor_1...)')
plt.ylabel('difference in mean of sensor among class label "BROKEN" and "NORMAL"')
plt.title('difference in mean among "BROKEN" and "NORMAL"')
# Re-order the mapping from largest to smallest difference before printing it.
diff = dict(sorted(diff.items(), key=lambda item: item[1], reverse=True))
print('difference in mean for each sensor in desending order:\n\n',diff)

In [None]:
import seaborn as sns

# Row position stands in for elapsed time on the horizontal axis.
x = np.arange(len(data))
fig = plt.figure(num=None, figsize=(16, 6), dpi=80, facecolor='w', edgecolor='k')
sns.scatterplot(
    x=x,
    y='sensor_04',
    hue='machine_status',
    data=data,
)
plt.xlabel('increasing time')
plt.title('sensor_04 values wrt class label')
plt.show()

In [None]:
# Distribution of sensor_04, one curve per machine_status class.
g = data.groupby('machine_status')
fig = plt.figure(num=None, figsize=(16, 6), dpi=80, facecolor='w', edgecolor='k')
for status in ('BROKEN', 'NORMAL', 'RECOVERING'):
    sns.distplot(g.get_group(status)['sensor_04'].values, label=status)
plt.title('PDF of sensor_04 wrt each class label')
plt.legend()

In [None]:
# Distribution of sensor_31, one curve per machine_status class.
g = data.groupby('machine_status')
fig = plt.figure(num=None, figsize=(16, 6), dpi=80, facecolor='w', edgecolor='k')
for status in ('BROKEN', 'NORMAL', 'RECOVERING'):
    sns.distplot(g.get_group(status)['sensor_31'].values, label=status)
plt.title('PDF of sensor_31 wrt each class label')
plt.legend()

In [None]:

# Numeric encoding of the status for plotting: BROKEN -> 1, NORMAL -> 0, RECOVERING -> -1.
# .copy() + .map() replaces the chained `df_temp[col][mask] = value` assignments, which
# wrote into a slice of `data` (SettingWithCopyWarning) and do not write through at all
# when pandas copy-on-write is active.
# NOTE(review): any status other than these three would become NaN - the notebook
# reports exactly three states above.
df_temp = data[['timestamp', 'machine_status']].copy()
df_temp['machine_status'] = df_temp['machine_status'].map(
    {'BROKEN': 1, 'NORMAL': 0, 'RECOVERING': -1}
)


fig = plt.figure(num=None, figsize=(18, 6), dpi=80, facecolor='w', edgecolor='k')
plt.plot(df_temp['machine_status'])
plt.xlabel('time in minutes')
plt.ylabel('class label')
plt.title('machine status plot')

In [None]:
# For every BROKEN row (encoded 1), count the consecutive RECOVERING rows (encoded -1)
# that follow it and report that count divided by 60 as hours.
# NOTE(review): the "/60 -> hours" conversion assumes one row per minute - confirm.
status_series = df_temp['machine_status']
f = 1
for i in df_temp[status_series == 1].index:
    j = i + 1
    temp = 0
    # `j in status_series.index` stops the scan at the end of the frame; without it the
    # lookup raises KeyError when the data ends while the pump is still RECOVERING.
    while j in status_series.index and status_series[j] == -1:
        temp = temp + 1
        j = j + 1
    print('for failure {0} at {2}, recovering time is {1} hours'.format(f,temp/60,df_temp['timestamp'][i]))
    f = f + 1

<h3><font color=Purple><b><u>Verify Data Quality</u></b></font></h3>

In [None]:
# Percentage of missing values per column, for every column between the first and the last.
# The share is multiplied by 100 so the numbers match the "percent" wording of the
# title, axis label and printout (previously a 0..1 fraction was shown).
null = data.iloc[:, 1:-1].isna().mean() * 100
# One x position per analysed column, so x and y always have the same length.
x = np.arange(len(null))
y = np.array(null)
fig = plt.figure(num=None, figsize=(16, 6))
ax = fig.add_subplot(1, 1, 1)
ax.set_xticks(x)
plt.plot(x, y)
plt.title('percentage of null value contained by each sensor')
plt.xlabel('sensors')
plt.ylabel('percent null values')
plt.grid()
plt.show()
print('NA values per sensor in percent:\n',null)

<!-- !pip install pandas-profiling
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension --sys-prefix -->

 <h3><font color=Purple><b><u>Clean the data</u></b></font></h3>

<h4><font color=red><b>Drop unnecessary columns</b></font></h4>

In [None]:
data.columns

In [None]:
#Check the null value in timestamp column

data_t=data["timestamp"].isna().sum()
data_t

In [9]:
from sklearn import preprocessing

# Collapse the three string states into a binary target: NORMAL -> 0, BROKEN and RECOVERING -> 1.
# NOTE(review): re-running this cell maps the already-numeric values to NaN, because 0/1 are not keys of the dict.
data['machine_status'] = data['machine_status'].map({'NORMAL': 0, 'BROKEN': 1,'RECOVERING': 1})


# NOTE(review): the encoder is fitted, but the result of transform() is not assigned to
# anything, so these three lines leave the machine_status column unchanged.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(data['machine_status'])
label_encoder.transform(data['machine_status'])
data['machine_status'].unique()

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# for col in data.columns[1:-1]:
#     scaler = MinMaxScaler()
#     feat=scaler.fit_transform(data[col].values.reshape(1,-1))


In [None]:
# !pip install heatmapz

In [None]:
# data.corr()

In [None]:
# import seaborn as sns 

# plt.figure(figsize=(30,30))
# sns.heatmap(data.corr(), annot=True, fmt='.2f')
# plt.show()

In [None]:
!pip install heatmapz

In [None]:
#Finding correlation between sensors

from heatmap import heatmap, corrplot
plt.figure(figsize=(30,30))
# Correlate numeric columns only: the datetime `timestamp` column is not a valid input
# to corr() on recent pandas, and select_dtypes works on old and new versions alike.
corrplot(data.select_dtypes(include='number').corr(), size_scale=300);

In [None]:
# data.drop(['Unnamed: 0','sensor_15','sensor_01','sensor_03','sensor_14','sensor_16','sensor_17','sensor_18','sensor_19','sensor_20','sensor_21',
#            'sensor_22','sensor_23','sensor_24','sensor_25','sensor_26','sensor_27','sensor_28','sensor_29','sensor_30',
#            'sensor_31','sensor_33','sensor_34','sensor_37','sensor_36','sensor_48'],
#           inplace=True,axis=1) #droping unwanted feature

In [11]:
# Sensor columns removed before modelling ('Unnamed: 0' is deliberately not in this list).
unwanted_sensors = [
    'sensor_15', 'sensor_01', 'sensor_03', 'sensor_14', 'sensor_16',
    'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21',
    'sensor_22', 'sensor_23', 'sensor_24', 'sensor_25', 'sensor_26',
    'sensor_27', 'sensor_28', 'sensor_29', 'sensor_30', 'sensor_31',
    'sensor_33', 'sensor_34', 'sensor_37', 'sensor_36', 'sensor_48',
]
data.drop(columns=unwanted_sensors, inplace=True)

In [None]:
# # # delete all rows with machine_status 'Recovering' 
# indexNames = data[data['machine_status'] == -1].index
# data.drop(indexNames , inplace
#           =True)

In [9]:
# 'Unnamed: 0' is already removed by an earlier cell; errors='ignore' keeps this cell
# from raising KeyError when the notebook is run top to bottom (or the cell is re-run).
data.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

In [12]:
print(data.shape)
print(data.columns) 

In [None]:
from sklearn.impute import KNNImputer

# Fill missing sensor readings with the value of the single nearest row.
imputer = KNNImputer(n_neighbors=1)

# Impute over the sensor columns only. The previous positional slice iloc[:, 1:29] also
# covered the last column (machine_status), so the target took part in the neighbour search.
sensor_cols = [c for c in data.columns if c not in ('timestamp', 'machine_status')]
data[sensor_cols] = imputer.fit_transform(data[sensor_cols])

In [None]:
#data.to_csv("Imputed_data_wth_Recovr.csv")

In [None]:
# data_imputed=data.copy()
# data_imputed.head(2)

In [10]:
# Load an imputed data set from disk instead of using the in-memory `data` frame.
# NOTE(review): presumably the file written by the commented-out to_csv cell above - confirm.
data_imputed=pd.read_csv("../input/imputed-data-with-recover/Imputed_data_wth_Recovr.csv",parse_dates=["timestamp"])
data_imputed.head(2)

In [11]:
# Drop the "Unnamed: 0" column that came in with the CSV (in place; raises KeyError if re-run).
data_imputed.drop(["Unnamed: 0"],axis=1,inplace=True)
data_imputed.head(2)

In [15]:
data_imputed.shape

In [16]:
data_imputed["machine_status"].value_counts()

In [12]:
# Cast the target to integer dtype.
data_imputed["machine_status"]=data_imputed["machine_status"].astype(int)

In [18]:
data_imputed["machine_status"].value_counts()

In [13]:
# Aggregate to one row per (calendar day, machine_status) by summing the numeric columns.
# numeric_only=True leaves the datetime `timestamp` column out of the sum: recent pandas
# raises TypeError when asked to sum datetimes, and the column has to be absent so that
# reset_index() can insert the grouping key under the name "timestamp".
# NOTE(review): sums depend on how many rows fall into each (day, status) group - confirm
# that a sum rather than a mean is intended.
daily_data = (
    data_imputed
    .groupby([data_imputed.timestamp.dt.date, data_imputed.machine_status])
    .sum(numeric_only=True)
    .reset_index()
)
daily_data.tail(5)

In [20]:
daily_data.columns

In [21]:
daily_data.shape,data.shape

In [22]:
data.columns

In [14]:
# Modelling table: date, the retained sensor columns, and the target as the last column.
# .copy() makes data_new an independent frame, so the positional assignment done when
# scaling modifies data_new itself rather than a slice of daily_data (SettingWithCopyWarning).
data_new = daily_data[['timestamp', 'sensor_00', 'sensor_02', 'sensor_04', 'sensor_05',
       'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09', 'sensor_10',
       'sensor_11', 'sensor_12', 'sensor_13', 'sensor_32', 'sensor_35',
       'sensor_38', 'sensor_39', 'sensor_40', 'sensor_41', 'sensor_42',
       'sensor_43', 'sensor_44', 'sensor_45', 'sensor_46', 'sensor_47',
       'sensor_49', 'sensor_50', 'sensor_51', 'machine_status']].copy()
data_new.columns

In [25]:
data_new["machine_status"].value_counts()

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardise every column between the first (timestamp) and the last (machine_status).
# NOTE(review): the scaler is fitted on all rows before the train/test split below, so
# test-set statistics influence the training features - confirm this is acceptable.
std_scale=StandardScaler()
data_new.iloc[:,1:-1]=std_scale.fit_transform(data_new.iloc[:,1:-1])

In [16]:
x=data_new.iloc[:,1:-1]
y=data_new.iloc[:,-1]
x.head(2)

In [17]:
from sklearn.model_selection import train_test_split
# Chronological 50/50 split: shuffle=False keeps the row order, so the first half is
# used for training and the second half for testing.
# NOTE(review): random_state only controls shuffling, so it has no effect with shuffle=False.
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.5,random_state=43,shuffle=False)

In [18]:
np.count_nonzero(ytest==1, axis=None),np.count_nonzero(ytest==0, axis=None)

In [None]:
# from imblearn.over_sampling import SMOTE
# from collections import Counter

# sm=SMOTE(sampling_strategy=0.75,k_neighbors=2)
# xtrain_imb,ytrain_imb=sm.fit_resample(xtrain,ytrain)
# print("The number of classes before fit {}".format(Counter(ytrain)))
# print("The number of classes after fit {}".format(Counter(ytrain_imb)))

In [30]:
# from sklearn.ensemble import RandomForestClassifier
# model_rf = RandomForestClassifier(
#              n_estimators=100,
#              max_depth=350,
#              min_samples_split=10, 
#              n_jobs=-1, 
#              random_state=42,   
#              )
# model_rf.fit(xtrain,ytrain)
# ypred_rf=model_rf.predict(xtest)

In [22]:
from sklearn import svm

# RBF-kernel support vector classifier with class_weight='balanced'.
svm_params = dict(
    C=1.0,
    kernel='rbf',
    degree=3,
    gamma='scale',
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=0.001,
    cache_size=200,
    class_weight='balanced',
)
model_svm = svm.SVC(**svm_params)
model_svm.fit(xtrain, ytrain)
# Predictions for the held-out half
ypred_svm = model_svm.predict(xtest)

In [23]:
np.count_nonzero(ytest==1, axis=None),np.count_nonzero(ytest==0, axis=None)

In [25]:
np.count_nonzero(ypred_svm==1, axis=None),np.count_nonzero(ypred_svm==0, axis=None)

In [27]:
from sklearn.metrics import classification_report
print(classification_report(ytest,ypred_svm))

In [None]:
import seaborn as sns 

# Confusion matrix of the SVM predictions. `ypred_rf` is only produced by the
# commented-out random-forest cell, so referencing it raised NameError; the SVM is
# the model that is actually fitted in this notebook.
mat = confusion_matrix(ytest, ypred_svm)

# Transposed so that true labels run along x and predicted labels along y.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)

plt.xlabel('true label')

plt.ylabel('predicted label');

#plt.savefig("visualizations/naive_bayes_confusion_matrix.png")

In [None]:
 
# The package was renamed from `fbprophet` to `prophet` (release 1.0); try the current
# name first and fall back to the old one for older environments.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet
# from fbprophet.plot import add_changepoints_to_plot

In [None]:
# Two-digit ids of the sensors to forecast, in the order used when the forecasts are
# concatenated afterwards.
FORECAST_SENSOR_IDS = [
    '00', '02', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13',
    '32', '35', '38', '39',
    '40', '41', '42', '43', '44', '45', '46', '47',
    '49', '50', '51',
]


def forecast_sensor(history, sensor_col, periods=360, freq="H"):
    """Fit a default Prophet model to one sensor column and forecast it.

    Parameters
    ----------
    history : DataFrame with a "timestamp" column and the column `sensor_col`.
    sensor_col : name of the column to forecast.
    periods, freq : passed to Prophet.make_future_dataframe (360 steps of "H" by default).

    Returns
    -------
    (training_frame, model, future, forecast): the ds/y frame given to Prophet, the
    fitted model, the future frame, and the output of model.predict(future).
    """
    training_frame = pd.DataFrame()
    training_frame['ds'] = history["timestamp"]
    training_frame['y'] = history[sensor_col]
    model = Prophet()
    model.fit(training_frame)
    future_frame = model.make_future_dataframe(periods=periods, freq=freq)
    return training_frame, model, future_frame, model.predict(future_frame)


# NOTE(review): `data1` is not defined in any cell visible in this part of the notebook;
# it has to exist in the kernel before this cell runs.
for _sensor_id in FORECAST_SENSOR_IDS:
    _training_frame, model_p, future, _forecast = forecast_sensor(data1, "sensor_" + _sensor_id)
    # Keep the per-sensor variable names of the original copy-pasted stanzas
    # (sensor_XX, forecast_XX, sensor_XX_fc), which later cells refer to by name.
    globals()["sensor_" + _sensor_id] = _training_frame
    globals()["forecast_" + _sensor_id] = _forecast
    globals()["sensor_" + _sensor_id + "_fc"] = _forecast[['yhat']]



In [None]:
# Side-by-side table of the forecast dates (`ds`) and the 27 per-sensor `yhat` forecasts.
sensor_testdata=pd.concat([forecast_00[['ds']],sensor_00_fc[['yhat']],sensor_02_fc[['yhat']],sensor_04_fc[['yhat']],
                           sensor_05_fc[['yhat']],sensor_06_fc[['yhat']],sensor_07_fc[['yhat']],sensor_08_fc[['yhat']],sensor_09_fc[['yhat']],
                          sensor_10_fc[['yhat']],sensor_11_fc[['yhat']],sensor_12_fc[['yhat']],sensor_13_fc[['yhat']]
                          ,sensor_32_fc[['yhat']],sensor_35_fc[['yhat']],sensor_38_fc[['yhat']],sensor_39_fc[['yhat']],
                          sensor_40_fc[['yhat']],sensor_41_fc[['yhat']],sensor_42_fc[['yhat']],sensor_43_fc[['yhat']]
                          ,sensor_44_fc[['yhat']],sensor_45_fc[['yhat']],sensor_46_fc[['yhat']],sensor_47_fc[['yhat']],
                          sensor_49_fc[['yhat']],sensor_50_fc[['yhat']],sensor_51_fc[['yhat']]]
                          ,axis=1)

# The table has 28 columns (ds + 27 forecasts) while x.columns holds only the 27 sensor
# names, so assigning x.columns directly raises a length-mismatch ValueError. The date
# column is named "timestamp" to match the rest of the notebook.
sensor_testdata.columns = ['timestamp'] + list(x.columns)


In [None]:
# train_data,test_data = data[0:141100], data[141100:]  # Broken[Apr,May,Jun] and Broken[Jul]
# train_data.shape,test_data.shape

In [28]:
# Load a saved sensor table from disk; this rebinds sensor_testdata, replacing any
# value built in earlier cells.
sensor_testdata=pd.read_csv("../input/sensor-month-data/Sensor_6Monthdata.csv",parse_dates=["timestamp"])
sensor_testdata.tail(2)

In [30]:
sensor_testdata.drop(['Unnamed: 0'],axis=1,inplace=True)

In [31]:
from sklearn.preprocessing import StandardScaler

# Standardise every column after the first one of sensor_testdata.
# NOTE(review): this fits a NEW scaler on the data to be predicted and rebinds std_scale,
# so these features are standardised with a different mean/variance than the features
# model_svm was trained on. Confirm whether the previously fitted scaler's transform()
# should be applied here instead.
std_scale=StandardScaler()
sensor_testdata.iloc[:,1:]=std_scale.fit_transform(sensor_testdata.iloc[:,1:])

In [32]:
sensor_testdata.columns

In [33]:
x_new=sensor_testdata.iloc[:,1:]
#y_new=scaled_sensordata.iloc[:,-1]
x_new.columns,x_new.shape

In [34]:
xtest.shape,x_new.shape

In [37]:
x_new.head(2)

In [38]:
x.head(2)

In [52]:
xtest.shape

In [35]:
ypred_final=model_svm.predict(x_new)

In [36]:
ypred_final.shape

In [37]:
ypred_final_df = pd.DataFrame (ypred_final, columns= ["machine_status_new"])

In [38]:
final_data = pd.concat([sensor_testdata, ypred_final_df], axis=1)
final_data.head(2)

In [57]:
# Time frame covered by the prediction table: first timestamp, last timestamp, and span.
# The span uses final_data on both ends; it previously subtracted the minimum of the
# unrelated `data` frame from the maximum of final_data.
print(final_data["timestamp"].min())
print(final_data["timestamp"].max())
print(final_data["timestamp"].max()-final_data["timestamp"].min())

In [59]:
final_data["timestamp"].tail(2)

In [39]:
final_data["machine_status_new"].value_counts()

In [40]:
filtered_df = final_data.loc[(final_data['timestamp'] >= '2018-09-01')]
filtered_df
                    

In [41]:
# final_data[final_data["machine_status_new"]==1 and final_data["timestamp"]>='2018-09-01']
final_data.loc[(final_data["machine_status_new"]==1) & (final_data['timestamp'] >= '2018-09-01')]

In [47]:
# Mean of every (standardised) sensor column per predicted class.
g = final_data.groupby('machine_status_new')  # one group per predicted label
# numeric_only=True keeps the datetime `timestamp` column out of the means.
d_m = g.mean(numeric_only=True)
# One x position per column of d_m. The previous hard-coded np.arange(0, 52) with
# [1:15] / [16:] slices did not match the number of columns in final_data and made
# plt.plot fail with mismatched x/y lengths.
x = np.arange(d_m.shape[1])
fig = plt.figure(num=None, figsize=(16, 6))
ax = fig.add_subplot(1, 1, 1)
ax.set_xticks(x)
ax.set_xticklabels(d_m.columns, rotation=90)
# `color=` replaces the invalid `C=` keyword. A class is skipped when the model never
# predicted it, instead of raising KeyError on d_m.loc[...].
for status, label, colour in ((1, 'BROKEN', 'b'), (0, 'NORMAL', 'g')):
    if status in d_m.index:
        plt.plot(x, d_m.loc[status].values, label=label, color=colour)
# plt.plot(x[1:15],d_m.loc['RECOVERING'].values[1:15],label='RECOVERING',C='r')
# plt.plot(x[16:],d_m.loc['RECOVERING'].values[16:],C='r')
plt.legend()
plt.xlabel('sensor number(sensor_0,sensor_1...)')
plt.ylabel('mean of sensor')
plt.title('mean of sensors among class labels')
plt.grid()
plt.show()

<h4><font color=Red><b>Downsampling</b></font></h4>