In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle

In [8]:
# Load the previously cleaned dataset from the working directory.
df = pd.read_csv("cleaned_data.csv")

In [9]:
# Target variable: the 'Failure_status' column.
y = df['Failure_status']
# Features: every remaining column except the target and the 'date' column.
X = df.drop(columns=['Failure_status', 'date'])

In [10]:
# Preprocessing steps, applied in the order listed.
preprocessing_steps = [
    ('imputation', SimpleImputer(strategy='median')),  # fill missing values with the per-column median
    ('scaling1', MinMaxScaler()),  # min-max scale each feature (default settings)
    ('scaling2', StandardScaler()),  # then standardize each feature (default settings)
]
# NOTE(review): standardizing a min-max-scaled feature is mathematically the same
# as standardizing the raw feature, so 'scaling1' looks redundant — confirm
# before removing it, since it changes the pickled pipeline's step names.
pipeline = Pipeline(steps=preprocessing_steps)

In [11]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Learn the imputation and scaling parameters from the training features only.
# The pipeline has no supervised step, so no target is passed.
pipeline.fit(X_train, y=None)

In [13]:
# Run both splits through the already-fitted pipeline (train first, then test).
X_train_processed, X_test_processed = (
    pipeline.transform(split) for split in (X_train, X_test)
)

In [14]:
# Save processed data.
# pipeline.transform returns plain arrays, so rebuild each one as a DataFrame
# that carries the original feature names. A bare pd.DataFrame(array) would
# label the columns 0..n-1, and the CSV header (plus anything derived from
# processed_data.columns) would lose the real names.
# NOTE(review): the imputer and scalers keep one output column per input column,
# except that SimpleImputer by default drops features that are entirely missing
# in the training data. In that case the column count no longer matches and
# pandas raises a ValueError here instead of silently mislabelling columns.
train_frame = pd.DataFrame(X_train_processed, columns=X_train.columns)
test_frame = pd.DataFrame(X_test_processed, columns=X_train.columns)

# Training rows first, then test rows; the index is not written to the file.
processed_data = pd.concat([train_frame, test_frame])
processed_data.to_csv('processed_data.csv', index=False)

In [15]:
# Persist the fitted pipeline so the same preprocessing can be reloaded at deployment.
with open('pipeline.pkl', 'wb') as pipeline_file:
    pipeline_file.write(pickle.dumps(pipeline))

In [16]:
# Save the feature names used during the pipeline's fit operation.
# Read them from X_train, the frame that was actually passed to pipeline.fit,
# so the list holds the fitted input column names (in fit order) regardless of
# how processed_data was assembled.
feature_names = X_train.columns.tolist()
with open('feature_names.pkl', 'wb') as file:
    pickle.dump(feature_names, file)