In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,confusion_matrix
from sklearn.preprocessing import StandardScaler
import warnings
import joblib
import sys
sys.path.append("../")
warnings.filterwarnings("ignore", category=UserWarning, module='_distutils_hack')

### Load Data From CSV

In [2]:
# Load your data
fs1 = pd.read_csv('../data/raw/FS1.txt', delimiter='\t', header=None)
ps2 = pd.read_csv('../data/raw/PS2.txt', delimiter='\t', header=None)

profile_column_names = [
    'Cooler condition (%)',
    'Valve condition (%)',
    'Internal pump leakage',
    'Hydraulic accumulator (bar)',
    'Stable flag'
]

profile = pd.read_csv('../data/raw/profile.txt', delimiter='\t', header=None, names=profile_column_names)

### Data preparation

Formatting target data as True/False (1/0) signifying optimal valve condition (100% being 1/True)

In [3]:
profile["Valve condition Status"] = profile["Valve condition (%)"].apply(lambda x: 1 if x == 100 else 0)
profile = profile[["Valve condition Status"]]

Instead of using every captured measure in the 60 seconds cycle, and to reduce the dimensionality of the input Data, I created a function to calculate the mean of multiple measures in a specific timeframe (**In my case, i am using the mean of measures in 10seconds timeframes**).
* FS1 : the FS1 measures are done on 10hz frequency, meaning 10 measures per second. **in my code, i take the mean of 100 measures in a 10 seconds timeframe, presenting my with only 6 features as input**

* PS2 : the PS2 measures are done on 100hz frequency, meaning 100 measures per second**in my code, i take the mean of 1000 measures in a 10 seconds timeframe, presenting my with only 6 features as input**

I end up with 12 features coming from FS1 and PS2

In [4]:
def features_mean_measures_preparation(df, group_size, dataset_prefix):
    df = df.T
    df[f"cycle"] = (df.index // group_size)
    df = df.groupby('cycle').mean().T
    df.columns = [f"{dataset_prefix}_mean_measure_at_{i + 1}0th_second" for i in range(len(df.columns))]
    return df

engineered_fs1 = features_mean_measures_preparation(fs1, 100, "fs1")
engineered_ps2 = features_mean_measures_preparation(ps2, 1000, "ps2")

Concatenate data based on index

In [5]:
merged_df = pd.concat([engineered_fs1, engineered_ps2, profile], axis=1)

In [17]:
merged_df.head(2).to_dict()

{'fs1_mean_measure_at_10th_second': {0: 0.98779, 1: 0.9428300000000001},
 'fs1_mean_measure_at_20th_second': {0: 7.83661, 1: 7.84971},
 'fs1_mean_measure_at_30th_second': {0: 7.6888499999999995, 1: 7.70148},
 'fs1_mean_measure_at_40th_second': {0: 7.93828, 1: 7.96244},
 'fs1_mean_measure_at_50th_second': {0: 7.937, 1: 7.9485},
 'fs1_mean_measure_at_60th_second': {0: 7.87036, 1: 7.8869299999999996},
 'ps2_mean_measure_at_10th_second': {0: 9.512161, 1: 9.566111999999999},
 'ps2_mean_measure_at_20th_second': {0: 121.12585, 1: 121.08698},
 'ps2_mean_measure_at_30th_second': {0: 131.31226, 1: 131.12716},
 'ps2_mean_measure_at_40th_second': {0: 139.6496, 1: 139.48404000000002},
 'ps2_mean_measure_at_50th_second': {0: 129.96391, 1: 129.83252000000002},
 'ps2_mean_measure_at_60th_second': {0: 125.2377, 1: 125.03253},
 'Valve condition Status': {0: 1, 1: 1}}

### Storing dataset as feather file for serving the model
-- to predict output based on index

In [6]:
merged_df.to_feather('../data/merged_dataset/dataset.feather')

### Data Splitting
Using only the first 2000 rows for training and testing the model, while preserving remaining data for final evaluation

In [7]:
remaining_data = merged_df.iloc[2000:]
merged_df = merged_df.iloc[:2000]

### Data Modeling

In [8]:
# Split data into features (X) and target (y)
X = merged_df.drop(columns=['Valve condition Status'])
y = merged_df['Valve condition Status']

# Data splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Data Scaling and saving scaler for model serving

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, '../saved_models/scaler.pkl')

['../saved_models/scaler.pkl']

Defining evaluation metrics

In [12]:
# Define a function to calculate metrics and return them as a dictionary
def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    return {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1, 'Confusion Matrix': cm}


Training, Testing, and evaluating results

In [13]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'Gaussian Naive Bayes': GaussianNB()
}

# Train and evaluate each model
results = []

for model_name, model in models.items():
    
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    # Calculate metrics
    metrics = calculate_metrics(y_test, y_pred)
    # Append results
    results.append({'Model': model_name, **metrics})

# Convert results to DataFrame
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
0,Logistic Regression,1.0,1.0,1.0,1.0,"[[197, 0], [0, 203]]"
1,Random Forest,0.995,0.995074,0.995074,0.995074,"[[196, 1], [1, 202]]"
2,Gaussian Naive Bayes,0.8675,0.820513,0.945813,0.878719,"[[155, 42], [11, 192]]"


Cross validation for better evaluation:

In [14]:
# Optional: Cross-validation for more robust evaluation
cv_results = []

for model_name, model in models.items():
    
    # Calculate metrics for cross-validation
    cv_metrics = {
        'Mean F1 Score': cross_val_score(model, X, y, cv=10, scoring='f1').mean(),
        'Mean Accuracy': cross_val_score(model, X, y, cv=10, scoring='accuracy').mean(),
        'Mean Precision': cross_val_score(model, X, y, cv=10, scoring='precision').mean(),
        'Mean Recall': cross_val_score(model, X, y, cv=10, scoring='recall').mean()
    }
    
    # Append cross-validation results
    cv_results.append({'Model': model_name, **cv_metrics})

# Convert cross-validation results to DataFrame
cv_results_df = pd.DataFrame(cv_results)

# Print cross-validation results
print("\nCross-validation results (mean scores):")
cv_results_df


Cross-validation results (mean scores):


Unnamed: 0,Model,Mean F1 Score,Mean Accuracy,Mean Precision,Mean Recall
0,Logistic Regression,0.999522,0.9995,1.0,0.999048
1,Random Forest,0.959698,0.966,0.997187,0.940952
2,Gaussian Naive Bayes,0.885988,0.876,0.866103,0.93619


Using the remaining data for final evaluation:

In [15]:
# Separate features (X_remaining) and target (y_remaining)
X_remaining = remaining_data.drop(columns=['Valve condition Status'])
y_remaining = remaining_data['Valve condition Status']

# Scale the remaining data using the previously fitted scaler
X_remaining_scaled = scaler.transform(X_remaining)

# Initialize an empty list to store results for remaining data
remaining_results = []

# Evaluate each model on remaining data
for model_name, model in models.items():
    # Predict using the model on scaled remaining data
    y_pred_remaining = model.predict(X_remaining_scaled)
    
    # Calculate metrics for remaining data
    metrics_remaining = calculate_metrics(y_remaining, y_pred_remaining)
    
    # Append results
    remaining_results.append({'Model': model_name, **metrics_remaining})

# Convert remaining results to DataFrame
remaining_results_df = pd.DataFrame(remaining_results)

# Print results using remaining data
print("Results using remaining data (beyond first 2000 rows):")
remaining_results_df

Results using remaining data (beyond first 2000 rows):


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
0,Logistic Regression,1.0,1.0,1.0,1.0,"[[132, 0], [0, 73]]"
1,Random Forest,0.97561,0.935897,1.0,0.966887,"[[127, 5], [0, 73]]"
2,Gaussian Naive Bayes,0.756098,0.593496,1.0,0.744898,"[[82, 50], [0, 73]]"


### Saving model for serving

In [17]:
joblib.dump(models["Random Forest"], '../saved_models/rf_classifier.pkl')
joblib.dump(models["Logistic Regression"], '../saved_models/lr_classifier.pkl')

['../saved_models/lr_classifier.pkl']