In [4]:

# Importing required libraries 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import r2_score
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, accuracy_score, r2_score, roc_auc_score, precision_score, recall_score, f1_score
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas / seaborn warnings (e.g. chained-assignment or
# deprecation notices) that could point at real problems in later cells.
warnings.filterwarnings('ignore')

In [None]:
# Load the FD001 training file. The file carries no header row: without header=None
# pandas promotes the first engine record to column labels (which is where labels such
# as 'Unnamed: 26' came from) and that record is silently lost from the data.
data_train = pd.read_csv('CMaps/train_FD001.txt', sep=" ", header=None)

# Initial dropping: sep=" " yields trailing columns that are entirely null
# (presumably from trailing spaces at the end of each line); remove them.
data_train = data_train.dropna(axis=1, how='all')

# 2 index columns + 3 operational settings + 21 sensors are expected at this point.
assert data_train.shape[1] == 26, f"unexpected column count: {data_train.shape[1]}"

# Independent snapshot of the loaded frame. A plain `train_copy = data_train` only
# creates a second name for the same object, so later in-place edits would leak into it.
train_copy = data_train.copy()

In [None]:
# Descriptive labels for the 26 columns: 2 identifiers, 3 operational settings and
# 21 sensor channels. The sensor strings are used verbatim as column keys in later
# cells, so their exact spelling (including odd spacing/parentheses) must not change.
index_names = ['engine', 'cycle']
setting_names = [f'setting_{number}' for number in (1, 2, 3)]
sensor_names = [
    "(Fan inlet temperature) (◦R)",
    "(LPC outlet temperature) (◦R)",
    "(HPC outlet temperature) (◦R)",
    "(LPT outlet temperature) (◦R)",
    "(Fan inlet Pressure) (psia)",
    "(bypass-duct pressure) (psia)",
    "(HPC outlet pressure) (psia)",
    "(Physical fan speed) (rpm)",
    "(Physical core speed) (rpm)",
    "(Engine pressure ratio(P50/P2)",
    "(HPC outlet Static pressure) (psia)",
    "(Ratio of fuel flow to Ps30) (pps/psia)",
    "(Corrected fan speed) (rpm)",
    "(Corrected core speed) (rpm)",
    "(Bypass Ratio) ",
    "(Burner fuel-air ratio)",
    "(Bleed Enthalpy)",
    "(Required fan speed)",
    "(Required fan conversion speed)",
    "(High-pressure turbines Cool air flow)",
    "(Low-pressure turbines Cool air flow)",
]
# Final column order: identifiers, then settings, then sensors.
col_names = [*index_names, *setting_names, *sensor_names]

In [None]:
# Replace the labels produced by read_csv with the descriptive names in col_names.
# pandas raises if len(col_names) does not match the number of columns.
data_train.columns = col_names
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
data_train.describe()

In [None]:
# A bare expression is only rendered when it is the last statement of a cell, so the
# shape has to be printed explicitly to be visible.
# Expected: 26 columns (2 identifiers + 3 settings + 21 sensors).
print(data_train.shape)

data_train

In [None]:
# Column dtypes.
data_train.dtypes
# No categorical dtypes, only numeric ones (4 integer columns, the remaining are float).

In [None]:
# Column dtypes, non-null counts and memory usage in one report.
data_train.info()

In [None]:
# Summary statistics for every numeric column.
data_train.describe()

In [None]:
# Number of missing values per column.
data_train.isnull().sum()
# No nulls remain, because the all-null columns were dropped in the load cell.

In [None]:
# First ten rows.
data_train.head(10)

In [None]:
# Number of distinct values per column.
data_train.nunique()

In [None]:
# Draw one box plot per numeric column, each in its own figure.
numeric_columns = data_train.select_dtypes(include=np.number).columns
for column_name in numeric_columns:
    sns.boxplot(data_train[column_name])
    plt.show()


In [None]:
# Observations: outliers are present, and some columns contain only a single unique value.

In [None]:
# Check pairwise correlation between all columns.
plt.figure(figsize=(15,15))
sns.set_style("whitegrid", {"axes.facecolor": ".0"})
# Compute the correlation matrix once and reuse it for the plot
# (it used to be computed twice; the unused `plot_kws` dict was removed).
df_cluster2 = data_train.corr()
sns.heatmap(df_cluster2,
            cmap='RdYlBu',
            annot=True,
            linecolor='lightgrey').set_facecolor('white')

In [None]:
# Strong correlation between several variables is visible; we will address it below using a correlation threshold of 0.9.

In [None]:
# Draw one histogram per numeric column, each in its own figure.
numeric_columns = data_train.select_dtypes(include=np.number).columns
for column_name in numeric_columns:
    sns.histplot(data_train[column_name])
    plt.show()

In [None]:
# (rows, columns) of the training frame at this stage.
print(data_train.shape)

In [None]:
# Box plot of every numeric column (same view as the earlier box-plot cell).
numeric_columns = data_train.select_dtypes(include=np.number).columns
for column_name in numeric_columns:
    sns.boxplot(data_train[column_name])
    plt.show()

In [None]:
# Distinct-value count per column; a count of 1 marks a constant column.
data_train.nunique()

In [None]:
# Collect the numeric columns that hold exactly one distinct value; they carry no
# information and are dropped in a later cell.
numeric_frame = data_train.select_dtypes(include=np.number)
unwanted = [name for name in numeric_frame if numeric_frame[name].nunique() == 1]
print(unwanted)

In [None]:
# Constant columns found by the previous cell.
unwanted

In [None]:
# Drop the constant columns in place.
# NOTE(review): in-place drop is not idempotent; re-running this cell alone raises
# KeyError because the columns are already gone.
data_train.drop(columns=unwanted, inplace=True)

# Columns with a single unique value are removed here.

In [None]:
# Shape after removing the constant columns.
data_train.shape

In [None]:
# Re-check pairwise correlation now that the constant columns are gone.
plt.figure(figsize=(15,15))
sns.set_style("whitegrid", {"axes.facecolor": ".0"})
# Compute the correlation matrix once and reuse it for the plot
# (it used to be computed twice; the unused `plot_kws` dict was removed).
df_cluster2 = data_train.corr()
sns.heatmap(df_cluster2,
            cmap='RdYlBu',
            annot=True,
            linecolor='lightgrey').set_facecolor('white')

In [None]:
# Show only the strongly correlated pairs: cells with |r| below the threshold are hidden.
threshold = 0.90
plt.figure(figsize=(10,10))

sns.set_style("whitegrid", {"axes.facecolor": ".0"})
df_cluster2 = data_train.corr()
# seaborn hides every cell whose mask value is True, i.e. everything that is NOT
# |r| >= threshold (NaN correlations compare False and are therefore hidden too).
mask = ~(df_cluster2.abs() >= threshold)
sns.heatmap(df_cluster2,
            cmap='RdYlBu',
            annot=True,
            mask=mask,
            linewidths=0.2,
            linecolor='lightgrey').set_facecolor('white')

In [None]:
# Drop one feature of each highly correlated pair and keep the other.
# The masked correlation heat map above uses a threshold of 0.90.

# The correlation plot shows that '(Corrected core speed) (rpm)' and
# '(Physical core speed) (rpm)' are highly correlated, so one of them,
# '(Corrected core speed) (rpm)', is dropped.

data_train.drop(['(Corrected core speed) (rpm)'],axis=1,inplace=True)

In [None]:
# Remaining column names (list() over a DataFrame yields its column labels).
list(data_train)

In [None]:
# Same listing again (duplicate of the previous cell).
list(data_train)

In [None]:
# Shape after dropping the correlated column.
data_train.shape

In [None]:
# Snapshot of the current column names.
# NOTE(review): `features` is not used by any later cell in this notebook and becomes
# stale once more columns are dropped below.
features = list(data_train.columns)
features

In [None]:
# First five rows.
data_train.head()

## Outlier Detection and Treatment

In [None]:
# Display the frame (pandas truncates the rendering of long frames).
data_train

In [None]:
# Box plot of every numeric column, drawn with the seaborn "darkgrid" theme.
numeric_columns = data_train.select_dtypes(include=np.number).columns
for column_name in numeric_columns:
    sns.set(style="darkgrid")
    sns.boxplot(data_train[column_name], palette="Blues")

    plt.show()

In [None]:
# Current shape.
data_train.shape

In [None]:
# Distinct-value count per column.
data_train.nunique()

In [None]:
# Distinct values observed for the bypass-duct pressure sensor.
data_train['(bypass-duct pressure) (psia)'].unique()

In [None]:
# Percentage of rows whose bypass-duct pressure equals 21.61.
(((data_train['(bypass-duct pressure) (psia)']==21.61).sum())/data_train.shape[0])*100

In [None]:
# Percentage of rows whose bypass-duct pressure equals 21.6.
(((data_train['(bypass-duct pressure) (psia)']==21.6).sum())/data_train.shape[0])*100

In [None]:
# About 98% of the (bypass-duct pressure) readings are 21.61 psia and about 1.9% are 21.6 psia.
# 21.61 is almost equal to 21.6, so the column effectively holds a single unique value
# and is dropped.
data_train.drop(['(bypass-duct pressure) (psia)'],axis=1,inplace=True)

In [None]:
# Shape after dropping the bypass-duct pressure column.
data_train.shape

In [None]:
# Distinct-value count per remaining column.
data_train.nunique()

In [None]:
# Outlier diagnostics for the training data.
# Cut-off on the absolute z-score.
threshold = 3

# Absolute z-score of every value, standardised per column.
z_scores = ((data_train - data_train.mean()) / data_train.std()).abs()

# Boolean frame flagging the values whose absolute z-score exceeds the cut-off.
outliers = z_scores.gt(threshold)

In [None]:
# Replace sensor/setting values whose |z-score| exceeds the threshold with the column mode.
#
# The identifier columns are deliberately excluded: 'cycle' is the time index that the
# later life/RUL computation is built on, so overwriting the cycle numbers of long-lived
# engines with the mode would corrupt the target; 'engine' is only a unit id.
id_columns = ['engine', 'cycle']
treated_columns = [col for col in data_train.columns if col not in id_columns]

z_scores = (data_train[treated_columns] - data_train[treated_columns].mean()) / data_train[treated_columns].std()

threshold = 2.5
for col in treated_columns:
    outlier_mask = z_scores[col].abs() > threshold
    if not outlier_mask.any():
        continue
    # The mode is taken over the non-outlying values only.
    data_train.loc[outlier_mask, col] = data_train.loc[~outlier_mask, col].mode()[0]

In [None]:
# Box plot of every numeric column after the outlier treatment.
numeric_columns = data_train.select_dtypes(include=np.number).columns
for column_name in numeric_columns:
    sns.boxplot(data_train[column_name], palette="Blues")
    plt.show()

In [None]:
# Histogram of every numeric column, followed by that column's skewness.
numeric_columns = data_train.select_dtypes(include=np.number).columns
for column_name in numeric_columns:
    sns.set(style="darkgrid")
    column_values = data_train[column_name]
    sns.histplot(column_values, palette="Blues")

    plt.show()
    print(column_values.skew())

In [None]:
# Shape after the outlier treatment.
data_train.shape

In [None]:
# Same information, printed.
print(data_train.shape)

In [None]:
# Largest cycle number present in the data.
data_train['cycle'].max()

In [None]:
# Maximum life of each engine = the largest cycle number recorded for it in the
# training data. Subtracting the current cycle from this value gives the remaining
# useful life (RUL), i.e. the number of cycles left, at every point of the engine's history.
data_train_RUL = data_train.groupby('engine')['cycle'].max().to_frame('life')
data_train_RUL.head()

In [None]:
# Attach each engine's 'life' to all of its rows (left join on 'engine').
# Note: this rebinds data_train to a new frame instead of modifying it in place.
data_train=data_train.merge(data_train_RUL,how='left',on=['engine'])

In [None]:
# The RUL prediction is only useful near the end of the engine's life, therefore we put
# an upper limit on the RUL. This is a bit sneaky, since it supposes that the test set has
# RULs of less than this value; the closer the cap is to the true value, the more accurate
# the model will be.
RUL_CAP = 125

# RUL = cycles left until the engine's last recorded cycle, capped at RUL_CAP.
# clip() replaces the chained assignment `df['RUL'][mask] = 125`, which writes through a
# temporary object and is not guaranteed to modify the frame (it is a no-op under pandas
# copy-on-write).
data_train['RUL'] = (data_train['life'] - data_train['cycle']).clip(upper=RUL_CAP)
data_train.drop(['life'], axis=1, inplace=True)

data_train.head()

In [None]:
# Missing values per column after adding RUL.
print(data_train.isnull().sum())


In [None]:
# Distinct RUL values.
data_train['RUL'].unique()

In [None]:
# Shape including the new RUL column.
print(data_train.shape)

In [None]:
# Annotated correlation heat map of all columns, now including RUL.
fig, ax = plt.subplots(figsize=(15, 15))
correlation_matrix = data_train.corr()
sns.heatmap(correlation_matrix, annot=True, ax=ax)
plt.show()

In [None]:
# Show only the pairs with |r| >= 0.6. The earlier `threshold = 0.90` assignment was dead
# (immediately overwritten by 0.6) and has been removed together with the unused `plot_kws`.
threshold = 0.6
plt.figure(figsize=(10,10))
sns.set_style("whitegrid", {"axes.facecolor": ".0"})
df_cluster2 = data_train.corr()
# seaborn hides every cell whose mask value is True, i.e. everything that is NOT
# |r| >= threshold (NaN correlations compare False and are therefore hidden too).
mask = ~(df_cluster2.abs() >= threshold)
sns.heatmap(df_cluster2,
            cmap='RdYlBu',
            annot=True,
            mask=mask,
            linewidths=0.2,
            linecolor='lightgrey').set_facecolor('white')

In [None]:
# Drop the engine identifier, two operational settings and the physical core speed.
# ('engine' was listed twice in the original call; each label is now named once.)
data_train.drop(columns=['(Physical core speed) (rpm)', 'engine', 'setting_1', 'setting_2'], inplace=True)

In [None]:
# Remaining column names.
list(data_train)

In [None]:
# Show only the pairs with |r| >= 0.6. The earlier `threshold = 0.90` assignment was dead
# (immediately overwritten by 0.6) and has been removed together with the unused `plot_kws`.
threshold = 0.6
plt.figure(figsize=(10,10))
sns.set_style("whitegrid", {"axes.facecolor": ".0"})
df_cluster2 = data_train.corr()
# seaborn hides every cell whose mask value is True, i.e. everything that is NOT
# |r| >= threshold (NaN correlations compare False and are therefore hidden too).
mask = ~(df_cluster2.abs() >= threshold)
sns.heatmap(df_cluster2,
            cmap='RdYlBu',
            annot=True,
            mask=mask,
            linewidths=0.2,
            linecolor='lightgrey').set_facecolor('white')

In [None]:
# Drop three more sensor columns in place.
# NOTE(review): presumably chosen from the masked correlation heat map above - confirm.
data_train.drop(columns=['(Corrected fan speed) (rpm)','(Physical fan speed) (rpm)','(HPC outlet temperature) (◦R)'], inplace=True)

In [None]:

# Missing values per remaining column.
data_train.isnull().sum()

In [None]:
# Distinct-value count per remaining column.
data_train.nunique()

In [None]:

# Wrap the current frame in a new DataFrame object.
# NOTE(review): new_train_data is only displayed in the next cell; the modelling
# cells below keep reading from data_train.
new_train_data=pd.DataFrame(data_train)

In [None]:
# Display the final modelling frame.
new_train_data

In [None]:
# Features = every column except the target; target = 'RUL' (the last column).
x = data_train.drop(columns=['RUL'])
y = data_train['RUL']

# Split first, then fit the scaler on the training rows only. Fitting MinMaxScaler on the
# full data before splitting lets the test rows influence the min/max used for scaling
# (data leakage) and makes the test metrics optimistic.
# MinMaxScaler and train_test_split are already imported in the notebook's first cell.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

sc = MinMaxScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

print('x_train shape : ',x_train.shape)
print('x_test shape : ',x_test.shape)
print('y_train shape : ',y_train.shape)
print('y_test shape : ',y_test.shape)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import  mean_squared_error,r2_score, mean_absolute_percentage_error
# Fit an ordinary least-squares baseline on the training split.
# The metric/CV helpers imported above are reused by all later model cells.
model = LinearRegression()
model.fit(x_train, y_train)

In [None]:
# make predictions on the testing data
y_pred = model.predict(x_test)

In [None]:
# evaluate the performance of the model on the held-out split
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean squared error: {mse}")
print(f"Root mean squared error: {rmse}")
print(f"R-squared score: {r2}")
print(f"mean_absolute_percentage_error: {mape}")

# 5-fold cross-validation on the training split. With no `scoring` argument the
# estimator's own score() is used, which for a regressor is R^2 - so the value printed
# below is a mean R^2 expressed as a percentage, not a classification accuracy.
training_score = cross_val_score(model, x_train, y_train, cv=5)
print("Algorithm: ", model, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so that the forest (and the scores derived from it) is reproducible
# across runs; 42 matches the seed used for the train/test split.
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

In [None]:
# Predictions must be regenerated for the model fitted in the previous cell; reusing the
# y_pred left over from an earlier model would simply re-print that model's metrics.
y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean squared error: {mse}")
print(f"Root mean squared error: {rmse}")
print(f"R-squared score: {r2}")

# 5-fold cross-validation on the training split; for a regressor the default score is
# R^2, so the percentage printed below is a mean R^2, not a classification accuracy.
training_score = cross_val_score(model, x_train, y_train, cv=5)
print("Algorithm: ", model, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fixed seed so that the fitted tree (and the scores derived from it) is reproducible
# across runs; 42 matches the seed used for the train/test split.
model = DecisionTreeRegressor(random_state=42)
model.fit(x_train, y_train)

In [None]:
# Predictions must be regenerated for the model fitted in the previous cell; reusing the
# y_pred left over from an earlier model would simply re-print that model's metrics.
y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean squared error: {mse}")
print(f"Root mean squared error: {rmse}")
print(f"R-squared score: {r2}")

# 5-fold cross-validation on the training split; for a regressor the default score is
# R^2, so the percentage printed below is a mean R^2, not a classification accuracy.
training_score = cross_val_score(model, x_train, y_train, cv=5)
print("Algorithm: ", model, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

## KNN

In [None]:

from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with the library's default settings,
# fitted on the scaled training split.
model=KNeighborsRegressor()
model.fit(x_train, y_train)

In [None]:
# Predictions must be regenerated for the model fitted in the previous cell; reusing the
# y_pred left over from an earlier model would simply re-print that model's metrics.
y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean squared error: {mse}")
print(f"Root mean squared error: {rmse}")
print(f"R-squared score: {r2}")

In [None]:
# 5-fold cross-validation on the training split. With no `scoring` argument the
# estimator's own score() is used, which for a regressor is R^2 - so the percentage
# printed below is a mean R^2, not a classification accuracy.
training_score = cross_val_score(model, x_train, y_train, cv=5)
print("Algorithm: ", model, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fixed seed so that the boosted ensemble (and the scores derived from it) is
# reproducible across runs; 42 matches the seed used for the train/test split.
model = GradientBoostingRegressor(random_state=42)
model.fit(x_train, y_train)

In [None]:
# Predictions must be regenerated for the model fitted in the previous cell; reusing the
# y_pred left over from an earlier model would simply re-print that model's metrics.
y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean squared error: {mse}")
print(f"Root mean squared error: {rmse}")
print(f"R-squared score: {r2}")

In [None]:
# 5-fold cross-validation on the training split. With no `scoring` argument the
# estimator's own score() is used, which for a regressor is R^2 - so the percentage
# printed below is a mean R^2, not a classification accuracy.
training_score = cross_val_score(model, x_train, y_train, cv=5)
print("Algorithm: ", model, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

## CONCLUSION

Only the following 10 variables were found to be correlated with RUL:

cycle

LPC outlet temperature

LPT outlet temperature

HPC outlet pressure

HPC outlet Static pressure

Ratio of fuel flow to Ps30

Bypass Ratio

Bleed Enthalpy

High-pressure turbines Cool air flow

Low-pressure turbines Cool air flow

In [None]:
# Dtypes and non-null counts of the columns that remain in the final modelling frame.
data_train.info()

The scores below are mean 5-fold cross-validated R² values on the training split (the notebook prints them as "% accuracy score", but for regressors the default score is R², not accuracy):

GradientBoostingRegressor(): training score of 83.0 % (R² ≈ 0.83)

KNeighborsRegressor(): training score of 81.0 % (R² ≈ 0.81)

RandomForestRegressor(): training score of 83.0 % (R² ≈ 0.83)

LinearRegression(): training score of 77.0 % (R² ≈ 0.77)

DecisionTreeRegressor(): training score of 66.0 % (R² ≈ 0.66)