## Imports

## Creating combined data frame

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
import statsmodels.api as sm

In [None]:
# Folder holding the raw training text files.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
folder_path = "C:/Users/User/Downloads/Train_2,4"

# Read every .txt file into its own frame and tag each row with its file name.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the combined row order (and any positional index reported later) reproducible.
dataframes = []
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.txt'):
        continue
    file_path = os.path.join(folder_path, file_name)
    file_df = pd.read_csv(file_path, sep=r'\s+', header=None)
    file_df['source_file'] = file_name
    dataframes.append(file_df)

# Fail with a clear message instead of pd.concat's generic "No objects to concatenate".
if not dataframes:
    raise FileNotFoundError(f"No .txt files found in {folder_path!r}")

combined_df = pd.concat(dataframes, ignore_index=True)
df = pd.DataFrame(combined_df)
print(combined_df.head())

In [None]:
# Column names for the combined frame, built from four groups: identifiers,
# operating settings, sensor channels, and the file tag added while loading.
id_names = ['engine', 'cycle']
setting_names = ['setting1', 'setting2', 'setting3']
raw_sensor_names = [
    "Fan_inlet_temperature",
    "LPC_outlet_temperature",
    "HPC_outlet_temperature",
    "LPT_outlet_temperature",
    "Fan_inlet_Pressure",
    "bypass_duct_pressure",
    "HPC_outlet_pressure",
    "Physical_fan_speed",
    "Physical_core_speed",
    "Engine_pressure_ratio",
    "HPC_outlet_Static_pressure",
    "Ratio_of_fuel_flow_to_Ps30",
    "Corrected_fan_speed",
    "Corrected_core_speed",
    "Bypass_Ratio",
    "Burner_fuel_air_ratio",
    "Bleed_Enthalpy",
    "Required_fan_speed",
    "Required_fan_conversion_speed",
    "High_pressure_turbines_Cool_air_flow",
    "Low_pressure_turbines_Cool_air_flow",
]
columns = id_names + setting_names + raw_sensor_names + ["source_file"]

df.columns = columns
df.head()

In [None]:
# Build a UID per engine: the engine number followed by a per-file suffix.
# File names missing from the mapping map to NaN, which makes the UID NaN too.
source_file_mapping = {
    "train_FD001.txt": "_1",
    "train_FD002.txt": "_2",
    "train_FD003.txt": "_3",
    "train_FD004.txt": "_4"
}
file_suffix = df['source_file'].map(source_file_mapping)
df['UID'] = df['engine'].astype(str) + file_suffix
print(df.head())

In [None]:
# Distinct engine UIDs per source file (double checking with Kaggle values)
unique_uid_counts = df.groupby('source_file')['UID'].agg('nunique')
print(unique_uid_counts)

# Creating RUL and Failure Variables

In [None]:
# Last recorded cycle of each engine, broadcast back onto every row of that engine.
last_cycle = df.groupby('UID')['cycle'].transform('max')

# rul counts down to 0 at the engine's last recorded cycle.
df['rul'] = last_cycle - df['cycle']
df['max_cycles'] = last_cycle

In [None]:
# Both failure-mode flags start at 0 on every row.
for flag_name in ('HPC Degradation', 'Fan Degradation'):
    df[flag_name] = 0

# Only the last recorded cycle (rul == 0) of engines from the listed files is flagged.
# NOTE(review): rows from any other source file keep both flags at 0 — confirm this is intended.
at_last_cycle = df['rul'] == 0
from_fd001 = df['source_file'] == "train_FD001.txt"
from_fd003 = df['source_file'] == "train_FD003.txt"

df.loc[at_last_cycle & from_fd001, 'HPC Degradation'] = 1
df.loc[at_last_cycle & from_fd003, ['HPC Degradation', 'Fan Degradation']] = 1

In [None]:
# Sanity check: dimensions and first rows of df after adding UID, rul,
# max_cycles and the two degradation flags.
print(df.shape)
df.head()

## Checking for missing values

In [None]:
# Count of null entries in each column of df.
missing_values = df.isna().sum()
missing_values

## conclusion
No missing values, so no imputation or dropping of observations is required.

## Should we use a combined dataset?

## Conclusion

As seen from the t-SNE plots, datasets 03 & 01 are very distinct from 04 and 02, and therefore we will treat them as such / will create an algorithm that first differentiates on these. Within these, there are a lot of overlaps, which may result in multicollinearity issues.

## Checking and Removing outliers using cooks distance

In [None]:
# Snapshot of df as it stands here; later cells rescale columns of df in place.
df_original = df.copy()

# Variables analysed below: operating settings, sensor channels, then the
# target (rul) and the two failure flags.
operating_settings = ['setting1', 'setting2', 'setting3']
sensor_channels = [
    "Fan_inlet_temperature",
    "LPC_outlet_temperature",
    "HPC_outlet_temperature",
    "LPT_outlet_temperature",
    "Fan_inlet_Pressure",
    "bypass_duct_pressure",
    "HPC_outlet_pressure",
    "Physical_fan_speed",
    "Physical_core_speed",
    "Engine_pressure_ratio",
    "HPC_outlet_Static_pressure",
    "Ratio_of_fuel_flow_to_Ps30",
    "Corrected_fan_speed",
    "Corrected_core_speed",
    "Bypass_Ratio",
    "Burner_fuel_air_ratio",
    "Bleed_Enthalpy",
    "Required_fan_speed",
    "Required_fan_conversion_speed",
    "High_pressure_turbines_Cool_air_flow",
    "Low_pressure_turbines_Cool_air_flow",
]
target_and_flags = ["rul", 'HPC Degradation', 'Fan Degradation']
variables_of_interest = operating_settings + sensor_channels + target_and_flags

## Conclusion
Box plot of physical fan speed, corrected core speed, bypass ratio, required fan speed and RUL seem to suggest presence of outliers. Let us evaluate them further using cook's distance

In [None]:
# Cook's distance from an OLS fit of rul on the settings and sensor readings.
#
# The response ('rul') and the two failure flags (which are set from rul == 0)
# are excluded from the design matrix. With 'rul' on both sides the fit is
# perfect, the residuals are ~0, and every Cook's distance is meaningless.
target_derived = ['rul', 'HPC Degradation', 'Fan Degradation']
predictor_columns = [var for var in variables_of_interest if var not in target_derived]

X = sm.add_constant(df[predictor_columns])
y = df['rul']
model = sm.OLS(y, X).fit()

influence = model.get_influence()
cooks_d, p_values = influence.cooks_distance

# Observations whose Cook's distance exceeds this value are treated as outliers.
threshold = 1
outliers = np.where(np.asarray(cooks_d) > threshold)[0]
print(f"Number of outliers detected: {len(outliers)}")
print(f"Outlier indices: {outliers}")

# np.where returns row positions; convert them to index labels before dropping
# so the right rows are removed even if df does not have a 0..n-1 index.
df_cleaned = df.drop(index=df.index[outliers])
print(f"Cleaned dataset shape: {df_cleaned.shape}")

In [None]:
# Stem plot of Cook's distance per observation, with the outlier threshold
# drawn as a dashed horizontal line.
fig, ax = plt.subplots(figsize=(10, 6))
observation_positions = np.arange(len(cooks_d))
ax.stem(observation_positions, cooks_d)
ax.axhline(y=threshold, color='r', linestyle='--', label='Threshold')
ax.set_title("Cook's Distance")
ax.set_xlabel("Observation Index")
ax.set_ylabel("Cook's Distance")
ax.legend()
plt.show()

## Conclusion

Based on cook's distance we infer that the data does not have any outliers

## Basic EDA

In [None]:
# Summary statistics, one row per numeric column.
df.describe().T

## Conclusion

* Variables have a large range of values so scaling will be required
* Some variables (e.g. setting 2, sensor Engine pressure ratio,Bypass Ratio,Burner fuel-air ratio) have a very small standard deviation/range of values, so they may not be very useful for prediction

# Correlation between variables

In [None]:
# Annotated correlation heatmap over all variables of interest.
fig, ax = plt.subplots(figsize=(15, 10))
overall_corr = df[variables_of_interest].corr()
sns.heatmap(overall_corr, annot=True, cmap='RdYlGn', ax=ax)
ax.grid(False)

## Conclusion

* The correlation with RUL is relatively low for all variables, while the correlation among the variables themselves is very high
    * Therefore we can perform prediction with a small subset of variables (sensor and setting)
    * A simple linear regression may not be very useful

In [None]:
# One correlation heatmap per source file, all on the same -1..1 colour scale.
source_files = df["source_file"].unique()

for source in source_files:
    # Rows of this file, restricted to the variables of interest
    from_this_file = df["source_file"] == source
    df_subset = df.loc[from_this_file, variables_of_interest]
    corr_matrix = df_subset.corr()

    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', fmt=".2f", vmin=-1, vmax=1, ax=ax)
    ax.set_title(f"Correlation Matrix for Source File: {source}")
    ax.grid(False)
    plt.show()


## 
Correlations are much stronger in datasets 02&04 vs 01&03

## Scaling

In [None]:
# Standardise every variable of interest except the target, the operating
# settings and the binary failure flags.
unscaled_variables = {'rul', 'setting1', 'setting2', 'setting3', 'HPC Degradation', 'Fan Degradation'}
features_to_scale = [var for var in variables_of_interest if var not in unscaled_variables]

# Fit on the training frame, then overwrite those columns of df with the scaled values.
scaler = StandardScaler()
scaler.fit(df[features_to_scale])
df[features_to_scale] = scaler.transform(df[features_to_scale])

# Show the first rows to verify the scaling
print(df.head())


In [None]:
# Variables of interest with the target and the three operating settings removed.
variables = variables_of_interest
variables_to_drop = ["rul", "setting1", "setting2", "setting3"]
sensor_columns = list(filter(lambda var: var not in variables_to_drop, variables))
sensor_columns

In [None]:
# Columns to plot: everything in df except identifiers, target columns and settings.
sensor_columns = [col for col in df.columns if col not in ['UID', 'engine', 'cycle', 'rul', 'max_cycles', 'source_file',
                                                           'settings_category', 'setting1', 'setting2', 'setting3']]

# 'settings_category' is only added to df further down the notebook (in the
# "Creating additional feature" section). On a fresh top-to-bottom run it does
# not exist yet and seaborn would raise, so skip the plots in that case
# instead of aborting the whole run.
if 'settings_category' not in df.columns:
    print("Skipping box plots: 'settings_category' has not been created yet.")
else:
    # One box plot per column, grouped by settings category
    for sensor in sensor_columns:
        plt.figure(figsize=(12, 6))
        sns.boxplot(x='settings_category', y=sensor, data=df)
        plt.title(f'{sensor} vs Settings Category')
        plt.xlabel('Settings Category')
        plt.ylabel(sensor)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()


## conclusion

* sensor value vary randomly for FD002 & FD004
  
* For FD001
    *  required fan speed/ required fan conversion speed/fan inlet temp/fan inlet pressure/ engine pressure ratio/ burner fuel air ratio remains static
    *  lpc outlet temp / hpc outlet temp/ lpt outlet temp/ physical fan speed/ HPC outlet static pressure/ corrected fans speed / bypass ratio / bleed enthalpy/ Increases
    *  HPC outlet pressure/ ratio of fuel to flow/ HPT cool air flow / LPT cool air flow Decreases

* For FD002
    *  required fan speed/ required fan conversion speed/fan inlet temp/fan inlet pressure/ engine pressure ratio/ burner fuel air ratio remains static
    *  lpc outlet temp / hpc outlet temp/ lpt outlet temp/ HPC outlet static pressure/ corrected fans speed / bleed enthalpy/ Increases

## Conclusion

There seems to be more correlation between RUL and combinations of settings than between RUL and individual settings, so we will create a feature representing these combinations.

## Creating additional feature

In [None]:
# List of predefined conditions with their setting values
conditions = [
    {'condition': 'Condition_1', 'setting1': 0, 'setting2': 0, 'setting3': 100},
    {'condition': 'Condition_2', 'setting1': 10, 'setting2': 0.25, 'setting3': 100},
    {'condition': 'Condition_3', 'setting1': 20, 'setting2': 0.7, 'setting3': 100},
    {'condition': 'Condition_4', 'setting1': 25, 'setting2': 0.62, 'setting3': 60},
    {'condition': 'Condition_5', 'setting1': 35, 'setting2': 0.84, 'setting3': 100},
    {'condition': 'Condition_6', 'setting1': 42, 'setting2': 0.84, 'setting3': 100},
]


def assign_closest_condition(row):
    """Return the name of the predefined condition closest to ``row``.

    Closeness is the sum of absolute differences over setting1, setting2 and
    setting3. On a tie the condition listed first in ``conditions`` wins.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'setting1', 'setting2' and 'setting3'
        (for example a DataFrame row).

    Returns
    -------
    str or None
        The winning condition name; None if no distance compares below
        infinity (for example when a setting is NaN).
    """
    min_distance = float('inf')
    closest_condition = None
    for cond in conditions:
        distance = abs(row['setting1'] - cond['setting1']) + \
                   abs(row['setting2'] - cond['setting2']) + \
                   abs(row['setting3'] - cond['setting3'])
        if distance < min_distance:
            min_distance = distance
            closest_condition = cond['condition']
    return closest_condition


# Label every training row with its closest condition. This is the same rule as
# assign_closest_condition, computed for all rows at once with numpy instead of
# a Python-level df.apply over every row. argmin returns the first minimum,
# which matches the function's tie-breaking.
setting_names = ['setting1', 'setting2', 'setting3']
condition_names = np.array([cond['condition'] for cond in conditions])
condition_values = np.array(
    [[cond[name] for name in setting_names] for cond in conditions], dtype=float
)
row_settings = df[setting_names].to_numpy(dtype=float)
# distances[i, j] = distance between row i and condition j
distances = np.abs(row_settings[:, None, :] - condition_values[None, :, :]).sum(axis=2)
df['settings_category'] = pd.Series(
    condition_names[distances.argmin(axis=1)], index=df.index
).astype('category')

# Display unique categories in df
print("Unique Settings Categories in df:")
print(df['settings_category'].unique())





In [None]:
# Rows whose settings were assigned to Condition_6.
# NOTE(review): the variable name does not match the filter; it is left
# unchanged in case other cells reference it.
rows_with_77 = df[df['settings_category'] == 'Condition_6']
# The message previously said "= 77", which is not the value being filtered on.
print("Shape of rows with settings_category = Condition_6:", rows_with_77.shape)


In [None]:
# List the columns currently present in df.
df.columns

In [None]:
# Keep the sensor columns plus the keys needed to separate engines.
# 'engine' is retained so the frame still has the columns it had before.
data = df[sensor_columns + ["UID", "engine", "rul"]]

# One figure per sensor, one line per engine.
# Engines are separated by UID rather than by the raw 'engine' number. UID is
# built from 'engine' plus a per-file suffix, so filtering on 'engine' alone
# pools rows from different source files into one series and smooths across them.
for sensor in sensor_columns:
    plt.figure(figsize=(4, 3))

    for engine_uid, engine_data in data.groupby('UID', sort=False):
        # 8-row rolling mean for a smoother line; restricted to the two numeric
        # columns that are plotted (UID is a string and cannot be averaged).
        rolled_data = engine_data[[sensor, 'rul']].rolling(window=8).mean()

        # Plot RUL vs sensor value for the current engine
        plt.plot(rolled_data['rul'], rolled_data[sensor], alpha=0.6, label=f'Engine {engine_uid}')

    # Customize the plot
    plt.xlim(data['rul'].max(), 0)  # Reverse RUL axis
    plt.title(f'{sensor} vs RUL', fontsize=6)
    plt.xlabel('Remaining Useful Life (RUL)', fontsize=6)
    plt.ylabel(sensor, fontsize=6)
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# Columns to plot: everything in df that is not an identifier, a target
# column, a setting, or the category itself.
not_plotted = {'UID', 'engine', 'cycle', 'rul', 'max_cycles', 'source_file',
               'settings_category', 'setting1', 'setting2', 'setting3'}
sensor_columns = [col for col in df.columns if col not in not_plotted]

# One box plot per column, grouped by settings category
for sensor in sensor_columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x='settings_category', y=sensor, data=df, ax=ax)
    ax.set_title(f'{sensor} vs Settings Category')
    ax.set_xlabel('Settings Category')
    ax.set_ylabel(sensor)
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()


In [None]:
# Grid layout is the same for every sensor: one panel per settings category,
# three panels per row.
settings_categories = df['settings_category'].unique()  # ['Condition_1', ..., 'Condition_6']
num_categories = len(settings_categories)
num_cols = 3
num_rows = math.ceil(num_categories / num_cols)

for sensor in sensor_columns:
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, num_rows * 5))
    axes = axes.flatten()

    # Pair each category with the next free panel
    for ax, category in zip(axes, settings_categories):
        category_data = df[df['settings_category'] == category]

        # RUL vs sensor for this category
        sns.lineplot(data=category_data, x="rul", y=sensor, ax=ax, alpha=0.6)

        ax.set_title(f"{sensor} vs RUL for {category}")
        ax.set_xlabel("Remaining Useful Life (RUL)")
        ax.set_ylabel(sensor)
        ax.grid(True)

    # Drop panels that have no category assigned
    for spare_ax in axes[num_categories:]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()


In [None]:
# One figure per settings category: RUL against cycle, one line per engine UID.
unique_categories = df['settings_category'].unique()

for category in unique_categories:
    category_data = df[df['settings_category'] == category]
    engine_ids = category_data['UID'].unique()

    fig, ax = plt.subplots(figsize=(12, 6))

    for engine_id in engine_ids:
        engine_data = category_data[category_data['UID'] == engine_id]
        ax.plot(engine_data['cycle'], engine_data['rul'], label=f'Engine {engine_id}', alpha=0.6)

    ax.set_title(f'RUL Trends for Engines in {category}', fontsize=16)
    ax.set_xlabel('Cycle', fontsize=14)
    ax.set_ylabel('Remaining Useful Life (RUL)', fontsize=14)
    # Legend is anchored outside the right edge of the axes
    ax.legend(title='Engines', loc='upper right', fontsize=10, bbox_to_anchor=(1.15, 1))
    ax.grid(True)
    fig.tight_layout()

    plt.show()


In [None]:
# Same RUL-vs-cycle plot per settings category, on a larger canvas with
# manually set margins instead of tight_layout.
unique_categories = df['settings_category'].unique()

for category in unique_categories:
    in_category = df['settings_category'] == category
    category_data = df[in_category]
    engine_ids = category_data['UID'].unique()

    fig, ax = plt.subplots(figsize=(14, 8))

    # One line per engine UID in this category
    for engine_id in engine_ids:
        engine_data = category_data[category_data['UID'] == engine_id]
        ax.plot(engine_data['cycle'], engine_data['rul'], label=f'Engine {engine_id}', alpha=0.6)

    ax.set_title(f'RUL Trends for Engines in {category}', fontsize=6, pad=20)
    ax.set_xlabel('Cycle', fontsize=4)
    ax.set_ylabel('Remaining Useful Life (RUL)', fontsize=4)
    ax.legend(title='Engines', loc='upper right', fontsize=10, bbox_to_anchor=(1.15, 1))
    ax.grid(True)

    # Fixed margins around the axes
    fig.subplots_adjust(bottom=0.1, top=0.9, left=0.1, right=0.9)

    plt.show()


In [None]:
# The 21 sensor channels to plot
sensor_columns = [
    "Fan_inlet_temperature",
    "LPC_outlet_temperature",
    "HPC_outlet_temperature",
    "LPT_outlet_temperature",
    "Fan_inlet_Pressure",
    "bypass_duct_pressure",
    "HPC_outlet_pressure",
    "Physical_fan_speed",
    "Physical_core_speed",
    "Engine_pressure_ratio",
    "HPC_outlet_Static_pressure",
    "Ratio_of_fuel_flow_to_Ps30",
    "Corrected_fan_speed",
    "Corrected_core_speed",
    "Bypass_Ratio",
    "Burner_fuel_air_ratio",
    "Bleed_Enthalpy",
    "Required_fan_speed",
    "Required_fan_conversion_speed",
    "High_pressure_turbines_Cool_air_flow",
    "Low_pressure_turbines_Cool_air_flow",
]

# The category list does not change inside the loops, so look it up once.
unique_categories = df["settings_category"].unique()

# One small figure per (sensor, settings category); each engine UID is a line
# with the sensor value on x and RUL on y.
for sensor in sensor_columns:
    for category in unique_categories:
        category_data = df[df["settings_category"] == category]
        engine_ids = category_data["UID"].unique()

        fig, ax = plt.subplots(figsize=(4, 2))

        for engine_id in engine_ids:
            engine_data = category_data[category_data["UID"] == engine_id]
            ax.plot(engine_data[sensor], engine_data["rul"], alpha=0.6)

        ax.set_title(f"RUL Trends for {sensor} in {category}", fontsize=6, pad=20)
        ax.set_xlabel(sensor, fontsize=6)
        ax.set_ylabel("Remaining Useful Life (RUL)", fontsize=6)
        # NOTE(review): the lines above carry no label, so this legend has no entries.
        ax.legend(title="Engines", loc="upper right", fontsize=10, bbox_to_anchor=(1.15, 1))
        ax.grid(True)

        # Fixed margins around the axes
        fig.subplots_adjust(bottom=0.1, top=0.9, left=0.1, right=0.9)

        plt.show()


In [None]:
# The 21 sensor channels to plot
sensor_columns = [
    "Fan_inlet_temperature",
    "LPC_outlet_temperature",
    "HPC_outlet_temperature",
    "LPT_outlet_temperature",
    "Fan_inlet_Pressure",
    "bypass_duct_pressure",
    "HPC_outlet_pressure",
    "Physical_fan_speed",
    "Physical_core_speed",
    "Engine_pressure_ratio",
    "HPC_outlet_Static_pressure",
    "Ratio_of_fuel_flow_to_Ps30",
    "Corrected_fan_speed",
    "Corrected_core_speed",
    "Bypass_Ratio",
    "Burner_fuel_air_ratio",
    "Bleed_Enthalpy",
    "Required_fan_speed",
    "Required_fan_conversion_speed",
    "High_pressure_turbines_Cool_air_flow",
    "Low_pressure_turbines_Cool_air_flow",
]

# One scatter plot per sensor: sensor value on x, RUL on y, all rows of df.
for sensor in sensor_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df[sensor], df["rul"], alpha=0.6, color="blue", edgecolor="k")

    ax.set_title(f"RUL vs {sensor}", fontsize=16, pad=20)
    ax.set_xlabel(sensor, fontsize=14)
    ax.set_ylabel("Remaining Useful Life (RUL)", fontsize=14)
    ax.grid(True)

    fig.tight_layout()
    plt.show()


In [None]:
# Keep the sensor columns plus the keys needed to separate engines.
# 'engine' is retained so the frame still has the columns it had before.
data = df[sensor_columns + ["UID", "source_file", "engine", "rul"]]

num_cols = 4  # 4 plots in a row
max_rul = data['rul'].max()

# One figure per (sensor, source file), one panel per engine.
# Engines are separated by UID rather than by the raw 'engine' number. UID is
# built from 'engine' plus a per-file suffix, so filtering on 'engine' alone
# pools rows from different source files into one panel. Splitting the figures
# by source file keeps each grid to a single file's engines.
for sensor in sensor_columns:
    for source, source_data in data.groupby('source_file', sort=False):
        per_engine = list(source_data.groupby('UID', sort=False))
        num_engines = len(per_engine)
        num_rows = -(-num_engines // num_cols)  # ceil division

        # squeeze=False always returns a 2-D array of axes, whatever the grid shape
        fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, num_rows * 5), squeeze=False)
        axes = axes.flatten()

        for ax, (engine_uid, engine_data) in zip(axes, per_engine):
            # 8-row rolling mean, restricted to the two numeric columns plotted
            # (UID and source_file are strings and cannot be averaged)
            rolled_data = engine_data[[sensor, 'rul']].rolling(window=8).mean()

            ax.plot(rolled_data['rul'], rolled_data[sensor], alpha=0.6, label=f'Engine {engine_uid}')

            # Shared, reversed RUL axis so panels are comparable
            ax.set_xlim(max_rul, 0)
            ax.set_xticks(np.arange(0, max_rul + 1, 25))
            ax.set_title(f'{sensor} (Engine: {engine_uid})', fontsize=12)
            ax.set_xlabel('Remaining Useful Life')
            ax.set_ylabel(sensor)
            ax.grid(True)

        # Remove unused panels
        for spare_ax in axes[num_engines:]:
            fig.delaxes(spare_ax)

        fig.suptitle(
            f'{sensor} (Rolling Mean of Previous 8 RULs) vs RUL for All Engines in {source}',
            fontsize=16, y=1.02
        )
        plt.tight_layout()
        plt.show()
        # Release the (large) figure before building the next one
        plt.close(fig)



## Load test data

In [None]:
# Folder holding the raw test text files.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
folder_path = "C:/Users/User/Downloads/Test_2,4"

# Read every .txt file into its own frame and tag each row with its file name.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the combined row order reproducible.
dataframes = []
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.txt'):
        continue
    file_path = os.path.join(folder_path, file_name)
    file_df = pd.read_csv(file_path, sep=r'\s+', header=None)
    file_df['source_file'] = file_name
    dataframes.append(file_df)

# Fail with a clear message instead of pd.concat's generic "No objects to concatenate".
if not dataframes:
    raise FileNotFoundError(f"No .txt files found in {folder_path!r}")

combined_df = pd.concat(dataframes, ignore_index=True)
test_df = pd.DataFrame(combined_df)
print(combined_df.head())

In [None]:
# Name the test columns after the training snapshot's columns, leaving out the
# ones listed below (names that are absent from df_original are simply ignored).
# Assumes the test files share the training files' raw column layout — TODO confirm.
columns_to_exclude = ['UID', 'rul', 'HPC Degradation', 'Fan Degradation','max_cycles','floored_setting1','floored_setting2','floored_setting3','settings_category']
new_columns = list(filter(lambda col: col not in columns_to_exclude, df_original.columns))
test_df.columns = new_columns
test_df.head()

In [None]:
# Build a UID per test engine: the engine number followed by a per-file suffix.
# File names missing from the mapping map to NaN, which makes the UID NaN too.
source_file_mapping = {
    "test_FD001.txt": "_1",
    "test_FD002.txt": "_2",
    "test_FD003.txt": "_3",
    "test_FD004.txt": "_4"
}
test_file_suffix = test_df['source_file'].map(source_file_mapping)
test_df['UID'] = test_df['engine'].astype(str) + test_file_suffix
print(test_df[['engine', 'source_file', 'UID']].head())

In [None]:
# Label every test row with its closest predefined operating condition.
#
# `conditions` is the list defined in the "Creating additional feature" section
# for the training data. It is reused here rather than redefined, so both frames
# are always labelled against the same reference points.
#
# The distance is the sum of absolute differences over the three settings,
# computed for all rows at once with numpy instead of a row-by-row df.apply.
# argmin returns the first minimum, so ties go to the condition listed first.
setting_names = ['setting1', 'setting2', 'setting3']
condition_names = np.array([cond['condition'] for cond in conditions])
condition_values = np.array(
    [[cond[name] for name in setting_names] for cond in conditions], dtype=float
)
test_settings = test_df[setting_names].to_numpy(dtype=float)
# test_distances[i, j] = distance between test row i and condition j
test_distances = np.abs(test_settings[:, None, :] - condition_values[None, :, :]).sum(axis=2)
test_df['settings_category'] = pd.Series(
    condition_names[test_distances.argmin(axis=1)], index=test_df.index
).astype('category')

# Display unique categories in test_df
print("Unique Settings Categories in test_df:")
print(test_df['settings_category'].unique())



In [None]:
# Scale the test features with the scaler that was fitted on the training data
# in the "Scaling" section. Fitting a fresh StandardScaler on the test set would
# standardise it with its own mean and variance, putting train and test features
# on different scales and letting test statistics shape the preprocessing.
test_df[features_to_scale] = scaler.transform(test_df[features_to_scale])
print(test_df.head())

In [None]:
# Folder holding the RUL text files.
folder_path_rul ="C:/Users/User/Downloads/RUL_2,4"

# Per-file frames are collected here and concatenated afterwards.
dataframes = []

# Only .txt files are read, in the order os.listdir reports them.
rul_file_names = [name for name in os.listdir(folder_path_rul) if name.endswith('.txt')]
for file_name_rul in rul_file_names:
    file_path_rul = os.path.join(folder_path_rul, file_name_rul)

    RUL_df = pd.read_csv(file_path_rul, sep=r'\s+', header=None)
    print(RUL_df.shape)

    # Remember which file each row came from
    RUL_df['source_file'] = file_name_rul
    dataframes.append(RUL_df)

# Stack all files into one frame with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)
RUL_df = pd.DataFrame(combined_df)

# The single data column is the RUL value; the second is the file tag added above
RUL_df.columns = ['rul','source_file']

In [None]:
# Per-file suffixes, matching the ones used for the test UIDs
source_file_mapping = {
    "RUL_FD001.txt": "_1",
    "RUL_FD002.txt": "_2",
    "RUL_FD003.txt": "_3",
    "RUL_FD004.txt": "_4"
}
rul_file_suffix = RUL_df['source_file'].map(source_file_mapping)

# Number the rows 1, 2, 3, ... within each source file.
# Presumably row k of a RUL file belongs to engine k of the matching test file — TODO confirm.
RUL_df['engine_seq'] = RUL_df.groupby('source_file').cumcount() + 1

# UID = sequence number + file suffix
RUL_df['UID'] = RUL_df['engine_seq'].astype(str) + rul_file_suffix

# Preview the resulting DataFrame
print(RUL_df[['engine_seq', 'source_file', 'UID']].head())

In [None]:
# The number of RUL rows should equal the number of distinct test engines.
print(RUL_df.shape)
unique_uid_counts = test_df.groupby('source_file')['UID'].nunique()
print(unique_uid_counts.sum())

In [None]:
# Inner join: only test rows whose UID also appears in RUL_df
merged_df = pd.merge(test_df, RUL_df, on='UID', how='inner')

# Left join with an indicator column so unmatched test rows can be counted
left_join_df = pd.merge(test_df, RUL_df, on='UID', how='left', indicator=True)
non_matching_rows = int((left_join_df['_merge'] == 'left_only').sum())

# The indicator is no longer needed once the count is taken
left_join_df = left_join_df.drop(columns=['_merge'])

# Output the results
print(f"Number of rows in clustered_test_df that didn't find a match: {non_matching_rows}")

In [None]:
# First rows of the test frame after naming, UID creation, categorisation and scaling.
test_df.head()

In [None]:
# Last recorded cycle of each test engine
test_df['max_cycles'] = test_df.groupby(['UID'])['cycle'].transform('max')

# RUL_df holds one 'rul' value per test UID, loaded from the RUL_* files.
# Presumably this is the life remaining after the engine's last recorded test
# cycle — TODO confirm against the dataset documentation. Under that reading,
# max_cycles - cycle alone would report RUL = 0 at the last test row, so the
# per-engine value is added on top.
rul_at_last_cycle = (
    RUL_df.dropna(subset=['UID'])
    .drop_duplicates(subset='UID')
    .set_index('UID')['rul']
)
final_rul = test_df['UID'].map(rul_at_last_cycle)

# Engines with no RUL entry end up with NaN; report them rather than hide them.
unmatched_rows = int(final_rul.isna().sum())
if unmatched_rows:
    print(f"Warning: {unmatched_rows} test rows have no matching entry in RUL_df; their rul is NaN")

test_df['rul'] = (test_df['max_cycles'] - test_df['cycle']) + final_rul

print(test_df[['UID', 'cycle', 'max_cycles', 'rul']].head())


In [None]:
## Mohini edited
from sklearn.preprocessing import PolynomialFeatures

# Step 1: Identify sensor columns (exclude specific columns)
excluded_columns = ['engine', 'cycle', 'UID', 'rul', 'max_cycles', 'source_file', 'settings_category',
                    'setting1', 'setting2', 'setting3',
                    'HPC Degradation', 'Fan Degradation',
                    'floored_setting1', 'floored_setting2', 'floored_setting3']
sensor_columns = [col for col in df.columns if col not in excluded_columns]

all_features_with_correlations = []
categories = df['settings_category'].unique()

# Step 2: For each sensor, correlate every power x, x^2, ..., x^10 with
# max_cycles inside each settings category, average each power's correlation
# across categories, and keep the power with the largest |average|.
#
# The winner used to be chosen separately in every category, with only the last
# category's name being stored. The reported average could therefore mix
# different powers and not belong to the feature name saved next to it.
for feature in sensor_columns:
    try:
        poly = PolynomialFeatures(degree=10, include_bias=False)
        correlations_by_category = []

        for category in categories:
            df_filtered = df[df['settings_category'] == category]
            transformed = poly.fit_transform(df_filtered[[feature]])
            # One correlation per generated power, in feature-name order
            correlations_by_category.append(
                [np.corrcoef(column, df_filtered["max_cycles"])[0, 1] for column in transformed.T]
            )

        feature_names = poly.get_feature_names_out([feature])
        avg_correlations = np.mean(correlations_by_category, axis=0)

        # A sensor that is constant in some category yields NaN correlations; skip
        # it when no power has a usable average.
        if np.all(np.isnan(avg_correlations)):
            continue

        # nanargmax returns the first maximum, so ties favour the lowest power
        best_index = int(np.nanargmax(np.abs(avg_correlations)))
        best_feature_name = feature_names[best_index]
        avg_correlation = avg_correlations[best_index]

        # Apply correlation threshold
        if abs(avg_correlation) > 0.05:
            all_features_with_correlations.append((feature, best_feature_name, avg_correlation))
            print(f"Sensor: {feature}, Best Feature: {best_feature_name}, Average Correlation: {avg_correlation:.4f}")

    except Exception as e:
        # Best effort: report the failing sensor and carry on with the rest
        print(f"Skipping {feature} due to error: {e}")

# Step 3: Sort all features by absolute average correlation and select top 10
sorted_features = sorted(all_features_with_correlations, key=lambda x: abs(x[2]), reverse=True)
top_10_features = sorted_features[:10]

# Step 4: Display results
print("\nTop 10 Features Based on Average Correlation with max_cycles (Grouped by settings_category):")
for feature, name, correlation in top_10_features:
    print(f"Sensor: {feature}, Selected Feature: {name}, Average Correlation: {correlation:.4f}")

# Step 5: Store final selection as {sensor: selected feature name}
best_polynomial_features = {feature: name for feature, name, correlation in top_10_features}

print("\nFinal Best Polynomial Features Selected:")
for feature, name in best_polynomial_features.items():
    print(f"Sensor: {feature}, Selected Feature: {name}")

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Add each selected polynomial feature as a column of both the training and
# the testing frame.
for original_feature, selected_feature in best_polynomial_features.items():
    # The text after the last "^" is the power; a name without "^" is degree 1
    _, caret, power = selected_feature.rpartition("^")
    degree = int(power) if caret else 1
    poly = PolynomialFeatures(degree=degree, include_bias=False)

    # Same expansion for train and test: generate the powers, then pick the
    # column whose generated name equals the selected feature.
    for frame in (df, test_df):
        expanded = poly.fit_transform(frame[[original_feature]])
        expanded_names = poly.get_feature_names_out([original_feature]).tolist()
        frame[selected_feature] = expanded[:, expanded_names.index(selected_feature)]

# Confirm feature addition
print(f"Number of Columns in Training Dataset After Adding Features: {df.shape[1]}")
print(f"Number of Columns in Testing Dataset After Adding Features: {test_df.shape[1]}")

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode `settings_category` (first level dropped). The encoder is
# fitted on the training frame and only applied to the test frame.
encoder = OneHotEncoder(drop="first", sparse_output=False)
train_dummies = encoder.fit_transform(df[["settings_category"]])
test_dummies = encoder.transform(test_df[["settings_category"]])
dummy_names = encoder.get_feature_names_out(["settings_category"])

settings_encoded_train = pd.DataFrame(train_dummies, columns=dummy_names, index=df.index)
settings_encoded_test = pd.DataFrame(test_dummies, columns=dummy_names, index=test_df.index)

# Append the encoded columns and drop the original categorical column
df = pd.concat([df, settings_encoded_train], axis=1).drop(columns=["settings_category"])
test_df = pd.concat([test_df, settings_encoded_test], axis=1).drop(columns=["settings_category"])

# Columns carried into the per-engine table
selected_columns = ["UID", "engine", "cycle", "rul", "max_cycles"] + list(settings_encoded_train.columns)
selected_features = selected_columns + list(best_polynomial_features.values())

# Collapse to one row per UID: max for cycle, mean for every other column
aggregations = {
    col: ("max" if col == "cycle" else "mean")
    for col in selected_features
    if col != "UID"
}
df_grouped_train = df[selected_features].groupby("UID", as_index=False).agg(aggregations)
df_grouped_test = test_df[selected_features].groupby("UID", as_index=False).agg(aggregations)

# Display resulting shapes
print("Final Grouped Train Features Shape:", df_grouped_train.shape)
print("Final Grouped Test Features Shape:", df_grouped_test.shape)

# Sample rows for verification
print(df_grouped_train.head())
print(df_grouped_test.head())


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Define predictors and response variable.
# Target: `max_cycles` (total engine lifetime), one value per UID.
# Identifier columns (`UID`, `engine`) and the target itself are not predictors.
non_predictor_columns = ["UID", "max_cycles", "engine"]

# Target leakage guard: `rul` is computed as max(cycle) - cycle, so its per-UID
# mean is a direct function of the engine lifetime, and on the test side it is
# the ground truth we are trying to predict. The aggregated `cycle` is the
# per-UID maximum, which for run-to-failure training engines presumably equals
# `max_cycles` exactly (TODO confirm against where `max_cycles` is built).
# Leaving either column in lets the forest read the answer from its inputs.
leakage_columns = ["rul", "cycle"]

excluded_columns = non_predictor_columns + leakage_columns

X_train = df_grouped_train.drop(columns=excluded_columns)
y_train = df_grouped_train["max_cycles"]  # Set max_cycles as the target variable

X_test = df_grouped_test.drop(columns=excluded_columns)


# Display shapes for verification
print("Training Features Shape:", X_train.shape)
print("Training Target Shape (max_cycles):", y_train.shape)
print("Testing Features Shape:", X_test.shape)



In [None]:
# Sanity check: list the predictor columns that will be passed to the model
column_names_check = X_train.columns
print(column_names_check)

In [None]:
# Preview the first few target values (taken from the `max_cycles` column)
y_train.head()

In [None]:
# Random forest hyperparameters: 200 trees, tree depth capped at 15, and a
# fixed seed so repeated runs build the same forest.
rf_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "random_state": 42,
}

# Build the regressor and train it on the training data
# (`fit` returns the fitted estimator itself).
rfr = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Display confirmation
print("Random Forest Model Trained Successfully!")


In [None]:
# Predict the per-engine lifetime (`max_cycles`) for both datasets.
y_train_pred = rfr.predict(X_train)
# Test predictions are per aggregated engine. They are scored in the following
# cell, after being merged back onto `test_df` and converted to per-cycle RUL
# (predicted max_cycles - cycle).
y_test_pred = rfr.predict(X_test)

# In-sample metrics.
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
train_r2 = r2_score(y_train, y_train_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

# Display evaluation metrics
print(f"Train R^2: {train_r2:.4f}")
print(f"Train RMSE: {train_rmse:.4f}")


In [None]:
# Pair each per-engine prediction with its UID in a dedicated frame.
# X_test is deliberately left untouched: adding `predicted_max_cycles` / `UID`
# to it would change the feature set, so re-running `rfr.predict(X_test)`
# afterwards would fail. Rows of `y_test_pred` follow the row order of
# `df_grouped_test`, which X_test was derived from.
test_predictions = pd.DataFrame(
    {
        "UID": df_grouped_test["UID"].to_numpy(),
        "predicted_max_cycles": y_test_pred,
    }
)

# Un-aggregate: broadcast each engine's prediction to all of its rows in test_df
test_df_with_predictions = test_df.merge(test_predictions, on="UID", how="left")

# Predicted RUL = predicted lifetime minus the current cycle, floored at zero
# because a negative remaining life is not meaningful.
test_df_with_predictions["predicted_rul"] = (
    test_df_with_predictions["predicted_max_cycles"] - test_df_with_predictions["cycle"]
).clip(lower=0)

# Calculate metrics on unaggregated data.
# RMSE is sqrt(MSE); `mean_squared_error(..., squared=False)` is no longer
# available in recent scikit-learn releases.
test_r2 = r2_score(test_df_with_predictions["rul"], test_df_with_predictions["predicted_rul"])
test_rmse = np.sqrt(
    mean_squared_error(test_df_with_predictions["rul"], test_df_with_predictions["predicted_rul"])
)

# Display evaluation metrics
print(f"Train R^2: {train_r2:.4f}")
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test R^2: {test_r2:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")

# Optionally display a sample for verification
print(test_df_with_predictions[["UID", "cycle", "rul", "predicted_rul"]].head())


# Conjoint Analysis

In [None]:
# Define thresholds for each setting.
# Each list holds ascending bin edges consumed by `assign_setting_category`:
# a value in [edges[i], edges[i + 1]) is labelled "Category_{i + 1}", and a
# value at or above the last edge is labelled "Category_{len(edges)}".
setting1_thresholds = [0, 10, 20, 25, 35, 42]
setting2_thresholds = [0, 0.25, 0.7, 0.84]
setting3_thresholds = [60, 100]

# Function to assign categories based on thresholds
def assign_setting_category(value, thresholds):
    """Map a numeric setting value to a category label using bin edges.

    Parameters
    ----------
    value : number
        The setting value to classify.
    thresholds : sequence of numbers
        Bin edges, expected in ascending order.

    Returns
    -------
    str
        "Category_{i + 1}" when thresholds[i] <= value < thresholds[i + 1];
        "Category_1" when value lies below the first edge;
        "Category_{len(thresholds)}" otherwise (value at or above the last
        edge, or a value such as NaN that matches no interval).
    """
    # Values just under the first edge (e.g. a nominal 0 recorded as -0.0007)
    # belong with the lowest bin. Without this guard they would fall through
    # to the final return and be labelled as the highest category.
    if len(thresholds) > 0 and value < thresholds[0]:
        return "Category_1"

    # Half-open intervals [lower, upper) between consecutive edges
    for i in range(len(thresholds) - 1):
        if thresholds[i] <= value < thresholds[i + 1]:
            return f"Category_{i + 1}"

    # At or beyond the last edge (or not comparable): last category
    return f"Category_{len(thresholds)}"

# Bucket each operating setting into its categorical level, pairing every
# setting column with its own list of thresholds.
for setting_name, setting_edges in (
    ("setting1", setting1_thresholds),
    ("setting2", setting2_thresholds),
    ("setting3", setting3_thresholds),
):
    df[f"{setting_name}_category"] = df[setting_name].apply(
        assign_setting_category, thresholds=setting_edges
    )

# Verify the results
category_columns = ["setting1_category", "setting2_category", "setting3_category"]
print(df[category_columns].head())





In [None]:
from sklearn.preprocessing import OneHotEncoder

# One independent encoder per setting; drop=None keeps every category level
# as its own indicator column.
encoder1, encoder2, encoder3 = (
    OneHotEncoder(drop=None, sparse_output=False) for _ in range(3)
)

# Fit each encoder on its own category column and get the dense indicator array
encoded_setting1 = encoder1.fit_transform(df[["setting1_category"]])
encoded_setting2 = encoder2.fit_transform(df[["setting2_category"]])
encoded_setting3 = encoder3.fit_transform(df[["setting3_category"]])

# Wrap the arrays in DataFrames that share df's index and carry the
# encoder-generated column names
encoded_setting1_df = pd.DataFrame(
    encoded_setting1,
    index=df.index,
    columns=encoder1.get_feature_names_out(["setting1_category"]),
)
encoded_setting2_df = pd.DataFrame(
    encoded_setting2,
    index=df.index,
    columns=encoder2.get_feature_names_out(["setting2_category"]),
)
encoded_setting3_df = pd.DataFrame(
    encoded_setting3,
    index=df.index,
    columns=encoder3.get_feature_names_out(["setting3_category"]),
)

# Append the indicator columns and remove the original category columns
df = pd.concat(
    [df, encoded_setting1_df, encoded_setting2_df, encoded_setting3_df],
    axis=1,
).drop(columns=["setting1_category", "setting2_category", "setting3_category"])

# Verify the updated DataFrame
print(df.head())


In [None]:
# Predictors are the one-hot columns whose name contains one of the three
# setting-category tags.
category_tags = ("setting1_category", "setting2_category", "setting3_category")
setting_columns = [
    col for col in df.columns if any(tag in col for tag in category_tags)
]

# Design matrix (one-hot setting categories) and response (`max_cycles`)
X = df[setting_columns]
y = df["max_cycles"]

# Display shapes
print(f"Predictors Shape: {X.shape}")
print(f"Response Shape: {y.shape}")

# Show the first few predictor rows for verification
print(X.head())


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit an ordinary least-squares model of the response on the setting indicators
# (`fit` returns the fitted estimator itself).
lm = LinearRegression().fit(X, y)

# Tabulate one coefficient per indicator column, largest first
coefficients = pd.DataFrame({"Feature": X.columns, "Coefficient": lm.coef_})
coefficients = coefficients.sort_values(by="Coefficient", ascending=False)

print("Conjoint Analysis Coefficients:")
print(coefficients)

# In-sample predictions and their mean squared error
y_pred = lm.predict(X)
mse = mean_squared_error(y, y_pred)

print(f"Mean Squared Error: {mse:.4f}")



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart of the fitted coefficients, one bar per feature,
# drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=coefficients, x="Coefficient", y="Feature", ax=ax)
ax.set_title("Conjoint Analysis Feature Importance")
ax.set_xlabel("Coefficient")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()
