In [None]:
# standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# First Dataset
#### without dropping any column

In [None]:
# Read the raw train and test tables of the first dataset.
train1, test1 = (
    pd.read_csv(csv_path)
    for csv_path in ('RawData/Train1_raw.csv', 'RawData/Test1_raw.csv')
)

In [None]:
# Adding RUL(Remaining Useful Life)
def categorize_zone(time_cycles, max_time_cycle):
    """Label a cycle by the fraction of the engine's recorded cycles already used.

    Parameters
    ----------
    time_cycles : number
        Cycle count of the row being labelled.
    max_time_cycle : number
        Largest cycle count recorded for the same engine.

    Returns
    -------
    str
        "Safe Zone" when the fraction is at most 0.5, "Moderate Zone" when it
        is at most 0.8, and "Dangerous" otherwise.
    """
    elapsed_fraction = time_cycles / max_time_cycle
    # Upper bounds are checked in ascending order; the first one that holds wins.
    for upper_bound, label in ((0.5, "Safe Zone"), (0.8, "Moderate Zone")):
        if elapsed_fraction <= upper_bound:
            return label
    return "Dangerous"

def add_zones_rul_column(df):
    """Return a copy of ``df`` with ``RUL`` and ``zone`` columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``engine_number`` and ``time_cycles`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) with two extra columns:

        * ``RUL``  - the engine's largest ``time_cycles`` minus the row's
          ``time_cycles``.
        * ``zone`` - "Safe Zone" when ``time_cycles / max`` is at most 0.5,
          "Moderate Zone" when it is at most 0.8, "Dangerous" otherwise.
    """
    # Largest recorded cycle per engine, joined back onto every row of that engine.
    max_time_cycles = df.groupby(by='engine_number')['time_cycles'].max()
    merged = df.merge(
        max_time_cycles.to_frame(name='max_time_cycle'),
        left_on='engine_number',
        right_index=True,
    )

    merged["RUL"] = merged["max_time_cycle"] - merged['time_cycles']

    # Vectorized zone labelling: np.select takes the first condition that holds,
    # which reproduces the if/elif/else thresholds without a Python call per row.
    time_percent = merged['time_cycles'] / merged['max_time_cycle']
    merged["zone"] = np.select(
        [time_percent <= 0.5, time_percent <= 0.8],
        ["Safe Zone", "Moderate Zone"],
        default="Dangerous",
    )

    # The helper column is only needed for the two derived columns above.
    return merged.drop("max_time_cycle", axis=1)

In [None]:
# Derive the RUL and zone columns for both splits, then show the training frame.
# NOTE(review): the test-split RUL is measured against the last cycle present in the
# test file - confirm that this is the intended ground truth for the test engines.
train1_with_rul = train1.pipe(add_zones_rul_column)
test1_with_rul = test1.pipe(add_zones_rul_column)
train1_with_rul

In [None]:
from sklearn.model_selection import train_test_split

# Columns derived from the engine's last cycle; they must not be used as inputs.
label_columns = ['RUL', 'zone']

# Training inputs and regression target
features = train1_with_rul.drop(label_columns, axis=1)
target = train1_with_rul['RUL']

# Held-out test inputs and target
X_test = test1_with_rul.drop(label_columns, axis=1)
y_test = test1_with_rul['RUL']

# 80/20 train/validation split with a fixed seed.
# NOTE(review): rows are shuffled individually, so cycles of one engine can land on
# both sides of the split - validation scores may be optimistic.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the dimensions of every split produced above.
for caption, split in (
    ("Shape of X_train: ", X_train),
    ("Shape of X_val: ", X_val),
    ("Shape of y_train: ", y_train),
    ("Shape of y_val: ", y_val),
):
    print(caption, split.shape)

# ML Algo and Metrics

In [None]:
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Candidate regressors, keyed by the name shown in the results table.
# The ensemble models get a fixed seed (the same value used for the data split)
# so that the reported scores are reproducible across kernel restarts.
models = {
    'Lasso': Lasso(),
    'LinearRegression': LinearRegression(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'XGBRegressor': XGBRegressor(random_state=42)
}

In [None]:
# Fit every candidate model on the training split and score it on the validation split.
results = []

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_val)

    mae = mean_absolute_error(y_val, y_pred)
    # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so it is not portable.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2_square = r2_score(y_val, y_pred)

    # Not every estimator exposes feature_importances_; store None when absent.
    feature_importances = getattr(model, 'feature_importances_', None)

    # Append results to the list
    results.append({
        'Model': model_name,
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2_square,
        'Feature_Importances': feature_importances
    })

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)

# Display the results DataFrame
results_df

In [None]:
# Initialize the random forest regressor (seeded so the predictions are reproducible)
model = RandomForestRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the held-out test set
predictions = model.predict(X_test)

# Add the predictions to the test dataset
test1_with_rul['Predicted_RUL'] = predictions

In [None]:
# Actual vs. predicted RUL for engine 2 in the test frame.
engine2_test_rows = test1_with_rul['engine_number'] == 2
test1_with_rul.loc[engine2_test_rows, ['engine_number', 'time_cycles', 'RUL', 'Predicted_RUL']]

In [None]:
# Predictions were written to test1_with_rul; the raw test1 frame has no
# 'Predicted_RUL' column, so selecting it there raises KeyError.
test1_with_rul.loc[
    test1_with_rul['engine_number'] == 2,
    ['engine_number', 'time_cycles', 'Predicted_RUL'],
]

In [None]:
# First 30 training rows of engine 2, for comparison with the predictions.
engine2_train_rows = train1_with_rul['engine_number'] == 2
train1_with_rul.loc[engine2_train_rows, ['engine_number', 'time_cycles', 'RUL']].head(30)

# Divide data into target and features
##### Regression

In [None]:
# Replace the raw frames with the cleaned version of the first dataset.
train1, test1 = map(
    pd.read_csv,
    ('CleanedData/train1_clean.csv', 'CleanedData/test1_clean.csv'),
)

In [None]:
# Preview the cleaned training frame (rendered as the cell output).
train1

In [None]:
# Preview the cleaned test frame (rendered as the cell output).
test1

In [None]:
# RUL is the regression target; every column except the two labels is an input.
y = train1['RUL']
X = train1.drop(columns=['RUL', 'zone'])

In [None]:
# Report the dimensions of the feature matrix and the target vector.
for caption, data in (("sahpe of features: ", X), ("Shape of target: ", y)):
    print(caption, data.shape)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of the cleaned data, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the dimensions of every split produced above.
for caption, split in (
    ("Shape of X_train: ", X_train),
    ("Shape of X_test: ", X_test),
    ("Shape of y_train: ", y_train),
    ("Shape of y_test: ", y_test),
):
    print(caption, split.shape)

## ML Algo and metrics

In [None]:
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Candidate regressors, keyed by the name shown in the results table.
# The ensemble models get a fixed seed (the same value used for the data split)
# so that the reported scores are reproducible across kernel restarts.
models = {
    'Lasso': Lasso(),
    'LinearRegression': LinearRegression(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'XGBRegressor': XGBRegressor(random_state=42)
}

In [None]:
# Fit every candidate model on the training split and score it on the held-out split.
results = []

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so it is not portable.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_square = r2_score(y_test, y_pred)

    # Not every estimator exposes feature_importances_; store None when absent.
    feature_importances = getattr(model, 'feature_importances_', None)

    # Append results to the list
    results.append({
        'Model': model_name,
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2_square,
        'Feature_Importances': feature_importances
    })

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)

# Display the results DataFrame
results_df

In [None]:
# Take the tick labels from the columns the models were fitted on, so a label can
# never drift out of step with its bar if the cleaned feature set changes.
feature_names = list(X_train.columns)

# Plot feature importances (one figure per model that reports them)
for result in results:
    model_name = result['Model']
    feature_importances = result['Feature_Importances']

    if feature_importances is None:
        continue

    positions = np.arange(len(feature_importances))

    fig, ax = plt.subplots(figsize=(10, 6))  # Adjust the figure size if needed
    ax.bar(positions, feature_importances)
    ax.set_xlabel('Feature')
    ax.set_ylabel('Feature Importance')
    ax.set_title(f'Feature Importances for {model_name}')
    ax.set_xticks(positions)
    ax.set_xticklabels(feature_names, rotation='vertical')

    # Annotate each bar with the importance value
    for i, imp in enumerate(feature_importances):
        ax.text(i, imp + 0.01, f'{imp:.4f}', ha='center', va='bottom')

    fig.tight_layout()
    plt.show()

In [None]:
# Seeded so that repeated runs give the same predictions.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict once, on the training feature columns only: the cell then stays
# re-runnable after the prediction column below has been added to test1.
y_pred = model.predict(test1[X_train.columns])
test_prediction1 = np.round(y_pred)

# Column name spelling is kept as-is because later cells read "Predicticted_RUL".
test1["Predicticted_RUL"] = test_prediction1

In [None]:
# Predicted RUL for engine 2 in the cleaned test frame.
engine2_test_rows = test1['engine_number'] == 2
test1.loc[engine2_test_rows, ['engine_number', 'time_cycles', 'Predicticted_RUL']]

In [None]:
# First 30 training rows of engine 2, for comparison with the predictions.
engine2_train_rows = train1['engine_number'] == 2
train1.loc[engine2_train_rows, ['engine_number', 'time_cycles', 'RUL']].head(30)

# Overfitting Model

# Selecting only the top 5 features according to XGB

In [None]:
# Keep the two identifier columns plus the three selected sensor columns.
top5_columns = [
    'engine_number',
    'time_cycles',
    'sensor_measurement4',
    'sensor_measurement11',
    'sensor_measurement12',
]
X_top5 = X[top5_columns]

In [None]:
# Same 80/20 seeded split as before, applied to the reduced feature set.
X_train_top5, X_test_top5, y_train, y_test = train_test_split(
    X_top5,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit every candidate model on the top-5 feature split and score it on the held-out split.
results = []

for model_name, model in models.items():
    # Train the model
    model.fit(X_train_top5, y_train)

    # Make Predictions
    y_pred = model.predict(X_test_top5)

    mae = mean_absolute_error(y_test, y_pred)
    # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so it is not portable.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_square = r2_score(y_test, y_pred)

    # Feature importances are not stored in this table, so they are not collected here.

    # Append results to the list
    results.append({
        'Model': model_name,
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2_square,
    })

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)

# Display the results DataFrame
results_df

### According to the results above, XGBRegressor is the best model.

# Predict RUL for our test data 

In [None]:
# top 5 features
# .copy() makes the subset an independent frame, so assigning a new column to it
# later does not trigger pandas' SettingWithCopyWarning.
test1 = test1[['engine_number', 'time_cycles','sensor_measurement4','sensor_measurement11','sensor_measurement12']].copy()

In [None]:
# Fit a fresh XGBoost regressor on the top-5 feature training split.
model = XGBRegressor()
# Kept as the last expression of the cell, so its return value is the cell output.
model.fit(X_train_top5, y_train)

In [None]:
# Predict on the training feature columns only, so the cell can be re-run after
# the prediction column below has been added to test1.
test_prediction = np.round(model.predict(test1[X_train_top5.columns]))
# Column name spelling is kept as-is because the next cell reads "Predicticted_RUL".
test1["Predicticted_RUL"] = test_prediction

In [None]:
# Predicted RUL for engine 2 from the top-5 feature model.
engine2_test_rows = test1['engine_number'] == 2
test1.loc[engine2_test_rows, ['engine_number', 'time_cycles', 'Predicticted_RUL']]

In [None]:
# First 20 training rows of engine 2, for comparison with the predictions.
engine2_train_rows = train1['engine_number'] == 2
train1.loc[engine2_train_rows, ['engine_number', 'time_cycles', 'RUL']].head(20)

# Model is overfitted

***************************************************************************************************************************

# Complete CleanedDataset

In [None]:
# Load the cleaned train and test tables of the complete dataset.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('CleanedData/trainset_clean.csv', 'CleanedData/testset_clean.csv')
)

In [None]:
# List the columns of the complete cleaned training frame.
df_train.columns

In [None]:
# Preview the complete cleaned training frame (rendered as the cell output).
df_train

In [None]:
# RUL is the dependent variable; all columns except the two labels are independent.
target = df_train['RUL']
feature = df_train.drop(columns=['RUL', 'zone'])

In [None]:
# split into train test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature,target , test_size = 0.2 , random_state=42)

# Fit every candidate model on the training split and score it on the held-out split.
results = []

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so it is not portable.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_square = r2_score(y_test, y_pred)

    # Append results to the list
    results.append({
        'Model': model_name,
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2_square,
    })

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)

# Display the results DataFrame
results_df

In [None]:
# Seeded so that repeated runs give the same predictions.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
# Predict on the training feature columns only, so the cell can be re-run after
# 'Predicted_RUL' has been added to df_test.
y_predicton = np.round(model.predict(df_test[X_train.columns]))
df_test['Predicted_RUL'] = y_predicton

In [None]:
# First predicted rows for engine 100 in the complete test frame.
engine100_test_rows = df_test['engine_number'] == 100
df_test.loc[engine100_test_rows, ['engine_number', 'time_cycles', 'Predicted_RUL']].head()

In [None]:
# First training rows for engine 100, for comparison with the predictions.
engine100_train_rows = df_train['engine_number'] == 100
df_train.loc[engine100_train_rows, ['engine_number', 'time_cycles', 'RUL']].head()

# Model is Generalized
### Save into csv

In [None]:
# Write the test frame, including its Predicted_RUL column, to CSV with a
# header row and without the index.
predictions_csv = 'PredictedRUL/PredictedRUL_Complete_test.csv'
df_test.to_csv(predictions_csv, header=True, index=False)

***************************************************************************************************************************

In [None]:
# Reload the raw tables of the first dataset for the next experiment.
train1, test1 = map(
    pd.read_csv,
    ('RawData/Train1_raw.csv', 'RawData/Test1_raw.csv'),
)

In [None]:
# Check Train1 with complete train dataset feature
top_feature_columns = ['engine_number', 'time_cycles', 'op_setting_1', 'op_setting_3',
       'sensor_measurement8', 'sensor_measurement14', 'sensor_measurement16']

# .copy() makes each subset an independent frame, so assigning a new column to it
# later does not trigger pandas' SettingWithCopyWarning.
train1_top_feature = train1[top_feature_columns].copy()
test1_top_feature = test1[top_feature_columns].copy()

In [None]:
# Adding RUL(Remaining Useful Life)
def categorize_zone(time_cycles, max_time_cycle):
    """Map a cycle count to a zone name based on its share of the engine's maximum.

    NOTE(review): this repeats an identical definition from earlier in the notebook.

    Returns "Safe Zone" for a share of at most 0.5, "Moderate Zone" for a share
    of at most 0.8, and "Dangerous" for anything else.
    """
    share_used = time_cycles / max_time_cycle
    if share_used <= 0.5:
        return "Safe Zone"
    if share_used <= 0.8:
        return "Moderate Zone"
    return "Dangerous"

def add_zones_rul_column(df):
    """Return a copy of ``df`` with ``RUL`` and ``zone`` columns appended.

    NOTE(review): this repeats a definition from earlier in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``engine_number`` and ``time_cycles`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) with two extra columns:

        * ``RUL``  - the engine's largest ``time_cycles`` minus the row's
          ``time_cycles``.
        * ``zone`` - "Safe Zone" when ``time_cycles / max`` is at most 0.5,
          "Moderate Zone" when it is at most 0.8, "Dangerous" otherwise.
    """
    # Largest recorded cycle per engine, joined back onto every row of that engine.
    max_time_cycles = df.groupby(by='engine_number')['time_cycles'].max()
    merged = df.merge(
        max_time_cycles.to_frame(name='max_time_cycle'),
        left_on='engine_number',
        right_index=True,
    )

    merged["RUL"] = merged["max_time_cycle"] - merged['time_cycles']

    # Vectorized zone labelling: np.select takes the first condition that holds,
    # which reproduces the if/elif/else thresholds without a Python call per row.
    time_percent = merged['time_cycles'] / merged['max_time_cycle']
    merged["zone"] = np.select(
        [time_percent <= 0.5, time_percent <= 0.8],
        ["Safe Zone", "Moderate Zone"],
        default="Dangerous",
    )

    # The helper column is only needed for the two derived columns above.
    return merged.drop("max_time_cycle", axis=1)

In [None]:
# Attach the RUL and zone columns to the reduced training frame and display it.
train1_top_feature_with_rul = train1_top_feature.pipe(add_zones_rul_column)
train1_top_feature_with_rul

In [None]:
# RUL is the dependent variable; all columns except the two labels are independent.
dependent = train1_top_feature_with_rul['RUL']
independent = train1_top_feature_with_rul.drop(columns=['RUL', 'zone'])

In [None]:
# split into train test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(independent,dependent , test_size = 0.2 , random_state=42)

# Fit every candidate model on the training split and score it on the held-out split.
results = []

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so it is not portable.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_square = r2_score(y_test, y_pred)

    # Append results to the list
    results.append({
        'Model': model_name,
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2_square,
    })

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)

# Display the results DataFrame
results_df

In [None]:
# Seeded so that repeated runs give the same predictions.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
# Predict on the training feature columns only, so the cell can be re-run after
# 'Predicted_RUL' has been added to test1_top_feature.
y_predicton = np.round(model.predict(test1_top_feature[X_train.columns]))
test1_top_feature['Predicted_RUL'] = y_predicton

In [None]:
# First predicted rows for engine 5 in the reduced test frame.
engine5_test_rows = test1_top_feature['engine_number'] == 5
test1_top_feature.loc[engine5_test_rows, ['engine_number', 'time_cycles', 'Predicted_RUL']].head()

In [None]:
# First training rows for engine 5, for comparison with the predictions.
engine5_train_rows = train1_top_feature_with_rul['engine_number'] == 5
train1_top_feature_with_rul.loc[engine5_train_rows, ['engine_number', 'time_cycles', 'RUL']].head()

# Model is over fitting

#### Now check whether the model generalizes without dropping any feature

In [None]:
# Attach the RUL and zone columns to the full raw training frame and display it.
train1_with_rul = train1.pipe(add_zones_rul_column)
train1_with_rul

In [None]:
# independent and dependent feature
independent = train1_with_rul.drop(columns=['RUL','zone'])
dependent = train1_with_rul['RUL']

# split into train test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(independent,dependent , test_size = 0.2 , random_state=42)

# Seeded so that repeated runs give the same predictions.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
# Predict on the training feature columns only, so the cell can be re-run after
# 'Predicted_RUL' has been added to test1.
y_predicton = np.round(model.predict(test1[X_train.columns]))
test1['Predicted_RUL'] = y_predicton

In [None]:
# First predicted rows for engine 5 in the raw test frame.
engine5_test_rows = test1['engine_number'] == 5
test1.loc[engine5_test_rows, ['engine_number', 'time_cycles', 'Predicted_RUL']].head()

In [None]:
# First training rows for engine 5, for comparison with the predictions.
engine5_train_rows = train1_with_rul['engine_number'] == 5
train1_with_rul.loc[engine5_train_rows, ['engine_number', 'time_cycles', 'RUL']].head()