Import the needed Libraries:

In [14]:
import sys
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import sklearn
import scipy
import tqdm
import os
import glob
from IPython.display import display
from io import StringIO

In [15]:
# ==============================
# Function to read all files in a folder
# ==============================
def read_files_in_folder(folder_path, columns, sep=r"\s+"):
    files = glob.glob(os.path.join(folder_path, "*.txt"))  # all txt files
    data_list = [pd.read_csv(f, sep=sep, header=None, names=columns) for f in files]
    return pd.concat(data_list, ignore_index=True)

# ==============================
# 1 - Read RUL Data
# ==============================
rul_folder = r"C:\Users\ACER\Desktop\ISE-518\ISE518\Project\NASA Turbofan Jet Engine Data Set\CMaps\Results (RUL)"
rul_columns = ['RUL']
rul_data = read_files_in_folder(rul_folder, rul_columns)

# ==============================
# 2 - Read Test Data
# ==============================
test_folder = r"C:\Users\ACER\Desktop\ISE-518\ISE518\Project\NASA Turbofan Jet Engine Data Set\CMaps\Testing"
sensor_cols = [f'sensor_{i}' for i in range(1, 22)]
operational_cols = [f'operational_setting_{i}' for i in range(1, 4)]
test_columns = ['unit_number', 'time_in_cycles'] + operational_cols + sensor_cols
test_data = read_files_in_folder(test_folder, test_columns)

# ==============================
# 3 - Read Training Data
# ==============================
train_folder = r"C:\Users\ACER\Desktop\ISE-518\ISE518\Project\NASA Turbofan Jet Engine Data Set\CMaps\Training"
train_data = read_files_in_folder(train_folder, test_columns)  # same columns as test


In [None]:
# Show all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Show all rows
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

############################################

print("RUL")
display(rul_data.head())
print("Testing Data")
display(test_data.head())
print("Training Data")
display(train_data.head())


RUL


Unnamed: 0,RUL
0,112
1,98
2,69
3,82
4,91


Testing Data


Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,21.61,553.9,2388.04,9050.17,1.3,47.2,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,21.61,554.85,2388.01,9054.42,1.3,47.5,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,21.61,554.11,2388.05,9056.96,1.3,47.5,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,21.61,554.07,2388.03,9045.29,1.3,47.28,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,21.61,554.16,2388.01,9044.55,1.3,47.31,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


Training Data


Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [49]:
# ===============================================
# Dataset Summary Overview (info + describe)
# ===============================================

print(f"{'#'*40} INFO {'#'*40}")

def info_as_dataframe(df):

    print(f"\n{'-'*40}")
    print(f"{' '*4} ðŸ“ˆ Info. of a Data Frame ðŸ“ˆ")
    print(f"{'-'*40}")

    buffer = StringIO()
    df.info(buf=buffer)
    s = buffer.getvalue()
    lines = s.split('\n')
    
    # Extract useful lines
    data_lines = [line for line in lines if line.strip().startswith(tuple(map(str, range(len(df.columns)))))]
    
    # Parse lines into DataFrame
    info_list = []
    for line in data_lines:
        parts = line.split()
        info_list.append({
            'Column Index': parts[0],
            'Column Name': parts[1],
            'Non-Null Count': parts[2],
            'Dtype': parts[-1]
        })
    
    return pd.DataFrame(info_list)

# Displaying the Data in a Tabular Format
display(info_as_dataframe(rul_data))
display(info_as_dataframe(test_data))
display(info_as_dataframe(train_data))

datasets = {
    "RUL Data": rul_data,
    "Test Data": test_data,
    "Training Data": train_data
}

print(f"{'#'*40} Discribe {'#'*40}")

for name, df in datasets.items():
    print(f"\n{'-'*40}")
    print(f"ðŸ“ˆ {name} â€” DESCRIPTIVE STATISTICS ðŸ“ˆ")
    print(f"{'-'*40}")
    display(df.describe())


######################################## INFO ########################################

----------------------------------------
     ðŸ“ˆ Info. of a Data Frame ðŸ“ˆ
----------------------------------------


Unnamed: 0,Column Index,Column Name,Non-Null Count,Dtype
0,0,RUL,707,int64



----------------------------------------
     ðŸ“ˆ Info. of a Data Frame ðŸ“ˆ
----------------------------------------


Unnamed: 0,Column Index,Column Name,Non-Null Count,Dtype
0,0,unit_number,104897,int64
1,1,time_in_cycles,104897,int64
2,2,operational_setting_1,104897,float64
3,3,operational_setting_2,104897,float64
4,4,operational_setting_3,104897,float64
5,5,sensor_1,104897,float64
6,6,sensor_2,104897,float64
7,7,sensor_3,104897,float64
8,8,sensor_4,104897,float64
9,9,sensor_5,104897,float64



----------------------------------------
     ðŸ“ˆ Info. of a Data Frame ðŸ“ˆ
----------------------------------------


Unnamed: 0,Column Index,Column Name,Non-Null Count,Dtype
0,0,unit_number,160359,int64
1,1,time_in_cycles,160359,int64
2,2,operational_setting_1,160359,float64
3,3,operational_setting_2,160359,float64
4,4,operational_setting_3,160359,float64
5,5,sensor_1,160359,float64
6,6,sensor_2,160359,float64
7,7,sensor_3,160359,float64
8,8,sensor_4,160359,float64
9,9,sensor_5,160359,float64


######################################## Discribe ########################################

----------------------------------------
ðŸ“ˆ RUL Data â€” DESCRIPTIVE STATISTICS ðŸ“ˆ
----------------------------------------


Unnamed: 0,RUL
count,707.0
mean,81.437058
std,51.108532
min,6.0
25%,36.0
50%,83.0
75%,119.0
max,195.0



----------------------------------------
ðŸ“ˆ Test Data â€” DESCRIPTIVE STATISTICS ðŸ“ˆ
----------------------------------------


Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
count,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0,104897.0
mean,103.119117,95.406589,17.24381,0.410887,95.748591,485.791314,597.153354,1464.738375,1257.240382,9.887805,14.412914,359.377679,2273.867033,8671.465814,1.153026,44.097986,338.437014,2349.811235,8084.687648,9.046593,0.025039,360.088096,2273.841797,98.398282,25.949868,15.570426
std,71.370362,74.311486,16.530141,0.368078,12.328151,30.426804,42.430629,117.856835,135.728855,4.265536,6.443315,174.05696,142.13543,373.202306,0.142029,3.408337,164.110284,110.890533,78.956087,0.74839,0.005,30.932463,142.218102,4.644631,11.70101,7.02059
min,1.0,1.0,-0.0087,-0.0006,60.0,445.0,535.41,1240.94,1024.9,3.91,5.66,136.06,1914.9,7987.03,0.93,36.03,128.26,2027.74,7849.8,8.198,0.02,302.0,1915.0,84.93,10.18,6.1231
25%,43.0,38.0,0.0013,0.0002,100.0,449.44,549.78,1355.14,1124.03,5.48,8.0,175.72,2212.05,8329.54,1.02,41.92,164.8,2387.95,8068.36,8.4256,0.02,331.0,2212.0,100.0,14.34,8.6073
50%,87.0,80.0,19.9982,0.62,100.0,489.05,605.53,1490.16,1262.85,9.35,13.66,337.51,2319.11,8751.85,1.08,44.74,317.63,2388.05,8113.1,9.1085,0.03,367.0,2319.0,100.0,24.78,14.8724
75%,159.0,135.0,35.0015,0.84,100.0,518.67,642.2,1584.81,1399.46,14.62,21.61,553.58,2388.03,9052.68,1.3,47.26,521.59,2388.11,8136.63,9.3339,0.03,392.0,2388.0,100.0,38.88,23.331
max,259.0,486.0,42.008,0.842,100.0,518.67,644.32,1609.69,1434.12,14.62,21.61,569.17,2388.55,9179.62,1.32,48.28,536.6,2389.96,8243.1,11.03,0.03,398.0,2388.0,100.0,39.75,23.8496



----------------------------------------
ðŸ“ˆ Training Data â€” DESCRIPTIVE STATISTICS ðŸ“ˆ
----------------------------------------


Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
count,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0
mean,105.553758,123.331338,17.211973,0.410004,95.724344,485.84089,597.361022,1467.035653,1260.956434,9.894999,14.424935,359.729968,2273.829707,8677.553696,1.153705,44.212049,338.789821,2349.645243,8088.950972,9.054747,0.025185,360.698801,2273.754039,98.389146,25.942709,15.5657
std,72.867325,83.538146,16.527988,0.367938,12.359044,30.420388,42.478516,118.175261,136.300073,4.265554,6.443922,174.133835,142.426613,374.657454,0.142103,3.426342,164.19348,111.167242,80.623257,0.751581,0.004997,31.02143,142.513114,4.65627,11.691422,7.015067
min,1.0,1.0,-0.0087,-0.0006,60.0,445.0,535.48,1242.67,1023.77,3.91,5.67,136.17,1914.72,7984.51,0.93,36.04,128.31,2027.57,7845.78,8.1563,0.02,302.0,1915.0,84.93,10.16,6.0105
25%,44.0,57.0,0.0013,0.0002,100.0,449.44,549.96,1357.36,1126.83,5.48,8.0,175.71,2212.12,8334.77,1.02,42.01,164.79,2387.97,8070.53,8.43925,0.02,332.0,2212.0,100.0,14.33,8.6013
50%,89.0,114.0,19.9981,0.62,100.0,489.05,605.93,1492.81,1271.74,9.35,13.66,341.69,2319.37,8764.2,1.09,44.93,321.69,2388.07,8118.59,9.0301,0.03,367.0,2319.0,100.0,24.92,14.9535
75%,164.0,173.0,35.0015,0.84,100.0,518.67,642.34,1586.59,1402.2,14.62,21.61,553.29,2388.05,9055.85,1.3,47.34,521.34,2388.16,8139.41,9.3442,0.03,392.0,2388.0,100.0,38.82,23.2946
max,260.0,543.0,42.008,0.842,100.0,518.67,645.11,1616.91,1441.49,14.62,21.61,570.81,2388.64,9244.59,1.32,48.53,537.49,2390.49,8293.72,11.0669,0.03,400.0,2388.0,100.0,39.89,23.9505


In [50]:
from IPython.display import display

def dataset_summary(df, sample_size=5):
    summary_data = []
    for col in df.columns:
        dtype = df[col].dtype
        unique_vals = df[col].nunique()
        # Get sample values (up to sample_size)
        if unique_vals > sample_size:
            sample_vals = df[col].unique()[:sample_size]
            sample_vals = ', '.join(map(str, sample_vals)) + ', ...'
        else:
            sample_vals = ', '.join(map(str, df[col].unique()))
        summary_data.append({
            'Column': col,
            'Data Type': dtype,
            'Unique Values': unique_vals,
            'Sample Values': sample_vals
        })
    summary_df = pd.DataFrame(summary_data)
    display(summary_df)

# Example usage:
dataset_summary(train_data)
dataset_summary(test_data)
dataset_summary(rul_data)


Unnamed: 0,Column,Data Type,Unique Values,Sample Values
0,unit_number,int64,260,"1, 2, 3, 4, 5, ..."
1,time_in_cycles,int64,543,"1, 2, 3, 4, 5, ..."
2,operational_setting_1,float64,670,"-0.0007, 0.0019, -0.0043, 0.0007, -0.0019, ..."
3,operational_setting_2,float64,111,"-0.0004, -0.0003, 0.0003, 0.0, -0.0002, ..."
4,operational_setting_3,float64,2,"100.0, 60.0"
5,sensor_1,float64,6,"518.67, 449.44, 445.0, 462.54, 491.19, ..."
6,sensor_2,float64,1799,"641.82, 642.15, 642.35, 642.37, 642.1, ..."
7,sensor_3,float64,15377,"1589.7, 1591.82, 1587.99, 1582.79, 1582.85, ..."
8,sensor_4,float64,20582,"1400.6, 1403.14, 1404.2, 1401.87, 1406.22, ..."
9,sensor_5,float64,6,"14.62, 5.48, 3.91, 7.05, 9.35, ..."


Unnamed: 0,Column,Data Type,Unique Values,Sample Values
0,unit_number,int64,259,"1, 2, 3, 4, 5, ..."
1,time_in_cycles,int64,486,"1, 2, 3, 4, 5, ..."
2,operational_setting_1,float64,662,"0.0023, -0.0027, 0.0003, 0.0042, 0.0014, ..."
3,operational_setting_2,float64,111,"0.0003, -0.0003, 0.0001, 0.0, 0.0002, ..."
4,operational_setting_3,float64,2,"100.0, 60.0"
5,sensor_1,float64,6,"518.67, 489.05, 491.19, 449.44, 445.0, ..."
6,sensor_2,float64,1603,"643.02, 641.71, 642.46, 642.44, 642.51, ..."
7,sensor_3,float64,13285,"1585.29, 1588.45, 1586.94, 1584.12, 1587.19, ..."
8,sensor_4,float64,16454,"1398.21, 1395.42, 1401.34, 1406.42, 1401.92, ..."
9,sensor_5,float64,6,"14.62, 10.52, 9.35, 5.48, 3.91, ..."


Unnamed: 0,Column,Data Type,Unique Values,Sample Values
0,RUL,int64,178,"112, 98, 69, 82, 91, ..."


In [51]:
# For training data
num_training_engines = train_data['unit_number'].nunique()
print("Number of engines in training data:", num_training_engines)

# For test data
num_test_engines = test_data['unit_number'].nunique()
print("Number of engines in test data:", num_test_engines)


Number of engines in training data: 260
Number of engines in test data: 259


In [52]:
# Function to check missing values
def check_missing(df, name):
    print(f"=== Missing Data in {name} ===")
    missing = df.isnull().sum()
    missing = missing[missing > 0]  # Only show columns with missing values
    if missing.empty:
        print("No missing data!\n")
    else:
        print(missing, "\n")

# Check RUL data
check_missing(rul_data, "RUL Data")

# Check Training data
check_missing(train_data, "Training Data")

# Check Testing data
check_missing(test_data, "Testing Data")


=== Missing Data in RUL Data ===
No missing data!

=== Missing Data in Training Data ===
No missing data!

=== Missing Data in Testing Data ===
No missing data!



In [55]:
# Filter the rows for unit_number = 1
engine_1_data = train_data[train_data['unit_number'] == 1]

# Display the first few rows
display(engine_1_data)

# Optionally, show how many cycles this engine has
print(f"\nEngine 1 has {engine_1_data['time_in_cycles'].nunique()} cycles recorded.")


Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044
5,1,6,-0.0043,-0.0001,100.0,518.67,642.1,1584.47,1398.37,14.62,21.61,554.67,2388.02,9049.68,1.3,47.16,521.68,2388.03,8132.85,8.4108,0.03,391,2388,100.0,38.98,23.3669
6,1,7,0.001,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,21.61,554.34,2388.02,9059.13,1.3,47.36,522.32,2388.03,8132.32,8.3974,0.03,392,2388,100.0,39.1,23.3774
7,1,8,-0.0034,0.0003,100.0,518.67,642.56,1582.96,1400.97,14.62,21.61,553.85,2388.0,9040.8,1.3,47.24,522.47,2388.03,8131.07,8.4076,0.03,391,2388,100.0,38.97,23.3106
8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.8,14.62,21.61,553.69,2388.05,9046.46,1.3,47.29,521.79,2388.05,8125.69,8.3728,0.03,392,2388,100.0,39.05,23.4066
9,1,10,-0.0033,0.0001,100.0,518.67,641.71,1591.24,1400.46,14.62,21.61,553.59,2388.05,9051.7,1.3,47.03,521.79,2388.06,8129.38,8.4286,0.03,393,2388,100.0,38.95,23.4694



Engine 1 has 321 cycles recorded.
