In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report

# Load the cleaned outgassing dataset from its absolute location on disk
data_path = r'C:\Users\GGPC\IoD_Mini_Projects\Mini_Project_2\project_NASA_Materials_Project\data\result\cleaned_Outgassing_Db_20240702.csv'
material_df = pd.read_csv(filepath_or_buffer=data_path)

# Feature Engineering
def feature_engineering(df, *, noise_std=0.65, random_state=None, copy=False):
    """Standardize the outgassing columns and add a noisy ``performance_score``.

    The ``TML``, ``CVCM`` and ``WVR`` columns are replaced by their
    standardized values, and ``performance_score`` is computed from those
    standardized values as ``(1 - TML) + (1 - CVCM) + (1 - WVR) + noise``,
    where ``noise`` is zero-mean Gaussian noise with one draw per row.

    Args:
        df: DataFrame holding ``TML``, ``CVCM`` and ``WVR`` columns.
        noise_std: Standard deviation of the Gaussian noise added to the
            score.
        random_state: Seed (anything accepted by ``numpy.random.default_rng``)
            used to draw the noise reproducibly. ``None`` draws from NumPy's
            global random state, so the scores change on every call unless
            the caller seeded it with ``np.random.seed``.
        copy: When true, operate on a copy and leave ``df`` untouched. The
            default ``False`` overwrites the three columns of ``df`` in place
            and adds ``performance_score`` to it.

    Returns:
        The DataFrame carrying the standardized columns and the new
        ``performance_score`` column; this is ``df`` itself unless ``copy``
        is true.
    """
    outgassing_cols = ['TML', 'CVCM', 'WVR']
    if copy:
        df = df.copy()

    # Noise is drawn before scaling so the global-RNG path consumes random
    # numbers in the same order as it always has.
    if random_state is None:
        noise = np.random.normal(0, noise_std, size=df.shape[0])
    else:
        rng = np.random.default_rng(random_state)
        noise = rng.normal(0, noise_std, size=df.shape[0])

    # NOTE: the scaler is fitted on every row of df, so any train/test split
    # made after this call shares its scaling statistics with the test rows.
    scaler = StandardScaler()
    df[outgassing_cols] = scaler.fit_transform(df[outgassing_cols])
    df['performance_score'] = (1 - df['TML']) + (1 - df['CVCM']) + (1 - df['WVR']) + noise
    return df

# Standardize the features and attach the synthetic target. No seed is set
# here for the noise drawn inside feature_engineering, so unless NumPy's
# global RNG was seeded earlier the scores vary from run to run.
material_df = feature_engineering(material_df)

# Regression pipeline. There is no scaler step because TML, CVCM and WVR
# were already standardized by feature_engineering.
linear_regression_pipeline = Pipeline(
    steps=[('linear_regression', LinearRegression())],
)

# Classification pipeline, likewise a single estimator step.
logistic_regression_pipeline = Pipeline(
    steps=[('logistic_regression', LogisticRegression())],
)

# Linear regression: predict performance_score from the three outgassing columns
X_lr = material_df[['TML', 'CVCM', 'WVR']]
y_lr = material_df['performance_score']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X_lr,
    y_lr,
    test_size=0.2,
    random_state=1,
)

# Fit on the training rows, then score the predictions for the held-out rows
y_pred_lr = linear_regression_pipeline.fit(X_train_lr, y_train_lr).predict(X_test_lr)
lr_mse = mean_squared_error(y_true=y_test_lr, y_pred=y_pred_lr)
lr_r2 = r2_score(y_true=y_test_lr, y_pred=y_pred_lr)

print(f'Linear Regression Mean Squared Error: {lr_mse}')
print(f'Linear Regression R_squared: {lr_r2}')

# Logistic regression: classify each row into a performance tertile.
# pd.qcut cuts performance_score into three quantile-based bins.
material_df['performance_category'] = pd.qcut(
    material_df['performance_score'],
    q=3,
    labels=['Low', 'Medium', 'High'],
)

# Integer class codes follow the label order: Low -> 0, Medium -> 1, High -> 2
y_logreg = (
    material_df['performance_category']
    .astype('category')
    .cat.codes
)

# Same feature frame, test_size and seed as the regression split
X_train_logreg, X_test_logreg, y_train_logreg, y_test_logreg = train_test_split(
    X_lr,
    y_logreg,
    test_size=0.2,
    random_state=1,
)

# Fit the classifier, then evaluate it on the held-out rows
y_pred_logreg = logistic_regression_pipeline.fit(X_train_logreg, y_train_logreg).predict(X_test_logreg)
logreg_accuracy = accuracy_score(y_true=y_test_logreg, y_pred=y_pred_logreg)
conf_matrix = confusion_matrix(y_true=y_test_logreg, y_pred=y_pred_logreg)
class_report = classification_report(y_true=y_test_logreg, y_pred=y_pred_logreg)

print(f'Logistic Regression Accuracy: {logreg_accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')


Linear Regression Mean Squared Error: 0.4250415286116711
Linear Regression R_squared: 0.9164540130088195
Logistic Regression Accuracy: 0.8042328042328042
Confusion Matrix:
[[446  62   0]
 [ 44 364  92]
 [  0  98 406]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       508
           1       0.69      0.73      0.71       500
           2       0.82      0.81      0.81       504

    accuracy                           0.80      1512
   macro avg       0.81      0.80      0.81      1512
weighted avg       0.81      0.80      0.81      1512

