<a href="https://colab.research.google.com/github/SiyandaCaddy/firstproject.com/blob/main/Problem1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

import pandas as pd
import numpy as np

# Machine Learning tools
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Load dataset
from google.colab import files
uploaded = files.upload()

Saving Question 1 datasets .csv to Question 1 datasets .csv


In [3]:
file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name)

In [5]:
# -------------------------
# a) Understanding Data
# -------------------------
print("Dataset Shape:", df.shape)
print(df.head())

# Features and Target
X = df.drop(columns=["Days to Failure"])
y = df["Days to Failure"]

Dataset Shape: (200, 6)
   Index  Temperature  Vibration  Pressure  Runtime  Days to Failure
0      0        64.98       5.93     76.19      204            669.9
1      1        88.03       4.25    124.15      665            619.4
2      2        79.28       4.48    100.32      803            688.7
3      3        73.95       6.70    119.59      881            577.7
4      4        56.24       5.82     89.20      733            595.7


In [6]:
# -------------------------
# b) Train-Test Split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
# -------------------------
# c) Random Forest Model
# -------------------------
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)

# Metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Random Forest RMSE: {rmse:.2f}")
print(f"Random Forest R²: {r2:.2f}")

# Cross-validation (k=5)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf, X, y, cv=kf, scoring="r2")
print("Cross-Validation R² Scores:", cv_scores)
print("Mean CV R²:", np.mean(cv_scores))

Random Forest RMSE: 150.02
Random Forest R²: -0.07
Cross-Validation R² Scores: [-0.06617283  0.02764739 -0.30528568 -0.08625    -0.04701766]
Mean CV R²: -0.09541575525603734


In [9]:
# -------------------------
# d) Feature Engineering Example
# -------------------------
# Add interaction term: vibration * runtime
df["Vibration_Runtime"] = df["Vibration"] * df["Runtime"]

# Polynomial features example
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(df.drop(columns=["Days to Failure"]))
X_poly_train, X_poly_test, y_poly_train, y_poly_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=42
)

# Train again with polynomial features
rf_poly = RandomForestRegressor(n_estimators=100, random_state=42)
rf_poly.fit(X_poly_train, y_poly_train)
y_poly_pred = rf_poly.predict(X_poly_test)

rmse_poly = np.sqrt(mean_squared_error(y_poly_test, y_poly_pred))
r2_poly = r2_score(y_poly_test, y_poly_pred)

print(f"RF with Feature Engineering RMSE: {rmse_poly:.2f}")
print(f"RF with Feature Engineering R²: {r2_poly:.2f}")

RF with Feature Engineering RMSE: 146.70
RF with Feature Engineering R²: -0.02
