In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')  # Ignoring warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the dataset
df = pd.read_csv("defects_data.csv")

# Display basic information about the dataset
print(df.info())

# Check for missing values
missing_values = df.isnull().sum()
if missing_values.any():
    print("Missing values detected in the dataset:")
    print(missing_values)
    
    # Fill missing values with the mean of each column
    df.fillna(df.mean(), inplace=True)
    print("Missing values have been filled with column means.")
else:
    print("No missing values detected in the dataset.")

# Drop irrelevant columns
df = df.drop(columns=['defect_id', 'product_id'])

# Encode categorical target variable
label_encoder = LabelEncoder()
df['severity'] = label_encoder.fit_transform(df['severity'])

# Separate features and target
X = df.drop(columns=['severity'])
y = df['severity']

# Define categorical and numerical features
categorical_features = ['defect_type', 'defect_date', 'defect_location', 'inspection_method']
numerical_features = ['repair_cost']

# Apply One-Hot Encoding to categorical features and scale numerical features
column_transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaler', StandardScaler(), numerical_features)
])

X_transformed = column_transformer.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

print(df.head())

print("Preprocessing complete. Data is ready for modeling!")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   defect_id          1000 non-null   int64  
 1   product_id         1000 non-null   int64  
 2   defect_type        1000 non-null   object 
 3   defect_date        1000 non-null   object 
 4   defect_location    1000 non-null   object 
 5   severity           1000 non-null   object 
 6   inspection_method  1000 non-null   object 
 7   repair_cost        1000 non-null   float64
dtypes: float64(1), int64(2), object(5)
memory usage: 62.6+ KB
None
No missing values detected in the dataset.
  defect_type defect_date defect_location  severity  inspection_method  \
0  Structural    6/6/2024       Component         1  Visual Inspection   
1  Functional   4/26/2024       Component         1  Visual Inspection   
2  Structural   2/15/2024        Internal         1  Automated Testing   
3  Funct

Training Accuracy: 1.0000
Validation Accuracy (Cross-Validation Mean): 0.3263
Validation Accuracy Std Dev: 0.0286
R² Score on Test Data: 0.3000


In [7]:
# Re-import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Train Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Training Accuracy
train_accuracy = rf_model.score(X_train, y_train)

# Evaluate model using cross-validation
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')

# Calculate R-squared on test data
r_squared = rf_model.score(X_test, y_test)

# Display results
cv_scores_mean = np.mean(cv_scores)
cv_scores_std = np.std(cv_scores)

# Print out results
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy (Cross-Validation Mean): {cv_scores_mean:.4f}")
print(f"Validation Accuracy Std Dev: {cv_scores_std:.4f}")
print(f"R² Score on Test Data: {r_squared:.4f}")

Training Accuracy: 1.0000
Validation Accuracy (Cross-Validation Mean): 0.3263
Validation Accuracy Std Dev: 0.0286
R² Score on Test Data: 0.3000


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

# Define parameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [10, 20, None],  # Tree depth
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split
    'min_samples_leaf': [1, 2, 5],  # Minimum samples required at a leaf
    'max_features': ['sqrt', 'log2'],  # Consider subset of features
}

# Perform Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train with best hyperparameters
rf_model = RandomForestClassifier(**best_params, random_state=42)
rf_model.fit(X_train, y_train)

# Training Accuracy
train_accuracy = rf_model.score(X_train, y_train)

# Evaluate model using cross-validation
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')

# Calculate R² on test data
r_squared = rf_model.score(X_test, y_test)

# Display results
cv_scores_mean = np.mean(cv_scores)
cv_scores_std = np.std(cv_scores)

# Print out results
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy (Cross-Validation Mean): {cv_scores_mean:.4f}")
print(f"Validation Accuracy Std Dev: {cv_scores_std:.4f}")
print(f"R² Score on Test Data: {r_squared:.4f}")

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Training Accuracy: 0.6250
Validation Accuracy (Cross-Validation Mean): 0.3563
Validation Accuracy Std Dev: 0.0326
R² Score on Test Data: 0.3400
