In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')  # Ignoring warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Preprocessing script: load the defects CSV, handle missing values, encode the
# target, then build leak-free train/test feature matrices.

# Load the dataset
df = pd.read_csv("defects_data.csv")

# Display basic information about the dataset.
# NOTE: df.info() writes its report itself and returns None, so this print()
# also emits a trailing "None" line.
print(df.info())

# Check for missing values
missing_values = df.isnull().sum()
if missing_values.any():
    print("Missing values detected in the dataset:")
    print(missing_values)

    # Fill missing values with the mean of each *numeric* column.
    # numeric_only=True is required: the frame contains object (string)
    # columns, and DataFrame.mean() raises TypeError on them in pandas >= 2.0.
    # Non-numeric columns have no mean and are left untouched here.
    numeric_means = df.mean(numeric_only=True)
    df = df.fillna(numeric_means)
    print("Missing values have been filled with column means.")
else:
    print("No missing values detected in the dataset.")

# Drop irrelevant columns (pure identifiers carry no predictive signal)
df = df.drop(columns=['defect_id', 'product_id'])

# Encode categorical target variable as integer class labels
label_encoder = LabelEncoder()
df['severity'] = label_encoder.fit_transform(df['severity'])

# Separate features and target
X = df.drop(columns=['severity'])
y = df['severity']

# Define categorical and numerical features
# NOTE(review): 'defect_date' is one-hot encoded as a raw string, giving one
# column per distinct date; consider deriving month/weekday features instead.
categorical_features = ['defect_type', 'defect_date', 'defect_location', 'inspection_method']
numerical_features = ['repair_cost']

# One-Hot Encoding for categorical features, standard scaling for numerical ones
column_transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaler', StandardScaler(), numerical_features)
])

# Split the dataset BEFORE fitting any transformer so that the scaler
# statistics and the one-hot vocabulary are learned from training rows only
# (fitting on the full data leaks test-set information into the features).
# The row partition depends only on the number of samples and random_state,
# so it matches a split performed on the already-transformed matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows, then apply the fitted transformer to the test rows.
# Categories seen only in the test rows encode as all zeros
# (handle_unknown='ignore').
X_train = column_transformer.fit_transform(X_train_raw)
X_test = column_transformer.transform(X_test_raw)

# Full feature matrix (same row order as X) produced by the train-fitted
# transformer; kept under the original name for any later cell that uses it.
X_transformed = column_transformer.transform(X)

print(df.head())

print("Preprocessing complete. Data is ready for modeling!")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   defect_id          1000 non-null   int64  
 1   product_id         1000 non-null   int64  
 2   defect_type        1000 non-null   object 
 3   defect_date        1000 non-null   object 
 4   defect_location    1000 non-null   object 
 5   severity           1000 non-null   object 
 6   inspection_method  1000 non-null   object 
 7   repair_cost        1000 non-null   float64
dtypes: float64(1), int64(2), object(5)
memory usage: 62.6+ KB
None
No missing values detected in the dataset.
  defect_type defect_date defect_location  severity  inspection_method  \
0  Structural    6/6/2024       Component         1  Visual Inspection   
1  Functional   4/26/2024       Component         1  Visual Inspection   
2  Structural   2/15/2024        Internal         1  Automated Testing   
3  Funct