# In [4]:
# ============================================
# LAB 02 – EXPLORATORY DATA ANALYSIS (EDA)
# Milling Dataset
# ============================================

# -------------------------------
# 1. Import Libraries
# -------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults: 8x5 figures drawn with seaborn's "whitegrid" style.
plt.rcParams.update({"figure.figsize": (8, 5)})
sns.set_style(style="whitegrid")

# -------------------------------
# 2. Load Dataset
# -------------------------------
# Location of the milling CSV. Edit this single constant if the file lives
# somewhere else (in Colab, upload the file to the session first).
DATA_PATH = "milling_dataset.csv"

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Keep the same exception type, but tell the user how to fix the problem
    # instead of surfacing only the bare OS error.
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. Upload the file to the working "
        "directory or set DATA_PATH to the correct file path."
    ) from err

# Quick look at the raw data: both ends of the table, column dtypes and size.
print("\nFirst 5 Rows:\n")
print(df.head())

print("\nLast 5 Rows:\n")
print(df.tail())

print("\nData Types:\n")
print(df.dtypes)

print("\nDataset Shape (Rows, Columns):\n")
print(df.shape)


# ============================================
# TASK 2 – DATA CLEANING
# ============================================

# -------------------------------
# 3. Missing Values
# -------------------------------
print("\nMissing Values Before Cleaning:\n")
print(df.isnull().sum())

# Fill numeric columns with median.
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on df[col]: that chained form operates on an
# intermediate object, raises a FutureWarning on pandas 2.x and leaves df
# unchanged when Copy-on-Write is enabled.
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical columns with mode
for col in df.select_dtypes(include="object").columns:
    modes = df[col].mode()
    if modes.empty:
        # Column has no non-missing values, so there is no mode to fill with;
        # leave it untouched instead of failing on modes[0].
        continue
    df[col] = df[col].fillna(modes.iloc[0])

print("\nMissing Values After Cleaning:\n")
print(df.isnull().sum())


# -------------------------------
# 4. Remove Duplicates
# -------------------------------
# Report how many rows repeat an earlier row, then drop those repeats.
duplicate_count = df.duplicated().sum()
print("\nDuplicate Rows:", duplicate_count)

df = df.drop_duplicates()
print("Shape After Removing Duplicates:", df.shape)


# ============================================
# TASK 3 – STATISTICAL ANALYSIS
# ============================================

print("\n========== CENTER ==========")

# Central-tendency measures computed over the numeric columns only.
mean_vals = df.mean(numeric_only=True)
median_vals = df.median(numeric_only=True)
mode_vals = df.mode(numeric_only=True)

# The mode result is a table, so only its leading rows are printed.
for label, stat in (
    ("Mean", mean_vals),
    ("Median", median_vals),
    ("Mode", mode_vals.head()),
):
    print(f"\n{label}:\n", stat)


print("\n========== SPREAD ==========")

# Dispersion measures computed over the numeric columns only.
variance = df.var(numeric_only=True)

col_max = df.max(numeric_only=True)
col_min = df.min(numeric_only=True)
range_vals = col_max - col_min

# Interquartile range: 75th percentile minus 25th percentile.
Q1 = df.quantile(q=0.25, numeric_only=True)
Q3 = df.quantile(q=0.75, numeric_only=True)
IQR = Q3 - Q1

for label, stat in (
    ("Variance", variance),
    ("Range", range_vals),
    ("IQR", IQR),
):
    print(f"\n{label}:\n", stat)

# Column whose variance is the largest.
highest_variability = variance.idxmax()
print("\nSensor with Highest Variability:", highest_variability)


# ============================================
# DISTRIBUTION ANALYSIS (Tool Wear)
# ============================================

# Only runs when the dataset actually has a Tool_Wear column.
if "Tool_Wear" in df.columns:
    tool_wear = df["Tool_Wear"]

    print("\nAnalyzing Tool_Wear Distribution...")

    # Histogram (30 bins)
    hist_fig, hist_ax = plt.subplots()
    hist_ax.hist(tool_wear, bins=30)
    hist_ax.set_title("Histogram of Tool Wear")
    hist_ax.set_xlabel("Tool Wear")
    hist_ax.set_ylabel("Frequency")
    plt.show()

    # KDE (filled density curve)
    kde_fig, kde_ax = plt.subplots()
    sns.kdeplot(tool_wear, fill=True, ax=kde_ax)
    kde_ax.set_title("KDE Plot of Tool Wear")
    plt.show()

    # Print both centre measures side by side for comparison.
    print("Mean Tool Wear:", tool_wear.mean())
    print("Median Tool Wear:", tool_wear.median())


# ============================================
# OUTLIER DETECTION (IQR Method)
# ============================================

if "Tool_Wear" not in df.columns:
    # Nothing to filter on: continue with an independent copy of the data.
    df_clean = df.copy()

else:
    wear = df["Tool_Wear"]

    Q1 = wear.quantile(0.25)
    Q3 = wear.quantile(0.75)
    IQR_value = Q3 - Q1

    # Fences at 1.5 * IQR below Q1 and above Q3.
    lower_bound = Q1 - 1.5 * IQR_value
    upper_bound = Q3 + 1.5 * IQR_value

    # Rows strictly outside the fences are the outliers.
    below_fence = wear < lower_bound
    above_fence = wear > upper_bound
    outliers = df[below_fence | above_fence]

    print("\nNumber of Outliers (IQR Method):", len(outliers))

    # Remove outliers: keep rows within the fences (both ends inclusive).
    df_clean = df[wear.between(lower_bound, upper_bound)]

    print("Shape After Removing Outliers:", df_clean.shape)


# ============================================
# TASK 4 – CORRELATION ANALYSIS
# ============================================

print("\n========== CORRELATION MATRIX ==========")

# Pairwise correlations between the numeric columns of the cleaned data.
corr_matrix = df_clean.corr(numeric_only=True)
print(corr_matrix)

# Annotated heatmap of the full matrix.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

# Rank columns by the magnitude of their correlation with Tool_Wear.
if "Tool_Wear" in corr_matrix.columns:
    abs_corr = corr_matrix["Tool_Wear"].abs()
    tool_corr = abs_corr.sort_values(ascending=False)
    print("\nCorrelation with Tool_Wear:\n")
    print(tool_corr)


# ============================================
# TASK 5 – SCATTER PLOT
# ============================================

if "Tool_Wear" in df_clean.columns:

    numeric_cols = df_clean.select_dtypes(include=np.number).columns.tolist()

    # Pick the first numeric feature that is not Tool_Wear itself (if any).
    x_feature = next(
        (name for name in numeric_cols if name != "Tool_Wear"),
        None,
    )

    if x_feature is not None:
        scatter_fig, scatter_ax = plt.subplots()
        scatter_ax.scatter(df_clean[x_feature], df_clean["Tool_Wear"])
        scatter_ax.set_xlabel(x_feature)
        scatter_ax.set_ylabel("Tool Wear")
        scatter_ax.set_title(f"Tool Wear vs {x_feature}")
        plt.show()


# ============================================
# ENGINEERING INTERPRETATION (AUTO PRINT)
# ============================================

print("\n========== ENGINEERING INSIGHTS ==========")

# Fixed interpretation notes, printed only when Tool_Wear is available.
if "Tool_Wear" not in df_clean.columns:
    print("Tool_Wear column not found. Check dataset column names.")
else:
    insights = (
        "• Tool wear tends to increase with features that show strong positive correlation.",
        "• The most predictive feature is the one with highest absolute correlation.",
        "• The most stable sensor is the one with lowest variance.",
        "• Tool wear can be predicted using regression or machine learning models.",
    )
    print(*insights, sep="\n")

# Cell output: FileNotFoundError: [Errno 2] No such file or directory: 'milling_dataset.csv'