# Import Required Libraries
Import libraries such as pandas, numpy, matplotlib, and seaborn for data analysis and visualization.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff

# Apply the seaborn "whitegrid" theme (set_theme is the preferred name for sns.set)
sns.set_theme(style="whitegrid")
# Then override matplotlib defaults: larger figures and fonts for readability.
# This runs after the theme call so the theme does not reset these values.
plt.rcParams.update({'figure.figsize': [12, 8], 'font.size': 12})

# Load Datasets
Load both JM1 and KC1 datasets from ARFF files and convert them to pandas DataFrames.

In [None]:
# Load Datasets
def load_arff_dataset(file_path):
    """Read an ARFF file into a pandas DataFrame with text columns as ``str``.

    Args:
        file_path: Location of the ARFF file, passed straight to
            ``scipy.io.arff.loadarff``.

    Returns:
        A DataFrame built from the ARFF records. Every object-dtype column
        is decoded from UTF-8 bytes to regular Python strings.
    """
    # loadarff also returns metadata, which is not needed here
    records, _ = arff.loadarff(file_path)
    frame = pd.DataFrame(records)

    # Object columns hold byte strings after loading; decode each one in place
    byte_columns = frame.select_dtypes(["object"]).columns
    for column_name in byte_columns:
        frame[column_name] = frame[column_name].str.decode("utf-8")
    return frame

# Relative paths to the two raw ARFF files
jm1_path = "../data/raw/jm1.arff"
kc1_path = "../data/raw/kc1.arff"

# Load JM1 first, then KC1, each into its own DataFrame
jm1_df, kc1_df = map(load_arff_dataset, (jm1_path, kc1_path))

# Report the dimensions of both frames, then the JM1 schema
print("JM1 Dataset Shape:", jm1_df.shape)
print("KC1 Dataset Shape:", kc1_df.shape)
print("\nJM1 Column Names:", list(jm1_df.columns))
print("\nData Types (JM1):\n", jm1_df.dtypes)
# Last expression of the cell: preview the first five JM1 rows
jm1_df.head(5)

# Analyze Class Distribution
Examine the distribution of defects in both datasets.

In [None]:
# Analyze Class Distributions for both datasets
# One figure with two side-by-side panels: JM1 on the left, KC1 on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# JM1 distribution: one bar per distinct value of the 'defects' column
jm1_counts = jm1_df['defects'].value_counts()
sns.barplot(x=jm1_counts.index, y=jm1_counts.values, ax=ax1)
ax1.set_title("JM1 Class Distribution")
ax1.set_xlabel("Defect Status")
ax1.set_ylabel("Count")

# KC1 distribution: same plot for the second dataset
kc1_counts = kc1_df['defects'].value_counts()
sns.barplot(x=kc1_counts.index, y=kc1_counts.values, ax=ax2)
ax2.set_title("KC1 Class Distribution")
ax2.set_xlabel("Defect Status")
ax2.set_ylabel("Count")

plt.tight_layout()
plt.show()

# Print class distribution percentages.
# Scale to percent first, then format every value with one decimal and its
# own "%" suffix, so the unit is attached to each number instead of trailing
# the Series repr as a single stray token.
jm1_pct = jm1_df['defects'].value_counts(normalize=True) * 100
kc1_pct = kc1_df['defects'].value_counts(normalize=True) * 100

print("JM1 Class Distribution:")
print(jm1_pct.map("{:.1f}%".format).to_string(), end="\n\n")
print("KC1 Class Distribution:")
print(kc1_pct.map("{:.1f}%".format).to_string())

# Feature Distributions
Analyze the distributions of numerical features using histograms with KDE overlays, starting with the first five features of each dataset.

In [None]:
# Feature columns: every JM1 column except the 'defects' target, in original order
numerical_features = jm1_df.columns[jm1_df.columns != 'defects'].tolist()

# Create a function to plot feature distributions
def plot_feature_distributions(feature, df1, df2, dataset1_name='JM1', dataset2_name='KC1'):
    """Show side-by-side histograms (with KDE curves) of one feature.

    Args:
        feature: Column name to plot; it is looked up in both frames.
        df1: Frame drawn in the left panel.
        df2: Frame drawn in the right panel.
        dataset1_name: Label used in the left panel title.
        dataset2_name: Label used in the right panel title.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Pair each panel with the frame and label it should display
    panels = ((df1, dataset1_name), (df2, dataset2_name))
    for axis, (frame, label) in zip(axes, panels):
        sns.histplot(frame[feature], kde=True, bins=30, ax=axis)
        axis.set_title(f"{label} - {feature}")

    plt.tight_layout()
    plt.show()

# Limit this first pass to the first 5 features
preview_features = numerical_features[:5]

# Plot distributions for each selected feature, JM1 next to KC1
for feature in preview_features:
    plot_feature_distributions(feature, jm1_df, kc1_df)

# Print basic statistics for the same features
print("JM1 Dataset Statistics:")
print(jm1_df[preview_features].describe())
print("\nKC1 Dataset Statistics:")
print(kc1_df[preview_features].describe())

# Correlation Analysis
Generate and compare correlation matrices for both datasets.

In [None]:
# Function to create correlation matrix plot
def plot_correlation_matrix(df, title):
    """Draw an annotated lower-triangle heatmap of feature correlations.

    Correlations are computed over the module-level ``numerical_features``
    columns of ``df``.

    Args:
        df: Frame containing the ``numerical_features`` columns.
        title: Text placed above the heatmap.
    """
    corr = df[numerical_features].corr()
    # Mask the upper triangle (diagonal included) so each pair is shown once
    upper_mask = np.triu(np.ones(corr.shape))

    plt.figure(figsize=(12, 8))
    heatmap_options = {
        "mask": upper_mask,
        "annot": True,
        "fmt": '.2f',
        "cmap": 'coolwarm',
        "center": 0,
        "square": True,
        "cbar_kws": {"shrink": .5},
    }
    sns.heatmap(corr, **heatmap_options)

    plt.title(title)
    # Slanted x labels; y labels stay horizontal
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

# Plot correlation matrices, JM1 first and KC1 second
correlation_plots = (
    (jm1_df, "JM1 Dataset - Feature Correlations"),
    (kc1_df, "KC1 Dataset - Feature Correlations"),
)
for frame, heading in correlation_plots:
    plot_correlation_matrix(frame, heading)

# Feature Importance Analysis
Analyze feature importance using mutual information scores.

In [None]:
from sklearn.feature_selection import mutual_info_classif

def plot_feature_importance(df, dataset_name, random_state=42):
    """Score features by mutual information with 'defects' and plot them.

    Features are taken from the module-level ``numerical_features`` list.
    Rows with a missing value in any feature or in 'defects' are left out
    of the estimate, because scikit-learn's input validation rejects NaN.

    Args:
        df: Frame containing the ``numerical_features`` columns and a
            'defects' target column.
        dataset_name: Label used in the plot title.
        random_state: Seed forwarded to ``mutual_info_classif`` so repeated
            runs give the same scores. Pass ``None`` for unseeded behaviour.

    Returns:
        DataFrame with 'Feature' and 'Importance' columns, sorted by
        'Importance' in ascending order (highest-scoring features last).
    """
    feature_names = list(numerical_features)

    # Keep only complete rows so the estimator does not raise on NaN
    complete_rows = df[[*feature_names, 'defects']].dropna()

    # Calculate mutual information scores
    mi_scores = mutual_info_classif(
        complete_rows[feature_names],
        complete_rows['defects'],
        random_state=random_state,
    )
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': mi_scores})
    importance_df = importance_df.sort_values('Importance', ascending=True)

    # Plot feature importance as horizontal bars, one per feature
    plt.figure(figsize=(10, 8))
    sns.barplot(data=importance_df, y='Feature', x='Importance')
    plt.title(f"{dataset_name} - Feature Importance (Mutual Information)")
    plt.xlabel("Mutual Information Score")
    plt.tight_layout()
    plt.show()

    return importance_df

# Plot feature importance for both datasets and keep the score tables
jm1_importance = plot_feature_importance(df=jm1_df, dataset_name="JM1")
kc1_importance = plot_feature_importance(df=kc1_df, dataset_name="KC1")

# The tables are sorted ascending by importance, so the last five rows
# are the five highest-scoring features.
print("Top 5 Most Important Features - JM1:")
print(jm1_importance.iloc[-5:])
print("\nTop 5 Most Important Features - KC1:")
print(kc1_importance.iloc[-5:])