# Import Required Libraries
Import libraries such as pandas, numpy, matplotlib, and seaborn for data analysis and visualization.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
# `sns.set` is a legacy alias of `sns.set_theme` that seaborn documents as
# possibly being removed; the preferred name applies exactly the same theme.
sns.set_theme(style="whitegrid")

# Load Dataset
Load the dataset into a pandas DataFrame and display basic information such as shape, column names, and data types.

In [None]:
# Load Dataset
# Read the CSV into `df`, the DataFrame every later cell works on.
file_path = "../data/software_defects.csv"  # Update with the actual dataset path
df = pd.read_csv(file_path)

# Display basic information: dimensions, column labels and per-column dtypes
for heading, details in (
    ("Dataset Shape:", df.shape),
    ("\nColumn Names:", df.columns.tolist()),
    ("\nData Types:\n", df.dtypes),
):
    print(heading, details)

# Last expression of the cell, so the notebook renders the leading rows
df.head()

# Analyze Distributions
Visualize the distributions of numerical features using histograms and KDE plots.

In [None]:
# Analyze Distributions
# Columns stored as int64 or float64 are treated as the numerical features.
numerical_features = df.select_dtypes(include=["int64", "float64"]).columns

# One figure per feature: a 30-bin histogram with a KDE curve drawn on top
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(df[feature], kde=True, bins=30, color="blue", ax=ax)
    ax.set_title(f"Distribution of {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel("Frequency")
    plt.show()

# Generate Correlation Matrix
Compute the correlation matrix of the numeric features and visualize it with seaborn's heatmap to identify relationships between them.

In [None]:
# Generate Correlation Matrix
# Correlation is only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns on its own and raises
# on e.g. string columns, so restrict the frame to numeric and boolean
# columns explicitly before computing the matrix.
numeric_df = df.select_dtypes(include=["number", "bool"])
correlation_matrix = numeric_df.corr()

# Plot heatmap
if correlation_matrix.empty:
    # No numeric columns means there is nothing to draw.
    print("No numeric columns available for the correlation matrix.")
else:
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
    plt.title("Correlation Matrix")
    plt.show()

# Detect Outliers
Use boxplots and statistical methods to detect and visualize outliers in the dataset.

In [None]:
# Detect Outliers
# Visual check: one boxplot per numerical feature.
# Statistical check: the IQR rule. A value is flagged as an outlier when it
# lies more than IQR_MULTIPLIER * IQR below the first quartile or above the
# third quartile. 1.5 matches the default whisker length of seaborn's boxplot,
# so the counts below correspond to the points drawn beyond the whiskers.
IQR_MULTIPLIER = 1.5

outlier_rows = []
for feature in numerical_features:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=df[feature], color="orange")
    plt.title(f"Boxplot of {feature}")
    plt.xlabel(feature)
    plt.show()

    # Missing values carry no information about spread; leave them out
    values = df[feature].dropna()
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - IQR_MULTIPLIER * iqr
    upper_bound = q3 + IQR_MULTIPLIER * iqr
    outlier_count = int(((values < lower_bound) | (values > upper_bound)).sum())
    outlier_rows.append(
        {
            "feature": feature,
            "lower_bound": lower_bound,
            "upper_bound": upper_bound,
            "outlier_count": outlier_count,
            # Guard against a column that holds only missing values
            "outlier_pct": 100 * outlier_count / len(values) if len(values) else 0.0,
        }
    )

# Explicit column list keeps the summary well-formed even with no numerical features
outlier_summary = pd.DataFrame(
    outlier_rows,
    columns=["feature", "lower_bound", "upper_bound", "outlier_count", "outlier_pct"],
).set_index("feature")
print("IQR-based outlier summary:")
print(outlier_summary.to_string())

# Analyze Class Distributions
Examine the distribution of the target variable (e.g., defect vs. no defect) using a bar plot and a pie chart.

In [None]:
# Analyze Class Distributions
target_variable = "defect_status"  # Update with the actual target column name

# Fail early with an actionable message when the placeholder name was not updated
if target_variable not in df.columns:
    raise KeyError(
        f"Target column {target_variable!r} not found; "
        f"available columns: {df.columns.tolist()}"
    )

# Count once and share class order and colours between both charts, so a class
# has the same position and colour in the bar plot and in the pie chart.
class_counts = df[target_variable].value_counts()
class_order = class_counts.index.tolist()
# Size the palette to the number of classes; the default palette length is
# fixed and would repeat colours when there are more classes than that.
class_palette = sns.color_palette("viridis", n_colors=len(class_counts))

# Bar plot for class distribution
plt.figure(figsize=(8, 5))
# Passing `palette` without `hue` is deprecated in recent seaborn releases, so
# the class column is assigned to `hue` as well. `dodge=False` keeps one
# full-width bar per class on seaborn versions that would otherwise dodge.
ax = sns.countplot(
    data=df,
    x=target_variable,
    hue=target_variable,
    order=class_order,
    hue_order=class_order,
    palette=class_palette,
    dodge=False,
)
# The hue legend only repeats the x-axis labels; remove it if one was drawn
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title("Class Distribution")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

# Pie chart for class distribution
plt.figure(figsize=(8, 8))
plt.pie(
    class_counts,
    labels=class_counts.index,
    autopct="%1.1f%%",
    startangle=90,
    colors=class_palette,
)
plt.title("Class Distribution")
plt.show()