In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from ydata_profiling import ProfileReport

In [None]:
# Display option first, then load the raw CSV that the later cells read as `data`.
pd.set_option('display.expand_frame_repr', False)
data = pd.read_csv('data/Breast_Cancer.csv')

# First look at the frame: leading rows, describe() summary, info() listing.
print("\nSample Data:", data.head(), sep="\n")
print("\nDescriptive Statistics:", data.describe(), sep="\n")
print("\nDataset Info:")
data.info()

In [None]:
# Null count for every column of the raw frame.
null_counts = data.isnull().sum()
print("\nColumns with Missing Values:", null_counts, sep="\n")

# Number of rows flagged by DataFrame.duplicated().
duplicate_rows = data.duplicated().sum()
print("\nRows with duplicated values:", duplicate_rows)

In [None]:
# Histogram grid (20 bins per panel) over the numeric columns only.
print("\nDistribution of numerical variable:")
numeric_frame = data.select_dtypes(include=np.number)
numeric_frame.hist(bins=20, figsize=(15, 10))
plt.suptitle('Distribution of numerical variables')
plt.show()

In [None]:
# One count plot per object-dtype column, each drawn on its own figure.
print("\nDistribution of categorical variable:")
categorical_cols = data.select_dtypes(include=['object']).columns
for cat_name in categorical_cols:
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.countplot(data=data, x=cat_name, ax=ax)
    ax.set_title(f'Distribution of categorical variable: {cat_name}')
    # Tilt the category labels on the x axis by 45 degrees.
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# One box plot per numeric column, each drawn on its own figure.
print("\nBox plots for numeric variables:")
numerical_cols = data.select_dtypes(include=np.number).columns
for num_name in numerical_cols:
    plt.figure(figsize=(12, 4))
    box_ax = sns.boxplot(x=data[num_name])
    box_ax.set_title(f'Box plot for {num_name}')
    plt.show()

In [None]:
# Encode on a copy so `data` keeps its original values for the later cells.
data_encoded = data.copy()

encoder = LabelEncoder()

# Collect the object-dtype columns first, then replace each with integer codes.
object_columns = [
    name for name in data_encoded.columns
    if data_encoded[name].dtype == 'object'
]
# The same encoder instance is re-fitted on every column in turn.
for name in object_columns:
    data_encoded[name] = encoder.fit_transform(data_encoded[name])

In [None]:
# Pairwise correlations of the label-encoded frame, drawn as an annotated heatmap.
print("\nCorrelation matrix:")
correlation_matrix = data_encoded.corr()
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
)
heatmap_ax.set_title('Correlation matrix')
plt.show()

In [None]:
# Target is the encoded 'Status' column; every other encoded column is a feature.
y = data_encoded['Status']
X = data_encoded.drop(columns=['Status'])

# Fixed random_state so repeated runs give the same forest.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Label each importance with its column name and rank from highest to lowest.
importances = pd.Series(model.feature_importances_, index=X.columns)
importances_sorted = importances.sort_values(ascending=False)

print("Features Importance in model:")
for feature_name, weight in importances_sorted.items():
    print(f"{feature_name}: {weight:.4f}")

In [None]:
# Build the ydata-profiling report for the raw (un-encoded) frame and save it as HTML.
print("\nGenerating EDA Report")
report_path = Path("documentation/eda_report.html")
# Create the output folder before profiling: otherwise a missing directory
# only surfaces as an error at write time, after the report has been computed.
report_path.parent.mkdir(parents=True, exist_ok=True)
profile = ProfileReport(data, title="EDA Report", explorative=True)
profile.to_file(report_path)

In [None]:
# 70/30 split of the raw frame with a fixed seed.
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Report both shapes first ...
for label, part in (
    ("Training Data Shape:", train_data),
    ("Testing Data Shape:", test_data),
):
    print(label, part.shape)

# ... then write each part to CSV without the index column.
for part, csv_path in (
    (train_data, "data/train_data.csv"),
    (test_data, "data/test_data.csv"),
):
    part.to_csv(csv_path, index=False)