In [12]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('cancer.csv')

# Create DataFrame
df2 = pd.DataFrame(df)

"""# Drop the last column
df = df.drop(df.columns[-1], axis=1)"""

# Apply Label Encoding to the diagnosis column
label_encoder = LabelEncoder()
df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])

In [13]:
# checking missing value
missing_values=df.isna().sum()
print(missing_values)

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [14]:
# Check for duplicated rows
duplicate_rows = df2[df2.duplicated()]

if not duplicate_rows.empty:
    print("Duplicate rows values found! Removing duplicates...")
    
else:
    print("No duplicate rows values found.")



No duplicate rows values found.


In [15]:
# Check for duplicate values in the 'id' column
duplicate_ids = df.duplicated(subset=['id'])

# Check if there are any duplicate 'id' values
if duplicate_ids.any():
    print("Duplicate 'id' values found!")
else:
    print("No duplicate 'id' values found.")

No duplicate 'id' values found.


In [16]:
# outliers removing

# Iterate through each column
for col in df.columns:
    if col != 'diagnosis':  # Exclude 'diagnosis' column
        if np.issubdtype(df[col].dtype, np.number):  # Check if column is numeric
            # Define function to remove outliers based on mean for each column
            def remove_outliers_based_on_mean(column):
                mean = np.nanmean(column)  # Use np.nanmean() to handle NaN values
                std_dev = np.nanstd(column)
                lower_bound = mean - (2.5 * std_dev)
                upper_bound = mean + (2.5 * std_dev)
                filtered_column = column[(column >= lower_bound) & (column <= upper_bound)]
                return filtered_column

            # Apply the function to remove outliers from the column
            df[col] = remove_outliers_based_on_mean(df[col])

# Drop rows with any NaN values after removing outliers
df.dropna(inplace=True)

num_rows = df.shape[0]
print("Number of rows in the DataFrame after removing outliers:", num_rows)


Number of rows in the DataFrame after removing outliers: 451


In [17]:
# checking missing value
missing_values=df.isna().sum()
print(missing_values)

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [18]:
# Discretization

# Create DataFrame
df2 = pd.DataFrame(df)

# Define number of bins
num_bins = 3

# Iterate through each column
for col in df2.columns:
    # Exclude 'diagnosis' and 'id' columns
    if col not in ['diagnosis', 'id']:
        # Perform discretization using the cut function
        df2[f'discretized_{col}'] = pd.cut(df2[col], bins=num_bins, labels=False, duplicates='drop')


print("Modified DataFrame saved to 'cancer_discretized.csv'")

# Save the modified DataFrame to a new CSV file
df2.to_csv('cancer_preprossing.csv', index=False)


Modified DataFrame saved to 'cancer_discretized.csv'


## The reason for using Encodeing
We encode the classified column to convert categorical data into a numerical format that can be easily used for analysis or machine learning algorithms and to be ensure that the classification task can be performed accurately and efficiently on the dataset.

## The reason for cleaning data set
Cleaning involves handling missing values, removing duplicates, correcting errors, and dealing with outliers. 
By cleaning the dataset, we aim to improve data quality, accuracy, and the robustness of any insights or predictions derived from it.

## The reason for using Discretization 
Discretization simplifies the process of representing data, making it easier to analyze, interpret, and make decisions, and can be useful in some machine learning algorithms.