In [39]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('cancer.csv')

# Create DataFrame
df2 = pd.DataFrame(df)


# Apply Label Encoding to the diagnosis column
label_encoder = LabelEncoder()
df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])

In [40]:
# checking missing value
missing_values=df.isna().sum()
print(missing_values)

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [41]:
# Check for duplicated rows
duplicate_rows = df2[df2.duplicated()]

if not duplicate_rows.empty:
    print("Duplicate rows values found! Removing duplicates...")
    
else:
    print("No duplicate rows values found.")

No duplicate rows values found.


In [42]:
# Check for duplicate values in the 'id' column
duplicate_ids = df.duplicated(subset=['id'])

# Check if there are any duplicate 'id' values
if duplicate_ids.any():
    print("Duplicate 'id' values found!")
else:
    print("No duplicate 'id' values found.")

No duplicate 'id' values found.


In [43]:
# outliers removing

# Iterate through each column
for col in df.columns:
    if col != 'diagnosis':  # Exclude 'diagnosis' column
        if np.issubdtype(df[col].dtype, np.number):  # Check if column is numeric
            # Define function to remove outliers based on mean for each column
            def remove_outliers_based_on_mean(column):
                mean = np.nanmean(column)  # Use np.nanmean() to handle NaN values
                std_dev = np.nanstd(column)
                lower_bound = mean - (2.5 * std_dev)
                upper_bound = mean + (2.5 * std_dev)
                filtered_column = column[(column >= lower_bound) & (column <= upper_bound)]
                return filtered_column

            # Apply the function to remove outliers from the column
            df[col] = remove_outliers_based_on_mean(df[col])

# Drop rows with any NaN values after removing outliers
df.dropna(inplace=True)

num_rows = df.shape[0]
print("Number of rows in the DataFrame after removing outliers:", num_rows)

Number of rows in the DataFrame after removing outliers: 451


In [44]:
# checking missing value
missing_values=df.isna().sum()
print(missing_values)

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [45]:
# Discretization

# Create DataFrame
df2 = pd.DataFrame(df)

# Define number of bins
num_bins = 3

# Iterate through each column
for col in df2.columns:
    # Exclude 'diagnosis' and 'id' columns
    if col not in ['diagnosis', 'id']:
        # Perform discretization using the cut function
        df2[f'discretized_{col}'] = pd.cut(df2[col], bins=num_bins, labels=False, duplicates='drop')



In [46]:
from sklearn.feature_selection import RFE
#from sklearn.datasets import load_cancer
from sklearn.linear_model import LogisticRegression

In [47]:
X = df2.drop(columns=['id', 'diagnosis'])  # Features (excluding 'id' and 'diagnosis' columns)
y = df2['diagnosis']  # Target (diagnosis column)

In [48]:
model=LogisticRegression()
rfe=RFE(model, n_features_to_select=2)
new=rfe.fit_transform(X,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [49]:
selected_features=[i for i, mask in enumerate(rfe.support_) if mask]
print("selected features:",selected_features)

selected features: [20, 37]


In [50]:
#normalized data
from sklearn.preprocessing import StandardScaler
coulmns_to_normalize=['radius_mean','texture_mean','perimeter_mean','area_mean','texture_worst','perimeter_worst','area_worst']
data_to_normalize=df2[coulmns_to_normalize]

In [51]:
zscore_scaler=StandardScaler()
normalized_data_zscore=zscore_scaler.fit_transform(data_to_normalize)

In [52]:
df2[coulmns_to_normalize]=normalized_data_zscore

In [53]:
print("Z-score normalized data:")
print(df2)


Z-score normalized data:
             id  diagnosis  radius_mean  texture_mean  perimeter_mean  \
1      842517.0          1     2.385527     -0.243112        2.249570   
2    84300903.0          1     2.077552      0.645516        2.100705   
4    84358402.0          1     2.287535     -1.118971        2.362502   
5      843786.0          1    -0.456240     -0.771692       -0.334009   
6      844359.0          1     1.573593      0.321218        1.566844   
..          ...        ...          ...           ...             ...   
556    924964.0          0    -1.257674      0.221630       -1.249786   
558    925277.0          0     0.292699      1.010670        0.375410   
560    925292.0          0     0.103714      2.152096        0.118233   
565    926682.0          1     2.231539      2.432984        2.162304   
566    926954.0          1     0.996141      2.389574        0.986784   

     area_mean  smoothness_mean  compactness_mean  concavity_mean  \
1     2.702274          0.084

In [54]:
#save the modified dataframe to new csv file 
df2.to_csv('cancer_preprossing.csv',index=False)

## The reason for using Encodeing
We encode the classified column to convert categorical data into a numerical format that can be easily used for analysis or machine learning algorithms and to be ensure that the classification task can be performed accurately and efficiently on the dataset.

## The reason for cleaning data set
Cleaning involves handling missing values, removing duplicates, correcting errors, and dealing with outliers. 
By cleaning the dataset, we aim to improve data quality, accuracy, and the robustness of any insights or predictions derived from it

## The reason for using Discretization
Discretization simplifies the process of representing data, making it easier to analyze, interpret, and make decisions, and can be useful in some machine learning algorithms.

## The reason for using normalization 

Normalization is important because it makes our data easier for modeling and evaluation. By scaling all our features to a specific range, like between 0 and 1, normalization ensures that our numbers are in a consistent format. This consistency helps our models understand and learn from the data more effectively. It also makes it simpler to compare the importance of different features in our model. Overall, normalization makes our data more manageable and helps our models perform better

## The reason for using feature selection 
we used the wrapper method which offers a comprehensive and adaptable approach to feature selection, making it well-suited for our dataset's complexities and our modeling objectives. Its ability to optimize performance, handle complex relationships, and iteratively refine feature subsets makes it a valuable choice for identifying the most relevant features for our predictive modeling tasks.