####  1. Import libraries 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

#### 1. Load data

In [2]:
df = pd.read_csv('breast-cancer-dataset.csv')
df.head()

Unnamed: 0,S/N,Year,Age,Menopause,Tumor Size (cm),Inv-Nodes,Breast,Metastasis,Breast Quadrant,History,Diagnosis Result
0,1,2019,40,1,2,0,Right,0,Upper inner,0,Benign
1,2,2019,39,1,2,0,Left,0,Upper outer,0,Benign
2,3,2019,45,0,4,0,Left,0,Lower outer,0,Benign
3,4,2019,26,1,3,0,Left,0,Lower inner,1,Benign
4,5,2019,21,1,1,0,Right,0,Upper outer,1,Benign


In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213 entries, 0 to 212
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   S/N               213 non-null    int64 
 1   Year              213 non-null    object
 2   Age               213 non-null    int64 
 3   Menopause         213 non-null    int64 
 4   Tumor Size (cm)   213 non-null    object
 5   Inv-Nodes         213 non-null    object
 6   Breast            213 non-null    object
 7   Metastasis        213 non-null    object
 8   Breast Quadrant   213 non-null    object
 9   History           213 non-null    object
 10  Diagnosis Result  213 non-null    object
dtypes: int64(3), object(8)
memory usage: 18.4+ KB
None


#### 1. Data cleaning 

In [4]:
df.columns

Index(['S/N', 'Year', 'Age', 'Menopause', 'Tumor Size (cm)', 'Inv-Nodes',
       'Breast', 'Metastasis', 'Breast Quadrant', 'History',
       'Diagnosis Result'],
      dtype='object')

Having white spaces in column names can lead to potential issues in data analysis and processing

In [5]:
df.columns = df.columns.str.replace(' ', '')
print(df.columns)

Index(['S/N', 'Year', 'Age', 'Menopause', 'TumorSize(cm)', 'Inv-Nodes',
       'Breast', 'Metastasis', 'BreastQuadrant', 'History', 'DiagnosisResult'],
      dtype='object')


identify and handle non-numerical values in our numerical columns 

In [6]:
# Check for non-numerical values in the entire DataFrame
non_numeric_values = ['Year', 'Age', 'Menopause', 'TumorSize(cm)', 
                    'Inv-Nodes', 'Metastasis', 'History']

for col in non_numeric_values:
    df[col] = pd.to_numeric(df[col], errors = 'coerce')

In [7]:
num_cols = df.select_dtypes(include='number').columns

# Convert selected columns to numeric, errors='coerce' will convert non-numeric values to NaN
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

# Now you can check for NaN values in the numeric columns, which would indicate non-numeric characters
nan_values = df[num_cols].isna().sum()

print(nan_values)

S/N              0
Year             1
Age              0
Menopause        0
TumorSize(cm)    1
Inv-Nodes        1
Metastasis       1
History          2
dtype: int64


Our remaining columns in our dataset only take two arguments ie. 0, 1,  "left" or "right".Next step is to assert that this is indeed true.

##### . DiagnosisResult

In [8]:
df["DiagnosisResult"].unique()

array(['Benign', 'Malignant'], dtype=object)

In Diagnosis result we do not have  a third argumnet  

##### . BreastQuadrant

In [9]:
df['BreastQuadrant'].unique()

array(['Upper inner', 'Upper outer', 'Lower outer', 'Lower inner', '#',
       'Upper outer '], dtype=object)

Above we have found a character '#' as well as a second Upper outer only the difference between the upper outer it has white space. we will replace the # with NaN value the proceed to eliminate the white space so as to eliminate the difference.

In [10]:
#eliminate white space
df['BreastQuadrant'] = df['BreastQuadrant'].str.replace(' ', '')

In [11]:
#replace '#'with NaN value
df['BreastQuadrant'] = df['BreastQuadrant'].apply(lambda x: np.nan if x == '#' else x)

##### . Metastasis

In [12]:
df['Metastasis'].unique()

array([ 0.,  1., nan])

In [13]:
#replace '#' with NaN value
df['Metastasis'] = df['Metastasis'].apply(lambda x: np.nan if x == '#' else x)

##### . Breast

In [14]:
df['Breast'].unique()

array(['Right', 'Left', '#'], dtype=object)

In [15]:
df['Breast'] = df['Breast'].apply(lambda x: np.nan if x== '#' else x)

##### . Inv-Nodes

In [16]:
df['Inv-Nodes'].unique()

array([ 0.,  1., nan,  3.])

In [17]:
df['Inv-Nodes'] = df['Inv-Nodes'].apply(lambda x: np.nan if x=='#'else x)

In [18]:
df['Inv-Nodes'] = df['Inv-Nodes'].apply(lambda x: np.nan if x=='3'else x)

##### .Menopause

In [19]:
df['Menopause'].unique()

array([1, 0])

finally check for duplicate rows

In [20]:
duplicate_rows = df[df.duplicated()]
print(duplicate_rows)

Empty DataFrame
Columns: [S/N, Year, Age, Menopause, TumorSize(cm), Inv-Nodes, Breast, Metastasis, BreastQuadrant, History, DiagnosisResult]
Index: []


Before proceeding with our analysis we will drop the first column as it serves no purpose to the project 

In [21]:
df = df.drop('S/N', axis = 1)
df.head()

Unnamed: 0,Year,Age,Menopause,TumorSize(cm),Inv-Nodes,Breast,Metastasis,BreastQuadrant,History,DiagnosisResult
0,2019.0,40,1,2.0,0.0,Right,0.0,Upperinner,0.0,Benign
1,2019.0,39,1,2.0,0.0,Left,0.0,Upperouter,0.0,Benign
2,2019.0,45,0,4.0,0.0,Left,0.0,Lowerouter,0.0,Benign
3,2019.0,26,1,3.0,0.0,Left,0.0,Lowerinner,1.0,Benign
4,2019.0,21,1,1.0,0.0,Right,0.0,Upperouter,1.0,Benign


Check for missing number of missing values in each column

In [22]:
df_null_values = df.isnull().sum().to_frame().rename(columns={'count': 'missing_count'})
print(df_null_values)

                 0
Year             1
Age              0
Menopause        0
TumorSize(cm)    1
Inv-Nodes        1
Breast           6
Metastasis       1
BreastQuadrant   2
History          2
DiagnosisResult  0


In [23]:
null_values = df[df.isnull().any(axis=1)]
print(null_values)

       Year  Age  Menopause  TumorSize(cm)  Inv-Nodes Breast  Metastasis  \
30   2019.0   56          0            9.0        1.0   Left         1.0   
40      NaN   34          1            NaN        NaN    NaN         NaN   
47   2019.0   25          1            5.0        0.0    NaN         0.0   
67   2019.0   40          1            1.0        0.0   Left         0.0   
143  2020.0   29          1            2.0        0.0    NaN         0.0   
164  2020.0   38          1            2.0        0.0    NaN         0.0   
166  2020.0   62          0            3.0        1.0    NaN         1.0   
178  2020.0   49          1            4.0        0.0    NaN         0.0   

    BreastQuadrant  History DiagnosisResult  
30             NaN      0.0       Malignant  
40             NaN      NaN       Malignant  
47      Upperouter      0.0       Malignant  
67      Lowerouter      NaN          Benign  
143     Lowerinner      1.0          Benign  
164     Upperouter      1.0          Be

we will drop the above rowns since this amount is minimal therefor may not affect our model.

In [24]:
df = df.dropna()

In [26]:
#confirm that we have no more missing values
df.isnull().sum()

Year               0
Age                0
Menopause          0
TumorSize(cm)      0
Inv-Nodes          0
Breast             0
Metastasis         0
BreastQuadrant     0
History            0
DiagnosisResult    0
dtype: int64

In [27]:
df2 = df.copy()
df2.head()

Unnamed: 0,Year,Age,Menopause,TumorSize(cm),Inv-Nodes,Breast,Metastasis,BreastQuadrant,History,DiagnosisResult
0,2019.0,40,1,2.0,0.0,Right,0.0,Upperinner,0.0,Benign
1,2019.0,39,1,2.0,0.0,Left,0.0,Upperouter,0.0,Benign
2,2019.0,45,0,4.0,0.0,Left,0.0,Lowerouter,0.0,Benign
3,2019.0,26,1,3.0,0.0,Left,0.0,Lowerinner,1.0,Benign
4,2019.0,21,1,1.0,0.0,Right,0.0,Upperouter,1.0,Benign
