In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the breast-cancer CSV into a DataFrame; every later cell works on `df`.
df = pd.read_csv(filepath_or_buffer='/content/breast_cancer.csv')

In [None]:
# Preview five random rows. The seed is pinned (same value as the
# train/test split) so the preview is identical on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
355,9010258,B,12.56,19.07,81.92,485.8,0.0876,0.1038,0.103,0.04391,...,22.43,89.02,547.4,0.1096,0.2002,0.2388,0.09265,0.2121,0.07188,
399,904357,B,11.8,17.26,75.26,431.9,0.09087,0.06232,0.02853,0.01638,...,24.49,86.0,562.0,0.1244,0.1726,0.1449,0.05356,0.2779,0.08121,
119,865128,M,17.95,20.01,114.2,982.0,0.08402,0.06722,0.07293,0.05596,...,27.83,129.2,1261.0,0.1072,0.1202,0.2249,0.1185,0.4882,0.06111,
165,8712291,B,14.97,19.76,95.5,690.2,0.08421,0.05352,0.01947,0.01939,...,25.82,102.3,782.1,0.1045,0.09995,0.0775,0.05754,0.2646,0.06085,
396,90401601,B,13.51,18.89,88.1,558.1,0.1059,0.1147,0.0858,0.05381,...,27.2,97.33,675.2,0.1428,0.257,0.3438,0.1453,0.2666,0.07686,


### **A. Identification and Target (Output) Columns**
- **`id`**: Unique identifier for each patient (can be ignored for analysis).
- **`diagnosis`**: Indicates whether the tumor is:
  - **M (Malignant)** – Cancerous
  - **B (Benign)** – Non-cancerous

### **B. Feature Columns (Nucleus Measurements)**
Each feature is measured in **three ways**:
- **Mean**: Average value across all detected nuclei (`_mean`)
- **Standard Error (SE)**: Variability of the measurements (`_se`)
- **Worst**: Mean of the three largest values (`_worst`)

#### **1. Shape Features**
- **`radius_*`** – Mean distance from center to perimeter (size of the nucleus).  
- **`perimeter_*`** – Total distance around the nucleus boundary.  
- **`area_*`** – Total area of the nucleus.  
- **`texture_*`** – Variation in gray-scale pixel intensity.

#### **2. Edge & Border Features**
- **`smoothness_*`** – Local variation in radius lengths (higher = more irregular edges).  
- **`compactness_*`** – Compactness of nucleus shape:  
  $$
  \text{Compactness} = \frac{\text{Perimeter}^2}{\text{Area}} - 1.0
  $$
- **`symmetry_*`** – Symmetry of the nucleus shape.  

#### **3. Concavity Features**
- **`concavity_*`** – Severity of concave portions in the nucleus.  
- **`concave points_*`** – Number of concave points on the nucleus perimeter.

#### **4. Fractal Geometry Features**
- **`fractal_dimension_*`** – Measures complexity of the cell border (higher = more irregular growth).

### **C. Unused Column**
- **`Unnamed: 32`**: Contains only NaN values and should be removed.

## **2. Important Insights**
- **Larger `radius`, `perimeter`, and `area`** → More likely to be **malignant**.
- **Higher `compactness`, `concavity`, and `concave points`** → More irregular shapes, associated with **cancer**.
- **Higher `fractal_dimension`** → More complexity in tumor structure, which may indicate malignancy.

In [None]:
# Class balance of the target: number of rows per diagnosis label.
df['diagnosis'].value_counts()

Unnamed: 0_level_0,count
diagnosis,Unnamed: 1_level_1
B,357
M,212


In [None]:
# Dimensions of the loaded frame as (rows, columns), before any column is dropped.
df.shape

(569, 33)

In [None]:
# Missing-value count for every column.
df.isna().sum()

Unnamed: 0,0
id,0
diagnosis,0
radius_mean,0
texture_mean,0
perimeter_mean,0
area_mean,0
smoothness_mean,0
compactness_mean,0
concavity_mean,0
concave points_mean,0


In [None]:
# Remove the all-NaN 'Unnamed: 32' column.
# Rebinding instead of mutating in place, together with errors='ignore',
# keeps this cell idempotent: re-running it after the column is already
# gone is a no-op instead of raising KeyError.
df = df.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

0

In [None]:
# Per-column dtype and non-null count, to confirm the remaining columns are complete.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

# Convert 'diagnosis' column to numerical representation before calculating correlation.

### Label encoding

In [None]:
# Encode the target: Malignant -> 1, Benign -> 0.
# The identity entries (1 -> 1, 0 -> 0) make the cell safe to re-run:
# without them a second execution would map the already-encoded 0/1
# values to NaN because they are not keys of the mapping.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0, 1: 1, 0: 0})

In [None]:
# Pairwise Pearson correlation between all columns, including the encoded target.
df.corr(method='pearson')

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,1.0,0.039769,0.074626,0.09977,0.073159,0.096893,-0.012968,9.6e-05,0.05008,0.044158,...,0.082405,0.06472,0.079986,0.107187,0.010338,-0.002968,0.023203,0.035174,-0.044224,-0.029866
diagnosis,0.039769,1.0,0.730029,0.415185,0.742636,0.708984,0.35856,0.596534,0.69636,0.776614,...,0.776454,0.456903,0.782914,0.733825,0.421465,0.590998,0.65961,0.793566,0.416294,0.323872
radius_mean,0.074626,0.730029,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
texture_mean,0.09977,0.415185,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,...,0.352573,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205
perimeter_mean,0.073159,0.742636,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
area_mean,0.096893,0.708984,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
smoothness_mean,-0.012968,0.35856,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,...,0.21312,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316
compactness_mean,9.6e-05,0.596534,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,...,0.535315,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382
concavity_mean,0.05008,0.69636,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493
concave points_mean,0.044158,0.776614,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,0.372583,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,0.483918,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,0.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,0.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,0.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,1.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,1.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [None]:
# Separate the encoded target from the predictors.
# NOTE(review): `id` is still part of X here, so the patient identifier is fed
# to the model as a feature. Dropping it would also require regenerating the
# hard-coded sample used in the prediction cell further down.
y = df['diagnosis']
X = df.drop(columns='diagnosis')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training and test feature sets after the split.
X_train.shape, X_test.shape

((455, 31), (114, 31))

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so the test set does not influence them.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Display the scaled training features (a NumPy array after the scaler transform).
X_train

array([[-0.23712699, -1.44075296, -0.43531947, ...,  0.9320124 ,
         2.09724217,  1.88645014],
       [-0.23702031,  1.97409619,  1.73302577, ...,  2.6989469 ,
         1.89116053,  2.49783848],
       [-0.23712907, -1.39998202, -1.24962228, ..., -0.97023893,
         0.59760192,  0.0578942 ],
       ...,
       [-0.17630431,  0.04880192, -0.55500086, ..., -1.23903365,
        -0.70863864, -1.27145475],
       [-0.23675669, -0.03896885,  0.10207345, ...,  1.05001236,
         0.43432185,  1.21336207],
       [-0.2371006 , -0.54860557,  0.31327591, ..., -0.61102866,
        -0.3345212 , -0.84628745]])

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings on the scaled training data.
lor = LogisticRegression()
lor.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels for the scaled test features.
y_pred = lor.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9736842105263158

In [None]:
# One feature vector that is already standardised (its values match the
# X_train[10] row displayed in the next cell), so it can go to the model as is.
input_data = (
    -0.23711093, -0.4976419, 0.61365274, -0.49813131, -0.53102815,
    -0.57694824, -0.17494424, -0.36215622, -0.284859, 0.43345165,
    0.17818232, -0.36844966, 0.55310406, -0.31671104, -0.40524636,
    0.04025752, -0.03795529, -0.18043065, 0.16478901, -0.12170969,
    0.23079329, -0.50044002, 0.81940367, -0.46922838, -0.53308833,
    -0.04910117, -0.04160193, -0.14913653, 0.09681787, 0.10617647,
    0.49035329,
)

# predict() needs a 2-D array, so reshape the single sample to (1, n_features).
np_df = np.asarray(input_data)
input_data_reshaped = np_df.reshape(1, -1)

prediction = lor.predict(input_data_reshaped)

# Label 0 is benign and 1 is malignant, as set by the diagnosis encoding.
print(
    'The Breast cancer is Benign'
    if prediction[0] == 0
    else 'The Breast Cancer is Malignant'
)

The Breast cancer is Benign


In [None]:
# Show row 10 of the scaled training set; its values match the hard-coded `input_data` above.
X_train[10]

array([-0.23711093, -0.4976419 ,  0.61365274, -0.49813131, -0.53102815,
       -0.57694824, -0.17494424, -0.36215622, -0.284859  ,  0.43345165,
        0.17818232, -0.36844966,  0.55310406, -0.31671104, -0.40524636,
        0.04025752, -0.03795529, -0.18043065,  0.16478901, -0.12170969,
        0.23079329, -0.50044002,  0.81940367, -0.46922838, -0.53308833,
       -0.04910117, -0.04160193, -0.14913653,  0.09681787,  0.10617647,
        0.49035329])

In [None]:
import pickle

# Persist the trained classifier. The context manager guarantees the file
# handle is flushed and closed even if pickling fails; the original bare
# open() call never closed it.
# NOTE(review): only the model is saved. It was trained on StandardScaler
# output, so whoever loads it also needs the fitted `scaler` to prepare raw
# inputs. Consider pickling that too.
with open('breast_cancer.pkl', 'wb') as model_file:
    pickle.dump(lor, model_file)