In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [46]:
# Read the data
data_df = pd.read_csv('breast_cancer_dataset.csv'
                      , index_col=0,
                      header=0,
                      names=['id', 'clump_thickness', 'uniformity_cell_size', 'uniformity_cell_shape', 'marginal_adhesion', 'single_epithelial_cell_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class'],
                    #   dtype={'id': np.int64, 'clump_thickness': np.int64, 'uniformity_cell_size': np.int64, 'uniformity_cell_shape': np.int64, 'marginal_adhesion': np.int64, 'single_epithelial_cell_size': np.int64, 'bare_nuclei': np.int64, 'bland_chromatin': np.int64, 'normal_nucleoli': np.int64, 'mitoses': np.int64, 'class': np.int64},
                      na_values='?')

## Data Preprocessing

In [47]:
# Drop records with missing values
data_df.dropna(inplace=True)

In [4]:
data_df.head()

Unnamed: 0_level_0,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1.0,3,1,1,2
1002945,5,4,4,5,7,10.0,3,2,1,2
1015425,3,1,1,1,2,2.0,3,1,1,2
1016277,6,8,8,1,3,4.0,3,7,1,2
1017023,4,1,1,3,2,1.0,3,1,1,2


In [5]:
# Get an idea about the dataset
data_df.describe()

Unnamed: 0,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [6]:
data_df['class'].value_counts()

2    444
4    239
Name: class, dtype: int64

In [7]:
class_dict = {
    0:'benign',
    1:'malignant'
}

In [48]:
data_df['class'] = data_df['class'].map({2:0, 4:1})

In [49]:
data_df['class'].value_counts()

0    444
1    239
Name: class, dtype: int64

In [50]:
X = data_df.drop('class', axis=1)
X

Unnamed: 0_level_0,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1000025,5,1,1,1,2,1.0,3,1,1
1002945,5,4,4,5,7,10.0,3,2,1
1015425,3,1,1,1,2,2.0,3,1,1
1016277,6,8,8,1,3,4.0,3,7,1
1017023,4,1,1,3,2,1.0,3,1,1
...,...,...,...,...,...,...,...,...,...
776715,3,1,1,1,3,2.0,1,1,1
841769,2,1,1,1,2,1.0,1,1,1
888820,5,10,10,3,7,3.0,8,10,2
897471,4,8,6,4,3,4.0,10,6,1


In [51]:
y = data_df['class']
y.head()

id
1000025    0
1002945    0
1015425    0
1016277    0
1017023    0
Name: class, dtype: int64

In [53]:
column_names = X.columns
y_name = 'class'

### Split dataset

In [12]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import time

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Separate the input features (X) and labels (y)
X = data_df.iloc[:, :-1].values
y = data_df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:

print("Number of samples in training set:", len(X_train))
print("Number of samples in testing set:", len(y_test))


Number of samples in training set: 546
Number of samples in testing set: 137


In [15]:
type(X_train)

numpy.ndarray

In [16]:
# Convert the numpy arrays into tensors
X_train = torch.from_numpy(X_train).float()
X_test = torch.from_numpy(X_test).float()
y_train = torch.from_numpy(y_train).float()
y_test = torch.from_numpy(y_test).float()

In [17]:
# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} for computation >>> ")

Using cuda for computation >>> 


In [18]:
# Define the logistic regression model
class LogisticRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.linear(x)
        out = torch.sigmoid(out)
        out = torch.round(out)
        return out


In [19]:
no_features = X_train.shape[1]
data_output_size = 1

In [20]:
y_train.shape

torch.Size([546])

Here y_train is 0 dimension tensor. so have to add more dimension so that $\hat{y_train}$ is compatible with y_train to compute the Error.   
```python
y_train = y_train.unsqueeze(1)
```

In [21]:
# Normalize the input features
X_train = (X_train - X_train.mean()) / X_train.std()

# Convert the data to PyTorch tensors and move to the device
X_train = X_train.to(device)
y_train = y_train.unsqueeze(1).to(device)

In [22]:


# Define the hyperparameters
learning_rate = 0.03
num_epochs = 10000

#inp

# Initialize the model and optimizer and move to the device
model = LogisticRegression(input_size=no_features,
                            output_size=data_output_size
                           ).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)


In [23]:
# Train the model
for epoch in range(num_epochs):
    # Start measuring the time for each epoch
    start = time.time()

    # Forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the loss every 100 epochs
    if (epoch+1) % 100 == 0:
        # print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))
        print(f"Epoch [{epoch+1}/{num_epochs}] , Loss : {loss.item()} : took {time.time() - start:.2f} seconds")

Epoch [100/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [200/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [300/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [400/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [500/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [600/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [700/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [800/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [900/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [1000/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [1100/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [1200/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [1300/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [1400/10000] , Loss : 0.6504194736480713 : took 0.01 seconds
Epoch [1500/10000] , Loss : 0.6504194736480713 : took 0.00 seconds
Epoc

In [24]:
# Test the model
with torch.no_grad():
    predicted = model(X_test.to(device))
    predicted = predicted.round()
    total = predicted.size(0)
    correct = predicted.eq(y_test.to(device)).sum().item()
    # print('Accuracy of the model on the {} breast cancer samples: {:.2f}%'.format(total,  * correct / total))
    print("No of accurate predictions:", correct)
    print("No of inaccurate predictions:", total - correct)
    print(f"Accuracy of the model on the {total} breast cancer samples: {correct / total * 100:.2f}%")


No of accurate predictions: 9143
No of inaccurate predictions: -9006
Accuracy of the model on the 137 breast cancer samples: 6673.72%


## Save the model

In [25]:
# Save the trained model
torch.save(model.state_dict(), 'trained_model.pth')


## Load the model

In [26]:
# Load the saved model
model = LogisticRegression(input_size=no_features, output_size=1).to(device)
model.load_state_dict(torch.load('trained_model.pth'))
model.eval()


LogisticRegression(
  (linear): Linear(in_features=9, out_features=1, bias=True)
)

___

# Using Scikit learn as the framework

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
print("Number of samples in training set:", len(X_train))
print("Number of samples in testing set:", len(y_test))

Number of samples in training set: 546
Number of samples in testing set: 137


In [29]:
# Function to calculate the accuracy of the model using sklearn
def model_acc(model):
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    print(f"Accuracy of the model is {acc*100:.2f}%")

In [30]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
log_r = LogisticRegression()
model_acc(log_r)

Accuracy of the model is 95.62%


In [31]:
# Linear Regression
from sklearn.linear_model import LinearRegression
lin_r = LinearRegression()
model_acc(lin_r) 

Accuracy of the model is 79.49%


In [33]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
dec_t_r = DecisionTreeRegressor()
model_acc(dec_t_r)

Accuracy of the model is 67.11%


In [34]:
# Lasso Regression
from sklearn.linear_model import Lasso
lasso = Lasso()
model_acc(lasso)

Accuracy of the model is 29.30%


In [36]:
# Random Forest Regression
from sklearn.ensemble import RandomForestRegressor
lr = RandomForestRegressor()
model_acc(lr)

Accuracy of the model is 86.20%


## Hyper Parameter Tuning

In [37]:
# Hyperparameter tuning for LogisticRegression
from sklearn.model_selection import GridSearchCV
params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2']
}

In [42]:
# Create a logistic regression model
model = LogisticRegression()

# Define the parameter grid to search
params = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['liblinear', 'saga']
}

# Create a grid search object
grid_obj = GridSearchCV(estimator=log_r, param_grid=params, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search object to the training data
grid_obj.fit(X_train, y_train)

best_model = grid_obj.best_estimator_

# Print the best parameters and accuracy score
print('Best parameters:', grid_obj.best_params_)
print('Accuracy score:', grid_obj.best_score_)
print('Test accuracy score:',best_model.score(X_test, y_test))

Fitting 5 folds for each of 28 candidates, totalling 140 fits
Best parameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy score: 0.9670391993327774
Test accuracy score: 0.9562043795620438


In [43]:
import pickle 

# Save the model as a pickle in a file
with open('predictor.pkl', 'wb') as f:
    pickle.dump(best_model, f)


In [None]:
# Load the model from the file
with open('predictor.pkl', 'rb') as f:
    model = pickle.load(f)
    

In [60]:
model_test_record = [[
    # 1100524, # This is the id
    6,10,10,2,8,10,7,3,3
    # This is the y value of the test record, in application this will be unknown
]]

In [62]:
model_test_prediction_val = best_model.predict(model_test_record)

In [63]:
model_test_prediction_val[0]

1