<a href="https://colab.research.google.com/github/Raghava2004-cpu/Logistic-Regression-Projects/blob/main/Predicting_Breast_Cancer_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the breast-cancer CSV into a DataFrame and preview its first rows.
data = pd.read_csv(filepath_or_buffer='/content/data.csv')
data.head(n=5)

In [None]:
# Display the column labels of the loaded DataFrame.
data.columns

In [None]:
# Display the (rows, columns) dimensions of the DataFrame.
data.shape

In [None]:
# Print per-column dtype and non-null counts to spot missing values.
data.info()

In [None]:
# Remove exact duplicate rows. The method must be *called* and its result
# assigned back; a bare `data.drop_duplicates` only shows the bound method
# and leaves the DataFrame untouched.
rows_before = len(data)
data = data.drop_duplicates()
print(f"Removed {rows_before - len(data)} duplicate rows")

In [None]:
# Drop the two columns that are removed from the feature set ('id' and
# 'Unnamed: 32'). Re-assigning instead of mutating in place, together with
# errors='ignore', keeps the cell idempotent: re-running it no longer raises
# KeyError once the columns are already gone.
data = data.drop(columns = ['id' , 'Unnamed: 32'] , errors = 'ignore')

In [None]:
# Preview the first rows after the column drop above.
data.head()

**Malignant:**

- A malignant tumor is cancerous.
- It can invade and destroy nearby tissue and spread (metastasize) to other parts of the body through the bloodstream or lymphatic system.
- Malignant tumors often grow rapidly and are considered harmful.

**Benign:**

- A benign tumor is non-cancerous.
- It does not spread to other parts of the body and usually grows slowly.
- Benign tumors are generally not life-threatening unless they press on vital organs or structures.

In [None]:
# Show the distinct values present in the 'diagnosis' column.
(data['diagnosis'].unique())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the class -> integer mapping from the 'diagnosis' column, then
# overwrite the column with its integer codes.
label = LabelEncoder().fit(data['diagnosis'])
data['diagnosis'] = label.transform(data['diagnosis'])

In [None]:
# Preview the first rows after 'diagnosis' has been label-encoded.
data.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
data.describe()

In [None]:
# Pairwise correlation of every column, drawn as an annotated heatmap
# on a wide 21x8 figure.
plt.figure(figsize=(21, 8))
corr = data.corr()
sns.heatmap(data=corr, annot=True, ax=plt.gca())

In [None]:
from sklearn.model_selection import train_test_split

# Target is the encoded 'diagnosis' column; every other column is a feature.
y = data['diagnosis']
x = data.drop('diagnosis', axis=1)

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the standardiser on the training features only, then apply the same
# learned mean/variance to both the training and the test features.
scaling = StandardScaler().fit(x_train)
x_train_scale, x_test_scale = (scaling.transform(part) for part in (x_train, x_test))

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with the library's default penalty settings,
# allowing up to 5000 solver iterations; trained on the scaled training features.
model = LogisticRegression(max_iter = 5000)
model.fit(x_train_scale , y_train)


In [None]:
# Predicted class labels of the baseline model for the scaled test features.
y_pred = model.predict(x_test_scale)

In [None]:
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report , precision_score , recall_score , roc_auc_score

# Evaluate the baseline model on the held-out test set.
# ROC AUC is a ranking metric, so it is computed from the predicted
# probability of the positive class (column 1 of predict_proba) rather than
# from the thresholded 0/1 labels, which would collapse the curve to a
# single operating point.
y_score = model.predict_proba(x_test_scale)[:, 1]

print("Accuracy Score : ", accuracy_score(y_test , y_pred))
print(f"Confusion Matrix : {confusion_matrix(y_test , y_pred)} " )
print(f"Precision Score : {precision_score(y_test , y_pred)} ")
print(f"Recall Score : {recall_score(y_test , y_pred)}")
print(f"ROC AUC SCORE :  {roc_auc_score(y_test , y_score)}")

In [None]:
# Confusion matrix of the current predictions as a small annotated heatmap.
# fmt='d' prints the integer counts verbatim (the default '.2g' format
# switches to scientific notation for counts of 100 or more), and the axis
# labels make explicit that rows are true labels and columns are predictions.
cn = confusion_matrix(y_test , y_pred)
plt.figure(figsize = (3,3))
cm_ax = sns.heatmap(cn , annot = True , fmt = 'd')
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
plt.show()

## Using L1 regularization

In [None]:
# L1-penalised logistic regression. The default lbfgs solver does not
# support the L1 penalty, so liblinear is used instead.
model1 = LogisticRegression(penalty = 'l1' , solver = 'liblinear' , C = 0.1 , random_state = 42 , max_iter = 5000)
model1.fit(x_train_scale , y_train)
# Predict with the L1 model that was just fitted (previously this used the
# unregularised baseline `model`, so the L1 metrics were never reported).
y_pred = model1.predict(x_test_scale)

In [None]:
# Score the current `y_pred` against the held-out labels.
# Note: ROC AUC here is computed from the hard class predictions.
acc_val = accuracy_score(y_test, y_pred)
cm_val = confusion_matrix(y_test, y_pred)
prec_val = precision_score(y_test, y_pred)
rec_val = recall_score(y_test, y_pred)
auc_val = roc_auc_score(y_test, y_pred)

print("Accuracy Score : ", acc_val)
print(f"Confusion Matrix : {cm_val} ")
print(f"Precision Score : {prec_val} ")
print(f"Recall Score : {rec_val}")
print(f"ROC AUC SCORE :  {auc_val}")

## Using L2 regularization

In [None]:
# L2-penalised logistic regression with the lbfgs solver.
model2 = LogisticRegression(penalty = 'l2' , solver = 'lbfgs' , C = 0.1 , random_state = 42 , max_iter = 5000)
# Fit and predict with model2 itself (previously model1 was re-fitted and the
# baseline `model` was used for prediction, so model2 was never trained or
# evaluated).
model2.fit(x_train_scale , y_train)
y_pred = model2.predict(x_test_scale)

In [None]:
# Score the current `y_pred` against the held-out labels.
# Note: ROC AUC here is computed from the hard class predictions.
acc_val = accuracy_score(y_test, y_pred)
cm_val = confusion_matrix(y_test, y_pred)
prec_val = precision_score(y_test, y_pred)
rec_val = recall_score(y_test, y_pred)
auc_val = roc_auc_score(y_test, y_pred)

print("Accuracy Score : ", acc_val)
print(f"Confusion Matrix : {cm_val} ")
print(f"Precision Score : {prec_val} ")
print(f"Recall Score : {rec_val}")
print(f"ROC AUC SCORE :  {auc_val}")

## Using elastic-net regularization

In [None]:
# Elastic-net logistic regression: an equal mix of L1 and L2 (l1_ratio=0.5).
# The saga solver is used together with the elastic-net penalty.
model3 = LogisticRegression(penalty = 'elasticnet' , l1_ratio = 0.5 , solver = 'saga' , C = 0.01 , random_state = 42 , max_iter = 5000)

model3.fit(x_train_scale , y_train)
# Predict with the elastic-net model that was just fitted (previously this
# used the unregularised baseline `model`).
y_pred = model3.predict(x_test_scale)

In [None]:
# Score the current `y_pred` against the held-out labels.
# Note: ROC AUC here is computed from the hard class predictions.
acc_val = accuracy_score(y_test, y_pred)
cm_val = confusion_matrix(y_test, y_pred)
prec_val = precision_score(y_test, y_pred)
rec_val = recall_score(y_test, y_pred)
auc_val = roc_auc_score(y_test, y_pred)

print("Accuracy Score : ", acc_val)
print(f"Confusion Matrix : {cm_val} ")
print(f"Precision Score : {prec_val} ")
print(f"Recall Score : {rec_val}")
print(f"ROC AUC SCORE :  {auc_val}")

In [None]:
from sklearn.model_selection import KFold , cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# 10-fold cross-validation of the L1-penalised logistic regression.
# The scaler is re-fitted inside every fold, so no statistics from a fold's
# test rows are used when scaling its training rows.
# NOTE: x_train / x_test / y_train / y_test are overwritten by this cell;
# after it runs they hold the split of the best-scoring fold.
kf = KFold(n_splits = 10 , shuffle = True , random_state = 42)
new_model = LogisticRegression(penalty = 'l1' , C = 0.1 , solver = 'liblinear', random_state = 42 , max_iter = 5000 )

best_accuracy = 0
best_fold_data = None
fold_accuracies = []   # per-fold scores, summarised after the loop

for fold , (train_index , test_index) in enumerate(kf.split(x)):
    x_train , x_test = x.iloc[train_index] , x.iloc[test_index]
    y_train , y_test = y.iloc[train_index]  , y.iloc[test_index]

    scaled = StandardScaler()
    x_train_scaled = scaled.fit_transform(x_train)
    x_test_scaled = scaled.transform(x_test)

    new_model.fit(x_train_scaled , y_train)

    y_predi = new_model.predict(x_test_scaled)

    accuracy = accuracy_score(y_test , y_predi)
    fold_accuracies.append(accuracy)

    # Remember the split of the highest-scoring fold for the cells below.
    if accuracy > best_accuracy :
        best_accuracy = accuracy
        best_fold_data = (x_train , x_test , y_train , y_test)
    print(f"Accuracy Score {fold+1} : {accuracy}")

# The mean over all folds is the figure to quote as the cross-validated
# performance; the best single fold is optimistic because it is picked
# after looking at the scores.
print(f"Mean CV accuracy : {np.mean(fold_accuracies)} (std : {np.std(fold_accuracies)})")
print(f"Among all the data best accuracy is : {best_accuracy}")
if best_fold_data is not None :
    x_train , x_test , y_train , y_test = best_fold_data

print(x_train)  # training split of the best fold, reused to fit the model in the next cell
print(y_train)

In [None]:
# Refit the scaler and the model on the current training split, then
# evaluate on the matching test split. All scores are printed as percentages.
scaler = StandardScaler()
x_train_scale = scaler.fit_transform(x_train)
x_test_scale =  scaler.transform(x_test)

new_model.fit(x_train_scale , y_train)
result = new_model.predict(x_test_scale)
# ROC AUC needs a continuous score, so use the predicted probability of the
# positive class (column 1) instead of the thresholded 0/1 labels.
result_score = new_model.predict_proba(x_test_scale)[:, 1]

print("Accuracy Score : ", accuracy_score(y_test , result)*100.0)
print(f"Confusion Matrix : {confusion_matrix(y_test , result)} " )
print(f"Precision Score : {precision_score(y_test , result)*100.0} ")
print(f"Recall Score : {recall_score(y_test , result)*100.0}")
print(f"ROC AUC SCORE :  {roc_auc_score(y_test , result_score)*100.0}")

In [None]:
# Confusion matrix of `result` as a small annotated heatmap.
# fmt='d' prints the integer counts verbatim (the default '.2g' format
# switches to scientific notation for counts of 100 or more), and the axis
# labels make explicit that rows are true labels and columns are predictions.
cn = confusion_matrix(y_test , result)
plt.figure(figsize = (3,3))
cm_ax = sns.heatmap(cn , annot = True , fmt = 'd')
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
plt.show()

In [None]:
# Print the fitted intercept (bias) term of the current `new_model`.
print(new_model.intercept_)

In [None]:
# Pair every feature name with its fitted coefficient and display the table
# ordered from the largest coefficient to the smallest.
feature_names = x_train.columns
weights = new_model.coef_.flatten()
coef_df = pd.DataFrame({'Feature Names ': feature_names, 'Coefficients': weights})
coef_df.sort_values(by='Coefficients', ascending=False)

In [None]:
from sklearn.model_selection import KFold , cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# 10-fold cross-validation of the L2-penalised logistic regression.
# The scaler is re-fitted inside every fold, so no statistics from a fold's
# test rows are used when scaling its training rows.
# NOTE: x_train / x_test / y_train / y_test are overwritten by this cell;
# after it runs they hold the split of the best-scoring fold.
kf = KFold(n_splits = 10 , shuffle = True , random_state = 42)
new_model = LogisticRegression(penalty = 'l2' , C = 0.1 , solver = 'lbfgs', random_state = 42 , max_iter = 5000 )

best_accuracy = 0
best_fold_data = None
fold_accuracies = []   # per-fold scores, summarised after the loop

for fold , (train_index , test_index) in enumerate(kf.split(x)):
    x_train , x_test = x.iloc[train_index] , x.iloc[test_index]
    y_train , y_test = y.iloc[train_index]  , y.iloc[test_index]

    scaled = StandardScaler()
    x_train_scaled = scaled.fit_transform(x_train)
    x_test_scaled = scaled.transform(x_test)

    new_model.fit(x_train_scaled , y_train)

    y_predi = new_model.predict(x_test_scaled)

    accuracy = accuracy_score(y_test , y_predi)
    fold_accuracies.append(accuracy)

    # Remember the split of the highest-scoring fold for the cells below.
    if accuracy > best_accuracy :
        best_accuracy = accuracy
        best_fold_data = (x_train , x_test , y_train , y_test)
    print(f"Accuracy Score {fold+1} : {accuracy}")

# The mean over all folds is the figure to quote as the cross-validated
# performance; the best single fold is optimistic because it is picked
# after looking at the scores.
print(f"Mean CV accuracy : {np.mean(fold_accuracies)} (std : {np.std(fold_accuracies)})")
print(f"Among all the data best accuracy is : {best_accuracy}")
if best_fold_data is not None :
    x_train , x_test , y_train , y_test = best_fold_data

print(x_train)  # training split of the best fold, reused to fit the model in the next cell
print(y_train)

In [None]:
# Refit the scaler and the model on the current training split, then
# evaluate on the matching test split. All scores are printed as percentages.
scaler = StandardScaler()
x_train_scale = scaler.fit_transform(x_train)
x_test_scale =  scaler.transform(x_test)

new_model.fit(x_train_scale , y_train)
result = new_model.predict(x_test_scale)
# ROC AUC needs a continuous score, so use the predicted probability of the
# positive class (column 1) instead of the thresholded 0/1 labels.
result_score = new_model.predict_proba(x_test_scale)[:, 1]

print("Accuracy Score : ", accuracy_score(y_test , result)*100.0)
print(f"Confusion Matrix : {confusion_matrix(y_test , result)} " )
print(f"Precision Score : {precision_score(y_test , result)*100.0} ")
print(f"Recall Score : {recall_score(y_test , result)*100.0}")
print(f"ROC AUC SCORE :  {roc_auc_score(y_test , result_score)*100.0}")

In [None]:
# Print the fitted intercept (bias) term of the current `new_model`.
print(new_model.intercept_)

In [None]:
# Build a two-column table of feature name vs. fitted coefficient, then show
# it sorted by coefficient in descending order.
coef_columns = {
    'Feature Names ': x_train.columns,
    'Coefficients': new_model.coef_.flatten(),
}
coef_df = pd.DataFrame(coef_columns)
coef_df.sort_values(by='Coefficients', ascending=False)