# Data Preprocessing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing Dataset

In [3]:
data_set = pd.read_csv('breastcancer.csv')
X = data_set.iloc[:, :-1].values
y = data_set.iloc[:, -1].values

## Spilt the data into training and testing set


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)




## Logistic Regression Training on training set

In [10]:
import pandas as pd

# Check for missing values
missing_values = pd.DataFrame(X_train).isnull().sum()
print("Missing values in each feature:\n", missing_values)


Missing values in each feature:
 0     344
1     344
2     344
3     344
4     344
5     344
6     344
7     344
8     344
9     344
10    344
11    344
12    344
13    344
14    344
15    344
16    344
17    344
18    344
19    344
20    344
21    344
22    344
23    344
24    344
25    344
26    344
27    344
28    344
29    344
30    344
dtype: int64


In [14]:
import numpy as np

# Check for missing values in the target variable
missing_indices = np.isnan(y)
print("Number of missing values in y:", np.sum(missing_indices))



Number of missing values in y: 430


In [15]:
from sklearn.impute import SimpleImputer

# Convert to DataFrame for imputation
y_df = pd.DataFrame(y, columns=['target'])

# Impute missing values in the target variable
imputer_y = SimpleImputer(strategy='most_frequent')  # Most frequent value for classification
y_imputed = imputer_y.fit_transform(y_df)

# Flatten the array if necessary
y_imputed = y_imputed.ravel()

# Proceed with splitting the data
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y_imputed, test_size=0.3, random_state=0)

# Initialize and train the model
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7766666666666666
Classification Report:
               precision    recall  f1-score   support

         0.0       0.78      1.00      0.87       233
         1.0       0.00      0.00      0.00        67

    accuracy                           0.78       300
   macro avg       0.39      0.50      0.44       300
weighted avg       0.60      0.78      0.68       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=0)

# Fit and resample
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train the model with resampled data
classifier = LogisticRegression(random_state=0)
classifier.fit(X_resampled, y_resampled)

# Predict and evaluate
y_pred = classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.22333333333333333
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       233
         1.0       0.22      1.00      0.37        67

    accuracy                           0.22       300
   macro avg       0.11      0.50      0.18       300
weighted avg       0.05      0.22      0.08       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# Define a pipeline with resampling and imputation
smote = SMOTE(random_state=0)
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('smote', smote),
    ('classifier', LogisticRegression(random_state=0, class_weight='balanced'))
])

# Fit the model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.22333333333333333
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       233
         1.0       0.22      1.00      0.37        67

    accuracy                           0.22       300
   macro avg       0.11      0.50      0.18       300
weighted avg       0.05      0.22      0.08       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[233   0]
 [ 67   0]]


0.7766666666666666