In [1]:
### LIBRARY IMPORTATION
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Column headers: the data file is read with explicit names, i.e. it carries
# no header row of its own.
cols = [
    'code',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'class',
]

# Read the CSV into a pandas data frame. '?' is the file's placeholder for a
# missing value, so it is parsed as NaN at load time; this keeps the affected
# column numeric instead of loading it as strings.
df = pd.read_csv('breast-cancer-wisconsin.data', names=cols, na_values='?')

FileNotFoundError: [Errno 2] No such file or directory: 'breast-cancer-wisconsin.data'

In [None]:
### DATA CLEANING

In [None]:
# Turn every '?' placeholder in the frame into a proper missing value (NaN).
df.replace(to_replace='?', value=np.nan, inplace=True)

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Convert every column to a numeric dtype. Values that cannot be parsed become
# NaN so they show up in the null counts, instead of the whole column being
# silently left unconverted (which is what astype(..., errors='ignore') does
# for any column it cannot cast).
df = df.apply(pd.to_numeric, errors='coerce')

In [3]:
# Preview the first rows only; rendering the entire frame bloats the notebook.
df.head()

NameError: name 'df' is not defined

In [None]:
# Coerce 'Bare Nuclei' to numeric (unparseable entries become NaN), then impute
# the missing entries with the column mean rounded to the nearest integer.
df['Bare Nuclei'] = pd.to_numeric(df['Bare Nuclei'], errors='coerce')
bare_nuclei_fill = int(round(df['Bare Nuclei'].mean(), 0))
# Assign the result back: calling fillna(..., inplace=True) on a column
# selection is chained assignment and does not reliably update `df`.
df['Bare Nuclei'] = df['Bare Nuclei'].fillna(bare_nuclei_fill)

In [None]:
# Inspect the imputed column.
df.loc[:, 'Bare Nuclei']

In [None]:
# Re-check the per-column count of missing values after imputation.
df.isna().sum()

In [None]:
# Recode the label column to 0/1: the original code 2 becomes 0 and 4 becomes 1.
# NOTE(review): 2 and 4 are presumably the dataset's two diagnosis codes —
# confirm which one is the positive class.
class_mapping = {
    2: 0,
    4: 1,
}

# Recode in place of the original column (no extra column is created).
# The result is assigned back because replace(..., inplace=True) on a column
# selection is chained assignment and does not reliably update `df`.
# replace() leaves values that are not in the mapping untouched, so re-running
# this cell on already-recoded labels is harmless.
df['class'] = df['class'].replace(class_mapping)

In [None]:
# Inspect the recoded label column.
df.loc[:, 'class']

In [None]:
# Print the frame's column dtypes and non-null counts.
df.info()

In [None]:
# Pairwise scatter plots of a subset of the features, coloured by class.
pairplot_columns = [
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
]
sns.pairplot(df, vars=pairplot_columns, hue='class')
plt.show()

In [None]:
# Correlation heatmap of the feature and label columns.
# `df[2:]` slices ROWS (it drops the first two samples) rather than columns;
# the 'code' column is excluded by name instead.
# NOTE(review): 'code' is presumably a sample identifier — confirm.
plt.figure(figsize=(12, 10))
sns.heatmap(df.drop(columns=['code']).corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Shuffle the rows once with a fixed seed (reproducible on re-run), then cut
# the shuffled frame at 70% and 85% to obtain train / validate / test.
# Plain positional slicing is used instead of np.split, which relies on the
# deprecated DataFrame.swapaxes when handed a DataFrame.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.7 * len(df))
validate_end = int(0.85 * len(df))

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

In [None]:
def over_sampler(dataframe, oversample=False, random_state=None):
    """Split a frame into features and labels, optionally oversampling.

    Every column except the last is used as a feature and the last column is
    used as the label. Any identifier column placed before the label is
    therefore part of the features unless the caller drops it first.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose last column holds the label.
    oversample : bool, default False
        When True, the data is resampled with ``RandomOverSampler``.
    random_state : int or None, default None
        Seed forwarded to ``RandomOverSampler`` so the resampling can be
        reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(x, y)``: the feature values and the label values.
    """
    x = dataframe.iloc[:, :-1].values
    y = dataframe.iloc[:, -1].values

    if oversample:
        ROS = RandomOverSampler(random_state=random_state)
        x, y = ROS.fit_resample(x, y)

    return x, y

In [None]:
# Features/labels for each split; only the training split is oversampled.
x_train, y_train = over_sampler(dataframe=train, oversample=True)
x_validate, y_validate = over_sampler(dataframe=validate)
x_test, y_test = over_sampler(dataframe=test)

In [None]:
# Number of test samples in class 0 and in class 1.
(y_test == 0).sum(), (y_test == 1).sum()

In [None]:
# Preview the first rows of the training split instead of dumping all of it.
train.head()

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# k-nearest-neighbours classifier with k=2, fitted on the training split.
knn_model = KNeighborsClassifier(n_neighbors=2)
knn_model.fit(X=x_train, y=y_train)

In [None]:
# KNN predictions for the validation split.
y_pred = knn_model.predict(X=x_validate)

In [None]:
# Precision / recall / F1 of KNN on the validation split.
print(classification_report(y_true=y_validate, y_pred=y_pred))

In [None]:
# KNN predictions for the held-out test split.
y_predf = knn_model.predict(X=x_test)

In [None]:
# Precision / recall / F1 of KNN on the test split.
print(classification_report(y_true=y_test, y_pred=y_predf))

In [None]:
# NAIVE BAYES
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for GaussianNB's `var_smoothing`, searched by grid search.
param_grid = dict(
    var_smoothing=[
        1e-150, 1e-100, 1e-50,
        1e-9, 1e-8, 1e-7, 1e-6,
        1e-5, 1e-4, 1e-3, 1e-2,
    ],
)

In [None]:
# Untuned Gaussian naive Bayes estimator handed to the grid search.
base_model = GaussianNB()

In [None]:
# 5-fold cross-validated grid search over `param_grid`, scored by accuracy.
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X=x_train, y=y_train)

In [None]:
# Build a fresh GaussianNB configured with the best parameters found.
best_params = grid_search.best_params_
best_model = GaussianNB().set_params(**best_params)

In [None]:
# Fit the tuned naive Bayes model on the training split.
best_model.fit(X=x_train, y=y_train)

In [None]:
# Tuned naive Bayes: predictions and report on the validation split.
y_pred = best_model.predict(X=x_validate)
print(classification_report(y_true=y_validate, y_pred=y_pred))

In [None]:
# Tuned naive Bayes: predictions and report on the test split.
y_pred = best_model.predict(X=x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on the training split
# (fit() returns the estimator itself, so the calls can be chained).
lg_model = LogisticRegression().fit(x_train, y_train)

In [None]:
# Logistic regression: predictions and report on the test split.
y_pred = lg_model.predict(X=x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# SUPPORT VECTOR MACHINE (SVM)
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings, fitted on the training split
# (fit() returns the estimator itself, so the calls can be chained).
svm_model = SVC().fit(x_train, y_train)

In [None]:
# SVM: predictions and report on the test split.
y_pred = svm_model.predict(X=x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# RANDOM FOREST REGRESSOR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Random forest fitted on the training split, then applied to the test split.
# A fixed random_state makes the forest (and the metrics computed from it)
# reproducible across kernel restarts.
# NOTE(review): the label was recoded to 0/1 earlier in the notebook, yet a
# regressor is fitted here, so `y_pred` holds continuous scores rather than
# class labels. Consider RandomForestClassifier with classification_report,
# as used for the other models.
rf_model = RandomForestRegressor(random_state=42)
rf_model = rf_model.fit(x_train, y_train)
y_pred = rf_model.predict(x_test)

In [None]:
# Regression metrics of the random forest on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the three metrics, one per line.
print(
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2): {r2}",
    sep="\n",
)

In [None]:
# SCALING
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features: the scaler learns its statistics from the training
# split only and is then applied to all three splits.
scaler = StandardScaler().fit(x_train)
x_train, x_test, x_validate = (
    scaler.transform(x_train),
    scaler.transform(x_test),
    scaler.transform(x_validate),
)

In [None]:
# Preview the first scaled validation rows instead of dumping the whole array.
x_validate[:5]

In [None]:
###  NEURAL NET  

In [None]:
import tensorflow as tf
from keras.models import Sequential
import matplotlib.pyplot as plt

In [None]:
def plot_history(history):
    """Draw training and validation curves side by side.

    The left panel shows 'loss' / 'val_loss' and the right panel shows
    'accuracy' / 'val_accuracy', all read from ``history.history`` and
    plotted against the epoch index.
    """
    curves = history.history
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    panels = (
        ('loss', 'Binary CrossEntropy'),
        ('accuracy', 'Accuracy'),
    )

    for axis, (metric, y_label) in zip(axes, panels):
        # Training curve first, then its validation counterpart.
        for series in (metric, 'val_' + metric):
            axis.plot(curves[series], label=series)
        axis.set_xlabel('Epoch')
        axis.set_ylabel(y_label)
        axis.legend()
        axis.grid(True)

    plt.show()

In [None]:
def train_model(x_train, y_train, num_nodes, dropout_prob, lr, epochs, batch_size,
                validation_data=None):
    """Build, compile and fit a two-hidden-layer binary classifier.

    The network is Dense(relu) -> Dropout -> Dense(relu) -> Dropout ->
    Dense(1, sigmoid), trained with Adam on binary cross-entropy while
    tracking accuracy.

    Parameters
    ----------
    x_train, y_train
        Training features and labels. ``x_train`` must expose ``.shape``; the
        input width of the network is taken from ``x_train.shape[1]``.
    num_nodes : int
        Number of units in each hidden layer.
    dropout_prob : float
        Dropout rate applied after each hidden layer.
    lr : float
        Learning rate passed to Adam.
    epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size.
    validation_data : tuple, optional
        ``(x, y)`` used for per-epoch validation. When omitted, the
        notebook-level ``x_validate`` / ``y_validate`` are used, as before.

    Returns
    -------
    tuple
        ``(nn_model, history)``: the fitted model and the object returned by
        ``fit``.
    """
    if validation_data is None:
        # Backward-compatible default: fall back to the notebook globals.
        validation_data = (x_validate, y_validate)

    # Derive the input width from the data rather than hard-coding 10, so the
    # model still builds if the number of feature columns changes.
    n_features = x_train.shape[1]

    nn_model = Sequential([
        tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(n_features,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation='relu'),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='binary_crossentropy', metrics=['accuracy'])

    history = nn_model.fit(
        x_train, y_train, epochs=epochs, batch_size=batch_size,
        validation_data=validation_data, verbose=0
    )
    return nn_model, history

In [None]:
# Exhaustive search over the network hyper-parameters; the model with the
# lowest validation loss is kept in `least_val_model`.
from itertools import product

least_val_loss = float('inf')
least_val_model = None

epochs = 100
# product() iterates in the same order as the equivalent nested loops
# (the last list varies fastest).
search_space = product(
    [16, 32, 64],                # num_nodes
    [0, 0.1, 0.2],               # dropout_prob
    [0.1, 0.01, 0.005, 0.001],   # lr
    [35, 70, 140],               # batch_size
)

for num_nodes, dropout_prob, lr, batch_size in search_space:
    print(f'{num_nodes}: num_nodes, {dropout_prob}: dropout_prob, {lr}: lr, {batch_size}: batch_size')
    model, history = train_model(x_train, y_train, num_nodes, dropout_prob, lr, epochs, batch_size)
    plot_history(history)
    # evaluate() returns [loss, accuracy] because the model is compiled with
    # metrics=['accuracy']; verbose=0 keeps the progress bars of every
    # configuration out of the notebook output.
    val_loss = model.evaluate(x_validate, y_validate, verbose=0)

    if val_loss[0] < least_val_loss:
        least_val_loss = val_loss[0]
        least_val_model = model

In [None]:
# Threshold the sigmoid outputs at 0.5 and flatten them to a 1-D label vector.
y_pred = (least_val_model.predict(x_test) > 0.5).astype(int).reshape(-1)

In [None]:
# Neural network: report on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Show the architecture of the selected network. A Keras model cannot be
# indexed with [0]; summary() prints its layers and parameter counts.
least_val_model.summary()