<a href="https://colab.research.google.com/github/RohanRaghav/Human_Machine_Collaboration-in-brain-tumor-dignosis/blob/main/Human_Machine_Collaboration_brain_tumor_dignosis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'brian-tumor-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1343913%2F2236708%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240701%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240701T064401Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D57093efa8c935ad53f3a311fba8bf7965abd32ea738637bd202e61d443b91de8e2a84070eb943a78b501339328279103bb792d841f40c532b48b346baa15a0cb00767535066523056101ce70607d06e9889da5f219f1f05bd935aa7daf98be4cac4bd9102a8f0742d0bead5d830d6c1a733da131ce05eb5c2c0cb215ce8fb02d4dc5f987fab01a4b380e8cbb9671e2e251561af80c8b876ba697e89954b9e36e3abde8492e7a61d00bddb23a6fa695abb3f5b5a3020fa96d4a7151e5adb4bec485c335bd9e60138767a5caadb97999f3882288fb746f003f2052da1c57f183f4d942ef48ab3a59e50bbf44a2370c589225b5905282d769e816316f2e3cf1c9dd'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack it under KAGGLE_INPUT_PATH/<directory>. A failed download is
# reported and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # The URL part is percent-encoded, so the first ':' is the separator.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without a usable value only the
            # running byte count is shown and the bar stays empty.
            content_length = fileres.headers['content-length']
            if content_length and content_length.isdigit():
                total_length = int(content_length)
            else:
                total_length = 0
            print(f"Downloading {directory}, {content_length or 'unknown'} bytes compressed")
            dl = 0
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: tarfile reopens the file by name and
            # must see the complete archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the imported `tarfile`
                # module is not overwritten.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below /kaggle/input.
for root_dir, _, names_in_dir in os.walk('/kaggle/input'):
    for file_name in names_in_dir:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator

In [None]:
# Shared image generator: multiplies pixel values by 1/255, reserves 20% of
# the images for the validation subset and uses a zoom range pinned to 0.99.
gen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    zoom_range=(0.99, 0.99),
    dtype=tf.float32,
)

In [None]:
# Training subset of the image folder: shuffled 150x150 RGB batches of 256
# with binary labels.
train_flow_options = dict(
    target_size=(150, 150),
    batch_size=256,
    class_mode="binary",
    color_mode="rgb",
    shuffle=True,
    seed=123,
    subset="training",
)
train = gen.flow_from_directory(
    "/kaggle/input/brian-tumor-dataset/Brain Tumor Data Set/Brain Tumor Data Set/",
    **train_flow_options,
)

In [None]:
# Validation subset of the same folder: shuffled 150x150 RGB batches of 8
# with binary labels.
val_flow_options = dict(
    target_size=(150, 150),
    batch_size=8,
    class_mode="binary",
    color_mode="rgb",
    shuffle=True,
    seed=123,
    subset="validation",
)
val = gen.flow_from_directory(
    "/kaggle/input/brian-tumor-dataset/Brain Tumor Data Set/Brain Tumor Data Set/",
    **val_flow_options,
)
# Mapping from class name to label index as reported by the iterator.
classes = val.class_indices

# With grayscale as color mode we get high spikes in validation loss in training and substantially lower accuracy compared with a dataset with rgb color mode.

In [None]:
# Display the class-name -> label-index mapping taken from the validation iterator.
classes

In [None]:
import seaborn as sns

# Class distribution in training dataset

In [None]:
# Count the training labels directly from the iterator's label array. This
# covers every training sample exactly once, needs no image decoding and does
# not advance the shuffled iterator (the previous version pulled a hard-coded
# 15 batches).
# NOTE(review): label 1 is assumed to be the healthy class - confirm against
# the `classes` mapping.
label_array = np.asarray(train.labels)
h = int((label_array == 1).sum())
t = int(label_array.size - h)

sns.barplot(x=['tumor','healthy'],y=[t,h])


In [None]:
import matplotlib.pyplot as plt
# Pull the next training batch and preview its first image.
batch = next(train)
first_image = batch[0][0]

plt.imshow(first_image)

# Simple cnn

In [None]:
from keras.layers import Conv2D, MaxPool2D, LeakyReLU, BatchNormalization, Dropout, Dense, InputLayer, Flatten
from keras.losses import BinaryCrossentropy
from keras.optimizers import Adam

In [None]:
# Baseline CNN: two conv/pool stages, then two dense layers (each followed by
# batch normalisation and dropout) and a single sigmoid output unit.
model = keras.Sequential([
    InputLayer(input_shape=(150,150,3)),
    Conv2D(filters=32, kernel_size=3, activation="relu", padding="same"),
    MaxPool2D(),
    Conv2D(filters=64, kernel_size=3, activation="relu", padding="same"),
    MaxPool2D(),
    Flatten(),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(rate=0.3),
    Dense(64, activation="relu"),
    BatchNormalization(),
    Dropout(rate=0.3),
    Dense(1, activation="sigmoid"),
])
model.compile(
    optimizer=Adam(0.001),
    loss=BinaryCrossentropy(),
    metrics=['accuracy'],
)


In [None]:
# Print the layer-by-layer summary of the baseline CNN.
model.summary()

# Model plot

In [None]:
# Render the baseline CNN architecture to model.png, including tensor shapes
# and layer names.
plot_options = dict(to_file='model.png', show_shapes=True, show_layer_names=True)
tf.keras.utils.plot_model(model, **plot_options)

In [None]:
from keras import utils, callbacks
# Early-stopping callback: watches val_loss (lower is better) with a patience
# of 5 epochs and restore_best_weights enabled.
earlystopping = callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Train the baseline CNN for up to 20 epochs with the early-stopping callback.
history = model.fit(train, verbose=1, callbacks=[earlystopping], epochs=20, validation_data=val)

# Report the accuracy of the weights the model actually holds after training.
# With restore_best_weights=True the final weights may come from an earlier
# epoch, so the last entry of history.history['val_accuracy'] can describe a
# different set of weights. evaluate() returns [loss, accuracy] because the
# model is compiled with metrics=['accuracy'].
final_val_loss, accuracy = model.evaluate(val, verbose=0)
print("Validation Accuracy:", accuracy)


In [None]:
# Persist the trained baseline CNN in HDF5 format under the Kaggle working directory.
model.save('/kaggle/working/model_cnn.h5')

# Plotting accuracy

In [None]:
# Training vs. validation accuracy per epoch; each curve is labelled with its
# history key.
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key], label=metric_key)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0, 1])
plt.legend(loc='lower right')

# Plotting loss

In [None]:
# Training vs. validation loss per epoch; each curve is labelled with its
# history key.
for metric_key in ('loss', 'val_loss'):
    plt.plot(history.history[metric_key], label=metric_key)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.ylim([0, 1])
plt.legend(loc='lower right')

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Conv2D, MaxPool2D, Flatten, Dense, InputLayer
from keras.models import Sequential
from keras.losses import BinaryCrossentropy
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Data preparation: recreate the generator and both directory iterators.
gen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    zoom_range=(0.99,0.99),
    dtype=tf.float32,
)

# Options shared by the training and validation iterators.
dataset_dir = "/kaggle/input/brian-tumor-dataset/Brain Tumor Data Set/Brain Tumor Data Set/"
shared_flow_options = dict(
    target_size=(150, 150),
    class_mode="binary",
    color_mode="rgb",
    shuffle=True,
    seed=123,
)

train = gen.flow_from_directory(dataset_dir, batch_size=256, subset="training", **shared_flow_options)

val = gen.flow_from_directory(dataset_dir, batch_size=8, subset="validation", **shared_flow_options)

# CNN Model: two conv/pool stages followed by a dense head with one sigmoid unit.
cnn_model = Sequential()
cnn_model.add(InputLayer(input_shape=(150, 150, 3)))
cnn_model.add(Conv2D(filters=32, kernel_size=3, activation="relu", padding="same"))
cnn_model.add(MaxPool2D())
cnn_model.add(Conv2D(filters=64, kernel_size=3, activation="relu", padding="same"))
cnn_model.add(MaxPool2D())
cnn_model.add(Flatten())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dense(1, activation="sigmoid"))



In [None]:
# Display the training iterator object.
train

In [None]:
# Display the label array exposed by the training iterator.
train.labels

In [None]:

# Compile the CNN with Adam (learning rate 0.001), binary cross-entropy and
# accuracy tracking, then print its summary.
cnn_model.compile(
    optimizer=Adam(0.001),
    loss=BinaryCrossentropy(),
    metrics=['accuracy'],
)
cnn_model.summary()

# SVM Model: linear kernel with probability estimates enabled.
svm_model = SVC(
    kernel='linear',
    probability=True,
)



In [None]:

# Transfer-learning backbone: VGG16 with ImageNet weights and without its
# classification top, taking 150x150 RGB input.
base_model = keras.applications.VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(150, 150, 3),
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping

# Build and train a classifier head on top of `base_model`.

# Freeze the layers in base_model so only the new dense head is trained
for layer in base_model.layers:
    layer.trainable = False

# Define transfer_model: frozen backbone + flatten + dense head with one sigmoid unit
transfer_model = Sequential([
    base_model,
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1, activation="sigmoid")
])

# Compile transfer_model
transfer_model.compile(optimizer=Adam(0.001), loss=BinaryCrossentropy(), metrics=['accuracy'])
transfer_model.summary()

# Train transfer_model on the `train` / `val` iterators with early stopping on val_loss
transfer_history = transfer_model.fit(train, verbose=1, callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=5, restore_best_weights=True)], epochs=20, validation_data=val)

# Report the accuracy of the weights the model actually holds after training.
# With restore_best_weights=True the final weights may come from an earlier
# epoch than the last entry of transfer_history.history['val_accuracy'].
# evaluate() returns [loss, accuracy] because of metrics=['accuracy'].
transfer_val_loss, accuracy = transfer_model.evaluate(val, verbose=0)
print("Validation Accuracy of the transfer_model:", accuracy)


In [None]:
# Save the transfer-learning model. The previous code saved `model` (the
# baseline CNN) under this file name, so loading 'model_transfer.h5' later
# returned the wrong network.
transfer_model.save('model_transfer.h5')

In [None]:
import pandas as pd

# Load the dataset metadata into its own DataFrame. It is deliberately not
# stored in `train`, which holds the training image iterator used by the
# model-fitting cells.
train_data_path = "/kaggle/input/brian-tumor-dataset/metadata_rgb_only.csv"
metadata_df = pd.read_csv(train_data_path)

# Display the column names
print(metadata_df.columns)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from joblib import dump

# Data preparation
gen = ImageDataGenerator(rescale=1./255, validation_split=0.2, zoom_range=(0.99, 0.99), dtype=tf.float32)

train = gen.flow_from_directory("/kaggle/input/brian-tumor-dataset/Brain Tumor Data Set/Brain Tumor Data Set/",
                                target_size=(150, 150),  # Adjusted target size
                                batch_size=256,
                                class_mode="binary",
                                color_mode="rgb",
                                shuffle=True,
                                seed=123,
                                subset="training")

val = gen.flow_from_directory("/kaggle/input/brian-tumor-dataset/Brain Tumor Data Set/Brain Tumor Data Set/",
                              target_size=(150, 150),  # Adjusted target size
                              batch_size=8,
                              class_mode="binary",
                              color_mode="rgb",
                              shuffle=True,
                              seed=123,
                              subset="validation")


def _collect_batches(iterator):
    """Return (features, labels) for one full pass over a Keras image iterator.

    Each image is flattened to a 1-D feature vector. The iterator is reset
    first so that every sample is visited exactly once.
    NOTE: the whole subset is held in memory at once.
    """
    iterator.reset()
    feature_batches, label_batches = [], []
    for _ in range(len(iterator)):
        batch_features, batch_labels = next(iterator)
        feature_batches.append(batch_features.reshape(batch_features.shape[0], -1))
        label_batches.append(batch_labels.flatten())
    return np.concatenate(feature_batches), np.concatenate(label_batches)


# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=50, random_state=123)  # Set n_estimators to a smaller value

# Extract flattened features and labels for the complete subsets. The previous
# code used a single batch each, i.e. it scored the model on only 8 images.
train_features, train_labels = _collect_batches(train)
val_features, val_labels = _collect_batches(val)

# Train Random Forest model
rf_model.fit(train_features, train_labels)

# Evaluate Random Forest model on the full validation subset
rf_val_preds = rf_model.predict(val_features)
rf_accuracy = accuracy_score(val_labels, rf_val_preds)
print("Random Forest Model Accuracy:", rf_accuracy)

# Save Random Forest model
model_path = '/kaggle/working/random_forest_model.joblib'
dump(rf_model, model_path)


In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Conv2D, MaxPool2D, Flatten, Dense, InputLayer
from keras.models import Sequential, load_model
from keras.losses import BinaryCrossentropy
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from joblib import dump

# Data preparation: recreate the generator and both directory iterators.
gen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    zoom_range=(0.99, 0.99),
    dtype=tf.float32,
)

# Options shared by the training and validation iterators.
dataset_dir = "/kaggle/input/brian-tumor-dataset/Brain Tumor Data Set/Brain Tumor Data Set/"
shared_flow_options = dict(
    target_size=(150, 150),
    class_mode="binary",
    color_mode="rgb",
    shuffle=True,
    seed=123,
)

train = gen.flow_from_directory(dataset_dir, batch_size=256, subset="training", **shared_flow_options)

val = gen.flow_from_directory(dataset_dir, batch_size=8, subset="validation", **shared_flow_options)

# CNN Model: two conv/pool stages followed by a dense head with one sigmoid unit.
cnn_model = Sequential()
cnn_model.add(Conv2D(filters=32, kernel_size=3, activation="relu", padding="same", input_shape=(150, 150, 3)))
cnn_model.add(MaxPool2D())
cnn_model.add(Conv2D(filters=64, kernel_size=3, activation="relu", padding="same"))
cnn_model.add(MaxPool2D())
cnn_model.add(Flatten())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dense(1, activation="sigmoid"))

cnn_model.compile(
    optimizer=Adam(0.001),
    loss=BinaryCrossentropy(),
    metrics=['accuracy'],
)
cnn_model.summary()

# Training with early stopping on val_loss (patience 5, best weights restored).
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

cnn_history = cnn_model.fit(
    train,
    epochs=20,
    validation_data=val,
    callbacks=[early_stopping],
)

# Saving CNN model
cnn_model.save('model_cnn.h5')

def _collect_batches(iterator):
    """Return (features, labels) for one full pass over a Keras image iterator.

    Each image is flattened to a 1-D feature vector. The iterator is reset
    first so that every sample is visited exactly once.
    NOTE: the whole subset is held in memory at once.
    """
    iterator.reset()
    feature_batches, label_batches = [], []
    for _ in range(len(iterator)):
        batch_features, batch_labels = next(iterator)
        feature_batches.append(batch_features.reshape(batch_features.shape[0], -1))
        label_batches.append(batch_labels.flatten())
    return np.concatenate(feature_batches), np.concatenate(label_batches)


# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=123)

# Extract flattened features and labels for the complete subsets. The previous
# code used a single batch each, i.e. it scored the model on only 8 images.
train_features, train_labels = _collect_batches(train)
val_features, val_labels = _collect_batches(val)

# Train Random Forest model
rf_model.fit(train_features, train_labels)

# Evaluate Random Forest model on the full validation subset
rf_val_preds = rf_model.predict(val_features)
rf_accuracy = accuracy_score(val_labels, rf_val_preds)
print("Random Forest Model Accuracy:", rf_accuracy)

# Save Random Forest model
model_path = '/kaggle/working/random_forest_model.joblib'
dump(rf_model, model_path)

# Ensemble Model
# Load CNN and transfer learning models
cnn_model = load_model('model_cnn.h5')
transfer_model = load_model('model_transfer.h5')

# Rebuild the image tensors from the flattened validation features so that all
# three models score exactly the samples that `val_labels` describes. The
# previous code predicted on the shuffled `val` iterator and then cut the
# result to len(val_labels) with np.resize, which paired predictions with
# labels of unrelated images.
val_images = val_features.reshape((-1,) + tuple(cnn_model.input_shape[1:]))

# Generate hard 0/1 votes from each model (0.5 threshold on the sigmoid outputs)
cnn_preds = (cnn_model.predict(val_images) > 0.5).astype("int32").flatten()
transfer_preds = (transfer_model.predict(val_images) > 0.5).astype("int32").flatten()
rf_preds = np.asarray(rf_model.predict(val_features)).astype("int32").flatten()

# Ensemble predictions: majority vote of the three models
ensemble_preds = ((cnn_preds + transfer_preds + rf_preds) >= 2).astype("int32")
ensemble_accuracy = accuracy_score(val_labels, ensemble_preds)
print("Ensemble Model Accuracy:", ensemble_accuracy)


In [None]:


# Accuracy curves of the CNN training run. The data comes from `cnn_history`,
# so the curves are labelled as CNN accuracy rather than ensemble accuracy.
plt.plot(cnn_history.history['accuracy'], label='cnn_accuracy')
plt.plot(cnn_history.history['val_accuracy'], label='val_cnn_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0, 1])
plt.legend(loc='lower right')
plt.show()


In [None]:
# Training vs. validation loss of the CNN run; each curve is labelled with its
# history key.
for metric_key in ('loss', 'val_loss'):
    plt.plot(cnn_history.history[metric_key], label=metric_key)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.ylim([0, 1])
plt.legend(loc='lower right')
plt.show()

In [None]:
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Concatenate, Input

# Builds a combined Keras graph from the two loaded networks and writes its
# diagram to ensemble_model.png. The combined model is only constructed and
# plotted here; this cell does not train it.

# Width of the second-to-last layer of each network.
# NOTE(review): relies on the layer attribute `output_shape` - confirm it is
# available in the installed Keras version.
cnn_features = cnn_model.layers[-2].output_shape[1]
transfer_features = transfer_model.layers[-2].output_shape[1]

# Calculate the total number of features
num_features = cnn_features + transfer_features

# Create a combined model for ensemble prediction
# NOTE(review): this extra input is as wide as both feature vectors together,
# so the concatenated tensor below is 2 * num_features wide - confirm that a
# third input of this size is intended.
ensemble_input = Input(shape=(num_features,), name='ensemble_input')
cnn_output = cnn_model.layers[-2].output  # Output tensor of the CNN's second-to-last layer
transfer_output = transfer_model.layers[-2].output  # Output tensor of the transfer model's second-to-last layer

# Concatenate the outputs of CNN and transfer learning models with the ensemble input
ensemble_output = Concatenate()([cnn_output, transfer_output, ensemble_input])

# Add dense layers for final prediction
ensemble_output = Dense(128, activation='relu')(ensemble_output)
ensemble_output = Dense(1, activation='sigmoid')(ensemble_output)

# Define the ensemble model with three inputs: both image inputs plus the extra feature input
# NOTE(review): both sub-models are loaded from disk; presumably their layer
# names must not collide for this graph to build - verify.
ensemble_model = Model(inputs=[cnn_model.input, transfer_model.input, ensemble_input], outputs=ensemble_output)

# Plot the architecture of the ensemble model
plot_model(ensemble_model, to_file='ensemble_model.png', show_shapes=True, show_layer_names=True)


# Next step: Actually create a test set!
# This model will be sufficient for this task, so no need for a bigger model.