In [None]:
import os
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
import numpy as np
import pandas as pd
import time

# Analysis of current Dataset

In [None]:
# Root folders of the two splits shipped with the original dataset.
# Each variable points at the folder its name describes (train -> Training,
# test -> Testing); create_dataframe() scans both of them.
train_path = '/content/drive/MyDrive/machine_learning/projet/Brain-Tumor-Classification-DataSet-master/Training'
test_path = '/content/drive/MyDrive/machine_learning/projet/Brain-Tumor-Classification-DataSet-master/Testing'

In [None]:
def create_dataframe(paths=None, remove_invalid=True):
  """Index every image found under the dataset split folders.

  Each folder in ``paths`` is scanned as ``<split>/<label>/<image file>``.
  Files whose name contains '.jpg' are recorded with their pixel size; any
  other file is reported and, by default, deleted from disk.

  Args:
    paths: iterable of split folders to scan. When ``None`` (default) the
      module-level ``[train_path, test_path]`` is used.
    remove_invalid: when True (default) files that do not look like a JPEG
      are removed from disk; when False they are only reported.

  Returns:
    pandas.DataFrame with one row per image and the columns
    ``file``, ``group`` (name of the split folder), ``label``, ``height``
    and ``width``.
  """
  if paths is None:
    paths = [train_path, test_path]

  columns = ['file', 'group', 'label', 'height', 'width']
  records = []

  for path in paths:
    # Name of the split folder, robust to a trailing slash in the path.
    group = os.path.basename(os.path.normpath(path))
    for label in os.listdir(path):
      label_dir = os.path.join(path, label)
      if not os.path.isdir(label_dir):
        # Stray files next to the label folders are not labels; listing
        # them as a directory would raise.
        continue
      for name in os.listdir(label_dir):
        fold_path_im = os.path.join(label_dir, name)
        if '.jpg' in name:
          # The size is read from the opened image without converting the
          # pixels to an array, and the context manager releases the file
          # handle. PIL reports (width, height).
          with Image.open(fold_path_im) as im:
            width, height = im.size
          records.append((name, group, label, height, width))
        else:
          if remove_invalid:
            os.remove(fold_path_im)
          print('corrupted file for ',  fold_path_im)

  # Passing the column names keeps the schema stable even when nothing was found.
  return pd.DataFrame(records, columns=columns)

In [None]:
# Index the original dataset: one row per image with its split folder, label and size.
df = create_dataframe()

In [None]:
# Quick look at the first rows of the index (file, group, label, height, width).
df.head()

In [None]:
# Carve the index into its two original splits; .copy() gives each subset
# its own data so later edits cannot touch df.
df_train = df.loc[df['group'].eq('Training')].copy()
df_test = df.loc[df['group'].eq('Testing')].copy()

# Human-readable names, read back by the plotting cells through `dataset.name`.
df.name = 'total dataset'
df_train.name = 'train dataset'
df_test.name = 'test dataset'

datasets = [df, df_train, df_test]

In [None]:
# NOTE(review): this cell performs the same split as the cell right before it;
# running it again rebuilds the same objects, so it looks redundant.
# Split the index by original folder and tag each frame with a display name
# (read later through `dataset.name` when building plot titles).
df.name = 'total dataset'
df_train = df[df['group'] == 'Training'].copy()
df_train.name = 'train dataset'
df_test = df[df['group'] == 'Testing'].copy()
df_test.name = 'test dataset'
datasets = [df, df_train, df_test]

In [None]:
# Mean image height of each dataset, with the standard deviation as error bar.
fig, axes = plt.subplots(1, len(datasets), figsize=(10 * len(datasets), 6))
for i, dataset in enumerate(datasets):
    h = np.array(dataset['height'])
    axes[i].bar(0, np.mean(h), yerr = np.std(h), alpha=0.5, ecolor='black', capsize=10)
    # The chart is a single bar with an error bar, not a box plot: label it as such.
    axes[i].set_title('Mean height (+/- std) for ' + dataset.name)
    axes[i].set_ylabel('height (pixels)')
    axes[i].set_xticks([])  # the bar's x position (0) carries no meaning

plt.show()

In [None]:
# Mean image width of each dataset, with the standard deviation as error bar.
fig, axes = plt.subplots(1, len(datasets), figsize=(10 * len(datasets), 6))
for i, dataset in enumerate(datasets):
    h = np.array(dataset['width'])
    axes[i].bar(0, np.mean(h), yerr = np.std(h), alpha=0.5, ecolor='black', capsize=10)
    # The chart is a single bar with an error bar, not a box plot: label it as such.
    axes[i].set_title('Mean width (+/- std) for ' + dataset.name)
    axes[i].set_ylabel('width (pixels)')
    axes[i].set_xticks([])  # the bar's x position (0) carries no meaning

plt.show()

# Re-Split the Data

In [None]:
# Number of images that 3% of the whole dataset represents.
0.03 * df.shape[0]

In [None]:
# Share of the whole dataset, in percent, that 98 images represent.
# NOTE(review): 98 is presumably the rounded result of the 3% computation above - confirm.
(100 * 98)/df.shape[0]

- train = 88% of total dataset
- testing = 12% of total dataset
- 3% of the total dataset for each label in the test dataset (3% × 4 labels) => 12%


In [None]:
# Recreate the current hierarchy: root folder of the re-split dataset.
try:
  os.makedirs('/content/drive/MyDrive/machine_learning/projet/FinalDataset')
except FileExistsError:
  # Only an already-existing folder is expected here; any other failure
  # (missing mount, permissions, ...) must surface instead of being
  # reported as "already existing".
  print('already existing')

In [None]:
# Number of images drawn for EACH label when building the training set:
# 20% of the total number of indexed images, rounded to the nearest integer.
# NOTE(review): with four labels this targets up to 80% of the dataset for
# training - confirm this is the intended split.
im_per_label = round(0.2* df.shape[0])
im_per_label

In [None]:
import random
import shutil

In [None]:
def create_train(seed=None):
  """Build ``FinalDataset/Training`` by sampling images for every label.

  The training folder is wiped and recreated, then up to ``im_per_label``
  images of each label found in the global ``df`` are picked at random and
  re-saved under ``Training/<label>/``.

  Args:
    seed: optional seed for a private random generator, which makes the
      split reproducible. With ``None`` (default) the global ``random``
      state is used.

  Returns:
    list of str: names of the files written to the training folder, in the
    order they were saved.
  """
  source_root = '/content/drive/MyDrive/machine_learning/projet/Brain-Tumor-Classification-DataSet-master/'
  train_root = '/content/drive/MyDrive/machine_learning/projet/FinalDataset/Training'
  rng = random if seed is None else random.Random(seed)

  # Start from a clean folder so images of a previous split cannot linger.
  # Real deletion errors are left to propagate instead of being reported
  # as a missing folder.
  if os.path.isdir(train_root):
    shutil.rmtree(train_root)
  else:
    print("folder doesn't exist")

  os.makedirs(train_root)
  for label in ('no_tumor', 'glioma_tumor', 'meningioma_tumor', 'pituitary_tumor'):
    os.makedirs(os.path.join(train_root, label))

  already_in = []
  # Set twin of already_in for constant-time membership tests.
  # NOTE(review): uniqueness is tracked by file name only, so two different
  # images sharing a name (other label or other source folder) count as one.
  seen = set()
  for label in df['label'].unique():
    df_label = df[df['label'] == label]
    label_dir = os.path.join(train_root, label)
    os.makedirs(label_dir, exist_ok=True)  # labels outside the four above

    picked = 0
    n_rows = df_label.shape[0]
    # Visiting the rows once in random order selects images the same way as
    # redrawing until an unused name comes up, but it always terminates,
    # even when the label has fewer unused names than im_per_label.
    for j in rng.sample(range(n_rows), n_rows):
      if picked >= im_per_label:
        break
      im_name = df_label['file'].iloc[j]
      if im_name in seen:
        continue
      source = source_root + df_label['group'].iloc[j] + '/' + label + '/' + im_name
      with Image.open(source) as image:
        image.save(os.path.join(label_dir, im_name))
      already_in.append(im_name)
      seen.add(im_name)
      picked += 1

    if picked < im_per_label:
      print('only', picked, 'of', im_per_label, 'images available for', label)
  return already_in

In [None]:
def create_test(train_content):
  """Build ``FinalDataset/Testing`` from every image not used for training.

  The testing folder is wiped and recreated, then each row of the global
  ``df`` whose file name is absent from ``train_content`` is re-saved under
  ``Testing/<label>/``.

  Args:
    train_content: collection of file names already placed in the training
      set (as returned by ``create_train``).

  Returns:
    None.
  """
  source_root = '/content/drive/MyDrive/machine_learning/projet/Brain-Tumor-Classification-DataSet-master/'
  test_root = '/content/drive/MyDrive/machine_learning/projet/FinalDataset/Testing'

  # Start from a clean folder. Real deletion errors are left to propagate
  # instead of being reported as a missing folder.
  if os.path.isdir(test_root):
    shutil.rmtree(test_root)
  else:
    print("folder doesn't exist")

  os.makedirs(test_root)
  for label in ('no_tumor', 'glioma_tumor', 'meningioma_tumor', 'pituitary_tumor'):
    os.makedirs(os.path.join(test_root, label))

  # Set lookup keeps the scan linear in the number of images.
  # NOTE(review): the comparison is on file names only, so an image sharing
  # its name with a training image (other label or source folder) is skipped
  # too, and same-named images of one label overwrite each other.
  train_names = set(train_content)
  for name, group, label in zip(df['file'], df['group'], df['label']):
    if name in train_names:
      continue
    label_dir = os.path.join(test_root, label)
    os.makedirs(label_dir, exist_ok=True)  # labels outside the four above
    with Image.open(source_root + group + '/' + label + '/' + name) as image:
      image.save(os.path.join(label_dir, name))



In [None]:
# Build the training folder and keep the names of the files that went into it.
train_content = create_train()

In [None]:
# Build the testing folder from every image that was not picked for training.
create_test(train_content)

In [None]:
# Point the split folders at the rebuilt dataset so that create_dataframe()
# now indexes FinalDataset. Each variable points at the folder its name
# describes (train -> Training, test -> Testing).
train_path = '/content/drive/MyDrive/machine_learning/projet/FinalDataset/Training'
test_path = '/content/drive/MyDrive/machine_learning/projet/FinalDataset/Testing'

In [None]:
# Index the folders currently referenced by train_path / test_path.
df_final = create_dataframe()

In [None]:
# Quick look at the first rows of the new index.
df_final.head()

In [None]:
# (number of indexed images, number of columns)
df_final.shape

In [None]:
# Carve the rebuilt index into its two splits; .copy() gives each subset
# its own data so later edits cannot touch df_final.
df_train = df_final.loc[df_final['group'].eq('Training')].copy()
df_test = df_final.loc[df_final['group'].eq('Testing')].copy()

# Human-readable names, read back by the plotting cell through `dataset.name`.
df_final.name = 'total dataset'
df_train.name = 'train dataset'
df_test.name = 'test dataset'

datasets = [df_final, df_train, df_test]

In [None]:
# One pie chart per dataset showing the share of each label.
fig, axes = plt.subplots(1, len(datasets), figsize=(10 * len(datasets), 6))

for i, dataset in enumerate(datasets):
    ax = axes[i]

    # Label counts in alphabetical label order, expressed as a percentage of the dataset.
    label_counts = dataset['label'].value_counts().sort_index()
    label_counts = label_counts * 100 / dataset.shape[0]

    ax.pie(
        label_counts,
        labels=label_counts.index,
        autopct='%1.1f%%',
        startangle=140,
    )
    ax.set_title('Frequency Percentage of each label in ' + dataset.name)
    ax.set_ylabel('')

plt.show()