In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import torchvision.transforms as transforms
from sklearn.preprocessing import LabelEncoder

In [2]:
# Ordered (directory marker, class label) pairs. The first marker found in a
# directory's path (relative to the dataset root) decides the label, which is
# the same precedence the original if/elif chain used.
CLASS_DIR_LABELS = (
    ("lung_aca", "Lung Adenocarcinoma"),
    ("lung_n", "Lung Benign Tissue"),
    ("lung_scc", "Lung Squamous Cell Carcinoma"),
    ("colon_n", "Colon Benign Tissue"),
    ("colon_aca", "Colon Adenocarcinoma"),
)


def collect_labeled_files(root, class_dir_labels=CLASS_DIR_LABELS):
    """Walk ``root`` and return parallel lists of file paths and class labels.

    Parameters
    ----------
    root : str
        Directory to walk recursively.
    class_dir_labels : sequence of (marker, label) pairs
        A directory is assigned the label of the first ``marker`` that occurs
        as a substring of its path relative to ``root``.

    Returns
    -------
    (list of str, list of str)
        ``paths`` and ``labels``, always the same length and index-aligned.
        Files in directories that match no marker are skipped, as are
        ``.DS_Store`` files.
    """
    paths, names = [], []
    for dirname, _, filenames in os.walk(root):
        # Match on the path below `root` so that the location of the dataset
        # on disk (e.g. a parent folder named "lung_n...") cannot affect labels.
        relative_dir = os.path.relpath(dirname, root)
        label = next(
            (name for marker, name in class_dir_labels if marker in relative_dir),
            None,
        )
        if label is None:
            # No class for this directory: skip its files entirely so the two
            # output lists can never drift out of alignment.
            continue
        for filename in filenames:
            if filename.lower().endswith('.ds_store'):
                continue
            paths.append(os.path.join(dirname, filename))
            names.append(label)
    return paths, names


# `size` is not populated in this cell; it is kept so that any later cell that
# references it still finds the name defined.
size = []
file, labels = collect_labeled_files('lung_colon_image_set')

In [3]:
# Wrap the two parallel lists as named Series, then place them side by side
# so each row pairs one image path with its class label.
Sfile = pd.Series(file, name="Filepaths")
Slabels = pd.Series(labels, name="Labels")
data = pd.concat((Sfile, Slabels), axis="columns")

In [4]:
# Preview the first rows of the file-path / label table.
data.head()

Unnamed: 0,Filepaths,Labels
0,lung_colon_image_set/lung_image_sets/lung_aca/...,Lung Adenocarcinoma
1,lung_colon_image_set/lung_image_sets/lung_aca/...,Lung Adenocarcinoma
2,lung_colon_image_set/lung_image_sets/lung_aca/...,Lung Adenocarcinoma
3,lung_colon_image_set/lung_image_sets/lung_aca/...,Lung Adenocarcinoma
4,lung_colon_image_set/lung_image_sets/lung_aca/...,Lung Adenocarcinoma


In [5]:
# Summarize the index range, column dtypes, and non-null counts of `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Filepaths  25000 non-null  object
 1   Labels     25000 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB


In [6]:
# Count missing values in each column.
data.isna().sum()

Filepaths    0
Labels       0
dtype: int64

In [7]:
# Number of rows (image files) per class label.
data['Labels'].value_counts()

Lung Adenocarcinoma             5000
Lung Benign Tissue              5000
Lung Squamous Cell Carcinoma    5000
Colon Benign Tissue             5000
Colon Adenocarcinoma            5000
Name: Labels, dtype: int64

In [None]:
# ToTensor converts a PIL image into a channel-first (C, H, W) tensor.
# `transform` is defined at notebook scope because later cells reuse it.
transform = transforms.Compose([transforms.ToTensor()])

# Number of example images shown per class (one grid row per example).
samples_per_class = 2
grouped = data.groupby('Labels').head(samples_per_class)
class_names = sorted(grouped['Labels'].unique())

# One column per class, one row per example. squeeze=False keeps `axes` 2-D
# even when there is a single class or a single example per class.
n_cols = max(len(class_names), 1)
fig, axes = plt.subplots(
    samples_per_class,
    n_cols,
    figsize=(2 * n_cols, 2 * samples_per_class),
    squeeze=False,
)

# Hide all axes up front so grid cells that receive no image stay blank.
for ax in axes.ravel():
    ax.axis('off')

for col, class_name in enumerate(class_names):
    class_paths = grouped.loc[grouped['Labels'] == class_name, 'Filepaths']
    for row, filepath in enumerate(class_paths):
        # The context manager closes the image file once it has been
        # converted to an array.
        with Image.open(str(filepath)) as image:
            array = transform(image).numpy()

        ax = axes[row, col]
        # imshow expects channel-last (H, W, C), so move the channel axis.
        ax.imshow(array.transpose(1, 2, 0))
        # Label via the title: an annotation anchored at a data coordinate
        # outside the axes (as before, below the image) is clipped by
        # matplotlib's default `annotation_clip` and never drawn.
        ax.set_title(class_name, color='red', fontsize=8)

# Adjust the spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
# Fit an integer encoding for the class labels.
encoder = LabelEncoder()
encoded = encoder.fit_transform(data['Labels'])

# Map each integer code back to the class name it stands for.
class_codes = encoder.transform(encoder.classes_)
reverse_mapping = {code: name for code, name in zip(class_codes, encoder.classes_)}

# Print the mapping
line_template = "Number: {}, Label: {}"
for number in reverse_mapping:
    print(line_template.format(number, reverse_mapping[number]))

In [None]:
# Stage 1: keep 80% of the rows for training, stratified by class label.
traindf, holdoutdf = train_test_split(
    data,
    train_size=0.8,
    stratify=data['Labels'],
    random_state=25,
)
# Stage 2: split the remaining rows evenly into validation and test sets,
# again stratified by class label.
validationdf, testdf = train_test_split(
    holdoutdf,
    train_size=0.5,
    stratify=holdoutdf['Labels'],
    random_state=25,
)

In [None]:
# Per-class row counts in the training split.
traindf["Labels"].value_counts()

In [None]:
# Per-class row counts in the validation split.
validationdf['Labels'].value_counts()

In [None]:
# Per-class row counts in the test split.
testdf['Labels'].value_counts()

In [None]:
def resize_image(image_path, new_width, new_height):
    """Open an image file and return a resized copy.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file to open.
    new_width, new_height : int
        Target size passed to ``Image.resize`` as ``(new_width, new_height)``.

    Returns
    -------
    PIL.Image.Image
        A new image of the requested size; the source file is closed before
        this function returns.
    """
    # `resize` returns a separate image object, so the source file can be
    # closed as soon as the resized copy exists.
    with Image.open(image_path) as image:
        return image.resize((new_width, new_height))

# Target dimensions shared by every resized image.
new_height = 228
new_width = 228

# Resize each file listed in `Filepaths`, then store the results as a new
# column; the extra positional arguments are forwarded to resize_image.
resized_images = data['Filepaths'].apply(resize_image, args=(new_width, new_height))
data['resized_image'] = resized_images

In [None]:
# Work on a copy so the added column does not touch `data`.
d = data.copy()

# NOTE(review): every image is decoded at its original resolution and kept in
# memory as a numpy array; for a large dataset consider loading lazily.
vector = []
for filepath in d['Filepaths']:
    # The context manager closes each image file once it has been converted.
    with Image.open(str(filepath)) as image:
        # `transform` (defined in an earlier cell) converts the PIL image to
        # a tensor, which is then stored as a numpy array.
        vector.append(transform(image).numpy())

# `pd.Series` (capital S) is the constructor; `pd.series` does not exist.
# Reusing d's index keeps each array on the row of the file it came from.
vectorS = pd.Series(vector, index=d.index, name="Vector")
d['Vector'] = vectorS
d.head()