In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [20]:
# packages

# standard
import numpy as np
import pandas as pd
import os
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# dicom
import pydicom as dicom
from sklearn.model_selection import train_test_split

In [18]:
# read data
df_train_main = pd.read_csv('../input/rsna-2024-lumbar-spine-degenerative-classification/train.csv')
df_train_label = pd.read_csv('../input/rsna-2024-lumbar-spine-degenerative-classification/train_label_coordinates.csv')
df_train_desc = pd.read_csv('../input/rsna-2024-lumbar-spine-degenerative-classification/train_series_descriptions.csv')
df_test_desc = pd.read_csv('../input/rsna-2024-lumbar-spine-degenerative-classification/test_series_descriptions.csv')
df_sub = pd.read_csv('../input/rsna-2024-lumbar-spine-degenerative-classification/sample_submission.csv')

In [None]:
# Usando melt para transformar colunas em linhas
df_unpivoted = df_train_main.melt(id_vars='study_id', var_name='condition', value_name='status')
frequency_table = df_unpivoted.groupby('condition')['status'].value_counts(normalize=True).unstack(fill_value=0)

# Resetando o índice para que 'condition' seja uma coluna novamente
frequency_table = frequency_table.reset_index()

frequency_table.rename(columns={'Moderate': 'moderate', 'Normal/Mild': 'normal_mild', 'Severe': 'severe'}, inplace=True)
df_sub['condition'] = df_sub['row_id'].str.extract(r'_(.*)')
df_sub = pd.merge(df_sub[['row_id', 'condition']],frequency_table, on='condition', how='inner')[['row_id', 'normal_mild', 'moderate', 'severe']]

In [None]:
# save submission file
df_sub.to_csv('submission.csv', index=False)

In [37]:
import os
import pandas as pd
import numpy as np
import pydicom
import cv2
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import TimeDistributed, Conv2D, MaxPooling2D, Flatten, LSTM, Dense, Input
from tensorflow.keras.optimizers import Adam

# Load the DataFrame
df_train_main = pd.read_csv('../input/rsna-2024-lumbar-spine-degenerative-classification/train.csv')

# Define the data generator function
def data_generator(df, image_dir, batch_size=10, target_size=(32, 32), images_per_batch=12):
    # Shuffle the data at the beginning of each epoch
    df = df.sample(frac=1).reset_index(drop=True)

    # Define the label mapping
    label_mapping = {
        "Normal/Mild": [1, 0, 0],
        "Moderate": [0, 1, 0],
        "Severe": [0, 0, 1]
    }

    def load_images_from_subfolders(study_path, num_images):
        images = []
        subfolders = [f.path for f in os.scandir(study_path) if f.is_dir()]
        subfolders = sorted(subfolders)[:min(len(subfolders), 4)]  # Consider at most 4 subfolders
        images_per_folder = max(1, num_images // len(subfolders))
        
        for folder in subfolders:
            files = [f for f in os.listdir(folder) if f.endswith('.dcm')]
            files = sorted(files)[:images_per_folder]
            
            for file in files:
                image_path = os.path.join(folder, file)
                # Load DICOM image
                dicom = pydicom.dcmread(image_path)
                image = dicom.pixel_array
                
                # Convert image to uint8 if necessary
                if image.dtype != np.uint8:
                    image = cv2.normalize(image, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
                image = cv2.resize(image, target_size)
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)  # Convert to RGB
                
                images.append(image)
                
                if len(images) >= num_images:
                    break
            if len(images) >= num_images:
                break
                
        return np.array(images)

    while True:
        for start in range(0, len(df), batch_size):
            end = min(start + batch_size, len(df))
            batch_df = df[start:end]
            
            images_batch = []
            labels_batch = []
            
            for _, row in batch_df.iterrows():
                study_id = row['study_id']
                original_label = row['spinal_canal_stenosis_l1_l2']
                one_hot_label = label_mapping[original_label]
                
                # Build the path to the study folder and get images
                study_path = os.path.join(image_dir, str(study_id))
                images = load_images_from_subfolders(study_path, images_per_batch)
                
                # Normalize images
                images = images.astype('float32') / 255.0
                
                images_batch.append(images)
                labels_batch.append(one_hot_label)
                
            images_batch = np.array(images_batch)
            labels_batch = np.array(labels_batch)
            
            yield images_batch, labels_batch

# Define the CNN-LSTM model
def create_model(input_shape):
    model = Sequential()
    model.add(Input(shape=input_shape))
    model.add(TimeDistributed(Conv2D(32, (3, 3), activation='relu')))
    model.add(TimeDistributed(MaxPooling2D((2, 2))))
    model.add(TimeDistributed(Conv2D(64, (3, 3), activation='relu')))
    model.add(TimeDistributed(MaxPooling2D((2, 2))))
    model.add(TimeDistributed(Flatten()))
    model.add(LSTM(64))
    model.add(Dense(3, activation='softmax'))
    
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Usage example
image_dir = '/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images'
batch_size = 10
input_shape = (12, 32, 32, 3)

# Create data generators
train_gen = data_generator(df_train_main[:int(0.9*len(df_train_main))], image_dir, batch_size=batch_size)
val_gen = data_generator(df_train_main[int(0.9*len(df_train_main)):], image_dir, batch_size=batch_size)

# Create the model
model = create_model(input_shape)

# Train the model
model.fit(train_gen, steps_per_epoch=180, validation_data=val_gen, validation_steps=20, epochs=10)


Epoch 1/10
[1m162/180[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m41s[0m 2s/step - accuracy: 0.9191 - loss: 0.2758

UnknownError: Graph execution error:

Detected at node PyFunc defined at (most recent call last):
<stack traces unavailable>
KeyError: nan
Traceback (most recent call last):

  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 270, in __call__
    ret = func(*args)

  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "/opt/conda/lib/python3.10/site-packages/keras/src/trainers/data_adapters/generator_data_adapter.py", line 52, in get_tf_iterator
    for batch in self.generator:

  File "/tmp/ipykernel_33/2666337837.py", line 68, in data_generator
    one_hot_label = label_mapping[original_label]

KeyError: nan


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_one_step_on_iterator_5379]