In [17]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used further down: the stop-word list and WordNet
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# English stop words, kept as a set so membership checks are cheap
stop_words = set(stopwords.words('english'))

# Define preprocessing function
def _caption_normalizers():
    """Return the shared (stemmer, lemmatizer) pair, creating it on first use."""
    if not hasattr(_caption_normalizers, 'pair'):
        _caption_normalizers.pair = (PorterStemmer(), WordNetLemmatizer())
    return _caption_normalizers.pair


def preprocess_text(text):
    """Normalise one caption string into space-separated, stemmed tokens.

    Steps, in order: lower-case; delete URLs; turn newlines into spaces;
    delete every token containing a digit; collapse whitespace; delete every
    character that is not an ASCII letter or whitespace; drop tokens found in
    the module-level ``stop_words``; Porter-stem; WordNet-lemmatize.

    Parameters
    ----------
    text : str
        Raw caption. Any non-string value (for example the float NaN that
        pandas produces for an empty CSV cell) is treated as an empty caption.

    Returns
    -------
    str
        The cleaned caption, tokens joined by single spaces; ``''`` when the
        input is not a string or nothing survives the filtering.
    """
    # Missing captions arrive as NaN/None and have no .lower(); map them to ''
    # instead of aborting a whole DataFrame.apply() run.
    if not isinstance(text, str):
        return ''

    # 1. Lower case
    text = text.lower()

    # 2. Remove links
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # 3. Replace newlines with spaces
    text = text.replace('\n', ' ')

    # 4. Remove words containing digits
    text = re.sub(r'\w*\d\w*', '', text)

    # 5. Collapse runs of whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # 6. Remove special characters. They are deleted, not replaced by a space,
    #    so hyphenated words fuse ("x-ray" -> "xray").
    # NOTE(review): this runs before stop-word filtering, so apostrophe forms
    # ("don't" -> "dont") are only dropped if stop_words contains the stripped
    # spelling -- confirm this is intended.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 7. Remove stop words
    tokens = [word for word in text.split() if word not in stop_words]

    # 8-9. Stem, then lemmatize, reusing one stemmer/lemmatizer across calls
    stemmer, lemmatizer = _caption_normalizers()
    tokens = [stemmer.stem(word) for word in tokens]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

# Locations of the caption CSV for each dataset split
train_path = r'D:\archive\all_data\train\radiologytraindata.csv'
val_path = r'D:\archive\all_data\validation\radiologyvaldata.csv'
test_path = r'D:\archive\all_data\test\radiologytestdata.csv'

# Read the splits (reading the test split is currently disabled)
train_data = pd.read_csv(train_path)
val_data = pd.read_csv(val_path)
#test_data = pd.read_csv(test_path)

# Frame that gets preprocessed; right now it contains only the training rows
combined_data = pd.concat([train_data])  # val_data, test_data

# Clean every caption with preprocess_text
combined_data['caption'] = combined_data['caption'].map(preprocess_text)

# Preview the cleaned captions
print(combined_data['caption'].head())

# Slice the per-split frames back out of the combined frame by row position
train_data = combined_data.iloc[:len(train_data)]
#val_data = combined_data.iloc[len(train_data):len(train_data) + len(val_data)]
#test_data = combined_data.iloc[len(train_data) + len(val_data):]

print(train_data.shape)
#print(val_data.shape)
#print(test_data.shape)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ry981\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ry981\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    comput tomographi scan axial view show obliter...
1    bacteri contamin occur complet root canal trea...
2    patient residu paralysi hand poliomyel necessa...
3                       panoram radiograph immedi load
4    plain abdomen xray multipl air level midabdome...
Name: caption, dtype: object
(65450, 3)


In [18]:
import pandas as pd

# Location of the training caption CSV
train_path = r'D:\archive\all_data\train\radiologytraindata.csv'

# Read it again from disk. Note that this rebinds train_data to the raw,
# unprocessed captions.
train_data = pd.read_csv(train_path)

# Show the first five rows as a sanity check on the load
print(train_data.head(5))


           id                                    name  \
0  ROCO_00002          PMC4083729_AMHSR-4-14-g002.jpg   
1  ROCO_00003       PMC2837471_IJD2009-150251.001.jpg   
2  ROCO_00004  PMC2505281_11999_2007_30_Fig6_HTML.jpg   
3  ROCO_00005       PMC3745845_IJD2013-683423.005.jpg   
4  ROCO_00007   PMC4917066_amjcaserep-17-301-g001.jpg   

                                             caption  
0   Computed tomography scan in axial view showin...  
1   Bacterial contamination occurred after comple...  
2   The patient had residual paralysis of the han...  
3    Panoramic radiograph after immediate loading.\n  
4   Plain abdomen x-ray: Multiple air levels at t...  


In [19]:
import pandas as pd
import os
from PIL import Image
from PIL import UnidentifiedImageError
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

In [1]:
# Install required libraries
!pip install tensorflow opencv-python-headless

# Correct import statement
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Directory of radiology images under the dataset's `test` folder; every entry
# in it is handed to load_images() further down in this cell.
img_dir = r'D:\archive\all_data\test\radiology\images'

# Function to load images
import os
import cv2
import numpy as np

def load_images(img_dir, img_size=(224, 224)):
    """Load every readable image in ``img_dir`` as a resized RGB array.

    Parameters
    ----------
    img_dir : str
        Directory to scan. Sub-directories are ignored; entries that
        ``cv2.imread`` cannot decode are skipped and reported in one warning.
    img_size : tuple of int, optional
        ``(width, height)`` passed to ``cv2.resize``.

    Returns
    -------
    images : numpy.ndarray
        Loaded images stacked along axis 0, shape ``(n, height, width, 3)``,
        channels in RGB order. Shape ``(0, height, width, 3)`` when nothing
        could be loaded.
    labels : numpy.ndarray
        For each loaded image, the part of its filename before the first
        underscore.
    """
    import warnings

    images = []
    labels = []
    skipped = []

    # Sorted so the image/label order is reproducible; os.listdir gives no
    # ordering guarantee.
    for img_name in sorted(os.listdir(img_dir)):
        img_path = os.path.join(img_dir, img_name)
        if not os.path.isfile(img_path):
            continue

        img = cv2.imread(img_path)
        # imread signals failure by returning None rather than raising;
        # without this check cv2.resize would abort the whole load.
        if img is None:
            skipped.append(img_name)
            continue

        img = cv2.resize(img, img_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        images.append(img)
        # Assuming label is part of the filename before an underscore
        labels.append(img_name.split('_')[0])

    if skipped:
        warnings.warn(
            "load_images skipped %d unreadable file(s) in %r, e.g. %r"
            % (len(skipped), img_dir, skipped[0])
        )

    if not images:
        # Keep the 4-D layout so downstream shape-dependent code still works.
        empty = np.empty((0, img_size[1], img_size[0], 3), dtype=np.uint8)
        return empty, np.array(labels)

    images = np.array(images)
    labels = np.array(labels)
    return images, labels

# Load the images
images, labels = load_images(img_dir)

# Normalize the images to [0, 1]. Cast to float32 and divide in place:
# `images / 255.0` on an integer pixel array would allocate a float64 copy,
# twice the size of the float32 one.
images = images.astype(np.float32)
images /= 255.0

# Print shapes to verify
print("Original images shape:", images.shape)
print("Original labels shape:", labels.shape)

# Data Augmentation
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# No datagen.fit(images) call: fit() only computes statistics for
# featurewise_center / featurewise_std_normalization / zca_whitening, and none
# of those options is enabled above.

# Example of how to use the datagen: draw one augmented batch
augmented_images, augmented_labels = next(datagen.flow(images, labels, batch_size=32))

# Print shapes to verify augmentation
print("Augmented images shape:", augmented_images.shape)
print("Augmented labels shape:", augmented_labels.shape)


Original images shape: (8176, 224, 224, 3)
Original labels shape: (8176,)
Augmented images shape: (32, 224, 224, 3)
Augmented labels shape: (32,)
