In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import os

# Make sure every NLTK data package this script relies on is available locally
for _nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_package)

# Function for text preprocessing
# The helpers below hold objects that are identical for every call, so they
# are created once instead of once per caption.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS = None  # filled on first use by _get_stop_words()
_LEMMATIZER = None  # filled on first use by _get_lemmatizer()


def _get_stop_words():
    """Return the English stopword set, reading the corpus only on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def _get_lemmatizer():
    """Return a shared WordNetLemmatizer, creating it only on first use."""
    global _LEMMATIZER
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _LEMMATIZER


def preprocess_text(text: str) -> str:
    """Normalize a piece of text into a space-separated string of lemmas.

    Each token produced by ``word_tokenize`` is lowercased, stripped of every
    character in ``string.punctuation``, dropped unless what remains is purely
    alphabetic, dropped if it is an English stopword, and finally lemmatized
    with ``WordNetLemmatizer`` (default part of speech).

    Args:
        text: The raw text to clean.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        no token survives the filters.
    """
    stop_words = _get_stop_words()
    lemmatize = _get_lemmatizer().lemmatize

    words = []
    for token in word_tokenize(text):
        # Lowercase first, then delete punctuation characters from the token.
        cleaned = token.lower().translate(_PUNCT_TABLE)

        # Pure-punctuation tokens are now empty and numeric/mixed tokens are
        # not alphabetic; both are discarded here.
        # NOTE(review): contraction pieces such as "n't" become "nt" after
        # stripping and may get past the stopword filter - confirm whether
        # that is acceptable for the downstream model.
        if not cleaned.isalpha():
            continue
        if cleaned in stop_words:
            continue

        words.append(lemmatize(cleaned))

    # Join the words back into a string
    return ' '.join(words)

# File paths
caption_file = r'C:\Users\gowsh\OneDrive\Desktop\Building AI Model\dataset\captions.txt'
output_file = r'C:\Users\gowsh\OneDrive\Desktop\Building AI Model\dataset\preprocessed_captions.txt'

# Check if the caption file exists
if not os.path.isfile(caption_file):
    print(f"Error: Caption file '{caption_file}' not found.")
else:
    # Read captions from the file
    with open(caption_file, 'r', encoding='utf-8') as file:
        captions = file.readlines()

    # preprocessed_captions holds (img_id, cleaned caption) pairs.
    # original_lines holds the matching raw input line for each pair; it is
    # appended to in lockstep so the two lists stay aligned even when a
    # malformed line is skipped (indexing back into `captions` would not).
    preprocessed_captions = []
    original_lines = []
    for caption in captions[1:]:  # Skip the header line
        stripped_line = caption.strip()

        # Split on the first comma only: "<image id>,<caption text>".
        # Any further commas remain part of the caption text.
        parts = stripped_line.split(',', 1)
        if len(parts) != 2:
            print(f"Error: Invalid format in line: {stripped_line}")
            continue

        img_id, caption_text = parts
        preprocessed_caption = preprocess_text(caption_text)
        preprocessed_captions.append((img_id, preprocessed_caption))
        original_lines.append(stripped_line)

    # Save preprocessed captions to a new file, one "<image id>,<caption>" per line
    with open(output_file, 'w', encoding='utf-8') as file:
        for img_id, preprocessed_caption in preprocessed_captions:
            file.write(f"{img_id},{preprocessed_caption}\n")

    print(f"preprocessed captions saved to '{output_file}' successfully.")

    # Print a limited number of preprocessed captions for demonstration
    print("Sample of preprocessed captions:")
    sample_size = 10  # Print only the first 10 for demonstration
    for original_line, (img_id, preprocessed_caption) in zip(
        original_lines[:sample_size], preprocessed_captions[:sample_size]
    ):
        print(f"Original Caption: {original_line}")
        print(f"preprocessed Caption: {img_id}\t{preprocessed_caption}")
        print()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gowsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gowsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gowsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


preprocessed captions saved to 'C:\Users\gowsh\OneDrive\Desktop\Building AI Model\dataset\preprocessed_captions.txt' successfully.
Sample of preprocessed captions:
Original Caption: 1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .
preprocessed Caption: 1000268201_693b08cb0e.jpg	child pink dress climbing set stair entry way

Original Caption: 1000268201_693b08cb0e.jpg,A girl going into a wooden building .
preprocessed Caption: 1000268201_693b08cb0e.jpg	girl going wooden building

Original Caption: 1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
preprocessed Caption: 1000268201_693b08cb0e.jpg	little girl climbing wooden playhouse

Original Caption: 1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .
preprocessed Caption: 1000268201_693b08cb0e.jpg	little girl climbing stair playhouse

Original Caption: 1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin .
p