In [5]:
import csv
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import csv
# Set the csv parser's maximum allowed field size to 10,000,000 so that
# very long text fields can be read.
csv.field_size_limit(10_000_000)


# Fetch the NLTK resources used below (stopword lists and WordNet data).
for _package in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_package)

# Shared lemmatizer and English stopword set used by preprocess_text().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Return *text* with English stopwords dropped and the rest lemmatized.

    The text is split on whitespace. A token is discarded when its
    lowercased form is in the module-level ``stop_words`` set; every other
    token is passed (with its original casing) to ``lemmatizer.lemmatize``.
    The surviving tokens are joined with single spaces.

    A falsy input (``None`` or an empty string) yields an empty string.
    """
    if not text:
        return ""

    kept = []
    for token in text.split():
        # The stopword test is case-insensitive; lemmatization is not.
        if token.lower() in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Input and output file names
input_file = 'output2.csv'
output_file = 'processed_output.csv'

# Columns holding free text to clean; any that are absent from the input
# header are skipped.
text_columns = ('title', 'abstract', 'body_text', 'body_text2')

# Read the whole input up front so tqdm can show progress against a known total.
# newline='' lets the csv module handle line breaks inside quoted fields itself,
# as the csv documentation requires for file objects passed to a reader.
with open(input_file, encoding='utf-8', newline='') as f_in:
    reader = csv.DictReader(f_in)
    rows = list(reader)
    # Take the header from the reader rather than from rows[0], so an input
    # with a header line but no data rows does not raise IndexError.
    # A completely empty file has no header at all (fieldnames is None).
    fieldnames = reader.fieldnames or []

# Every row produced by DictReader has a key for each header field, so the
# set of text columns that are present only needs to be worked out once.
present_columns = [column for column in text_columns if column in fieldnames]

# Clean the text columns of each row in place (the input file is already closed).
for row in tqdm(rows, desc="Processing rows"):
    for column in present_columns:
        row[column] = preprocess_text(row[column])

# Write the processed data to a new CSV file with the same columns as the input.
# NOTE: a row with more fields than the header makes DictWriter raise ValueError.
with open(output_file, 'w', newline='', encoding='utf-8') as f_out:
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)

    # Write the header (skipped only when the input had no header at all)
    if fieldnames:
        writer.writeheader()

    # Write the processed rows
    for row in tqdm(rows, desc="Writing to CSV"):
        writer.writerow(row)

print(f"Data has been successfully written to '{output_file}'.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\flori\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\flori\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\flori\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Processing rows: 100%|██████████████████████████████████████████████████████████| 35094/35094 [04:14<00:00, 137.92it/s]
Writing to CSV: 100%|██████████████████████████████████████████████████████████| 35094/35094 [00:05<00:00, 5959.88it/s]

Data has been successfully written to 'processed_output.csv'.



