In [None]:
import pandas as pd
from googletrans import Translator
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import glob
import os

# One-time setup: uncomment the two lines below to download the required NLTK resources
# nltk.download('stopwords')
# nltk.download('punkt')

# Lazily filled cache for the English stop-word set, so the set is built at
# most once instead of on every call (this function is applied per row).
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, building it once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with ``word_tokenize``; every token whose lowercase
    form is in the English stop-word list is dropped, and the remaining
    tokens are joined back together with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    stop_words = _english_stop_words()
    return ' '.join(
        word for word in word_tokenize(text) if word.lower() not in stop_words
    )

def translate_to_english(text):
    """Translate ``text`` to English, falling back to the input on failure.

    Args:
        text: The value to translate; the source language is auto-detected.

    Returns:
        The translated string, or ``text`` itself unchanged when it is not a
        string, is empty or whitespace-only, or the translation call raises.
    """
    # Nothing to translate: return before a Translator is created, so blank
    # cells (and non-string cells such as NaN/None) cost nothing and do not
    # crash on ``.strip()``.
    if not isinstance(text, str) or not text.strip():
        return text

    translator = Translator()
    try:
        return translator.translate(text, src='auto', dest='en').text
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the original
        # text so one bad row does not abort the whole batch.
        print(f"Translation failed for text: {text}. Error: {e}")
        return text


# Where the raw CSV files are read from, and where the results are written
input_folder = 'User Data'
output_folder = 'data'

# Create the destination directory up front; no error if it already exists
os.makedirs(output_folder, exist_ok=True)

# Collect every '*.csv' path directly inside the input folder
csv_pattern = os.path.join(input_folder, '*.csv')
csv_files = glob.glob(csv_pattern)

# Process the files one at a time
for csv_file in csv_files:
    # The result keeps the source file's name, placed in the output folder
    output_file = os.path.join(output_folder, os.path.basename(csv_file))

    # Load the current file
    data = pd.read_csv(csv_file)

    # Missing review bodies become empty strings, then each body is
    # passed through the translator
    data['Review Body'] = (
        data['Review Body']
        .fillna('')
        .apply(translate_to_english)
    )

    # Stop-word-free version of the translated text goes in its own column
    data['cleaned_body'] = data['Review Body'].apply(remove_stopwords)

    # Write the augmented table without the index column
    data.to_csv(output_file, index=False)

    print(f"Stopwords removed and cleaned data saved to {output_file}")
