# In [None]:
# data_preprocessing.py

import functools
import os
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data packages at import time, in a fixed order.
# NOTE(review): 'averaged_perceptron_tagger' is not referenced by any function
# visible in this file -- confirm it is still needed.
for _resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

def load_data(filepath):
    """Load a tabular dataset from a CSV or JSON file.

    The format is chosen from the end of the path, compared
    case-insensitively so that names such as ``reviews.CSV`` are accepted.

    Args:
        filepath: Location of the file, as a ``str`` or any ``os.PathLike``
            object (for example ``pathlib.Path``).

    Returns:
        A ``pandas.DataFrame`` with the file contents, or ``None`` when the
        extension is unsupported or reading fails. Failures are printed to
        stdout instead of being raised, so callers must check for ``None``.
    """
    # The broad ``except`` is deliberate: this function's contract is
    # "report any load failure and return None", which callers rely on.
    try:
        # os.fspath handles str and PathLike alike; other types raise
        # TypeError, which is reported below like any other load failure.
        normalized = os.fspath(filepath).lower()
        if normalized.endswith('.csv'):
            return pd.read_csv(filepath)
        if normalized.endswith('.json'):
            return pd.read_json(filepath)
        raise ValueError("Unsupported file type. Only CSV and JSON are supported")
    except Exception as e:
        print(f"Error loading the file: {e}")
        return None

def clean_text(text):
    """Normalize a raw string for downstream tokenization.

    Steps, in order: lowercase the text; delete every run that starts with
    ``http``, ``www`` or ``https`` up to the next whitespace; delete
    ``@``-mentions and bare ``#`` characters (the word after ``#`` is kept);
    delete digit runs; delete every character in ``string.punctuation``;
    strip leading and trailing whitespace.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    # Non-string inputs are mapped to an empty string.
    if not isinstance(text, str):
        return ""

    # (pattern, flags) pairs, applied in this order; every match is deleted.
    deletions = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # urls
        (r'\@\w+|\#', 0),                            # mentions and '#' signs
        (r'\d+', 0),                                 # numbers
    )

    result = text.lower()
    for pattern, flags in deletions:
        result = re.sub(pattern, '', result, flags=flags)

    punctuation_table = str.maketrans('', '', string.punctuation)
    return result.translate(punctuation_table).strip()

def tokenize_text(text):
    """Split a string into word tokens with ``nltk.word_tokenize``.

    Args:
        text: The value to tokenize.

    Returns:
        The list produced by ``nltk.word_tokenize(text)``, or an empty list
        when ``text`` is not a ``str``.
    """
    # Guard clause: anything that is not a string yields no tokens.
    if not isinstance(text, str):
        return []
    return nltk.word_tokenize(text)

@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loaded once."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(tokens):
    """Drop English stop words from a sequence of tokens.

    The stop-word set is built on first use and cached, so applying this
    function to every row of a DataFrame does not reload the NLTK corpus
    for each row.

    Args:
        tokens: Iterable of token strings. Matching is exact, so tokens are
            compared as given (no case folding is applied here).

    Returns:
        A new list with the tokens that are not in the stop-word list, in
        their original order.
    """
    stop_words = _english_stop_words()
    return [token for token in tokens if token not in stop_words]

def lemmatize_text(tokens):
    """Lemmatize each token with NLTK's ``WordNetLemmatizer``.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list holding ``lemmatize(token)`` for every token, in order. The
        lemmatizer is called with its default arguments.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

def preprocess_text(text):
    """Run the full preprocessing pipeline on one piece of text.

    The pipeline is: ``clean_text`` -> ``tokenize_text`` ->
    ``remove_stopwords`` -> ``lemmatize_text``.

    Args:
        text: The raw value to preprocess.

    Returns:
        The resulting tokens joined by single spaces.
    """
    tokens = tokenize_text(clean_text(text))
    tokens = lemmatize_text(remove_stopwords(tokens))
    return " ".join(tokens)

def preprocess_dataframe(df, text_column):
    """Add a ``processed_text`` column built from ``text_column``.

    Args:
        df: DataFrame to process. It is modified in place.
        text_column: Name of the column that holds the raw text.

    Returns:
        The same ``df`` object, with ``processed_text`` holding the result of
        ``preprocess_text`` for every value of ``text_column``.

    Raises:
        ValueError: If ``text_column`` is not one of ``df``'s columns.
    """
    column_missing = text_column not in df.columns
    if column_missing:
        raise ValueError(f"Column '{text_column}' not found in DataFrame.")

    processed = df[text_column].apply(preprocess_text)
    df["processed_text"] = processed
    return df


def main():
    """Example run: load ``reviews.csv``, preprocess it, and print a preview.

    Every outcome is reported with ``print``; errors raised while processing
    are caught here and not propagated.
    """
    try:
        data = load_data("reviews.csv")

        # Nothing to do when loading failed (None) or the file had no rows.
        if data is None or data.empty:
            print("No data or an empty file was loaded. Please provide a csv file with reviews.")
            return

        # Adjust "text" to your column name
        processed_df = preprocess_dataframe(data, "text")
        print(processed_df.head())
    except ValueError as e:
        print(f"Error during processing: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Run the example pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
