**Pre-Processing Pipeline**

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import swifter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

# Fetch the NLTK data used below. 'punkt_tab' is requested in addition to
# 'punkt' because recent NLTK releases load the word_tokenize models from
# 'punkt_tab'; older releases still read 'punkt'.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Shared English stemmer and stop-word lookup (a set, for fast membership tests).
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

# Literal <code>...</code> spans. DOTALL lets '.' cross line breaks so that
# multi-line snippets are matched as well as single-line ones.
code_snippet_pattern = re.compile(r'<code>.*?</code>', re.DOTALL)
# http:// and https:// links, or bare "www." hosts, up to the next whitespace.
url_pattern = re.compile(r'https?://\S+|www\.\S+')
# Simple e-mail address shape: local part, '@', domain, '.', suffix.
email_pattern = re.compile(r'\b[\w\.-]+@[\w\.-]+\.\w+\b')
# Stand-alone runs of digits.
number_pattern = re.compile(r'\b\d+\b')
# Any run of characters outside ASCII a-z / A-Z.
non_alpha_pattern = re.compile(r'[^a-zA-Z]+')

def preprocess_text(text):
    """Turn raw (possibly HTML) text into a space-joined string of stemmed tokens.

    The text is parsed as HTML, ``<code>`` and ``<pre>`` elements are dropped,
    and the remaining visible text is stripped of code spans, URLs, e-mail
    addresses, numbers and every non-ASCII-letter character. It is then
    lower-cased, tokenized, filtered against the English stop-word set and
    stemmed with the module-level stemmer.

    Args:
        text: The raw text. Null values (``None`` / ``NaN``) produce ``""``;
            other non-string values are converted with ``str()`` first.

    Returns:
        The surviving stems joined by single spaces (possibly ``""``).
    """
    if pd.isnull(text):
        return ""

    # Cells parsed as numbers by pandas are not strings; coerce them so the
    # HTML parser does not fail on them.
    if not isinstance(text, (str, bytes)):
        text = str(text)

    soup = BeautifulSoup(text, 'lxml')
    for code_block in soup.find_all(['code', 'pre']):
        code_block.decompose()
    # Join text nodes with a space: without a separator the words of adjacent
    # elements are glued together ("<p>foo</p><p>bar</p>" -> "foobar").
    text = soup.get_text(separator=' ')

    text = code_snippet_pattern.sub(' ', text)
    text = url_pattern.sub(' ', text)
    text = email_pattern.sub(' ', text)
    text = number_pattern.sub(' ', text)
    text = non_alpha_pattern.sub(' ', text)
    text = text.lower()

    tokens = word_tokenize(text)
    # Keep alphabetic, non-stop-word tokens longer than one character, then stem.
    tokens = [
        stemmer.stem(tok)
        for tok in tokens
        if tok.isalpha() and tok not in stop_words and len(tok) > 1
    ]
    return ' '.join(tokens)

def preprocess_tags(tags_text):
    """Normalise a pipe-delimited tag string with the shared text pipeline.

    Args:
        tags_text: Tags separated by ``|``; a null value yields ``""``.

    Returns:
        The result of ``preprocess_text`` applied to the tags once every
        ``|`` has been replaced by a space.
    """
    if pd.isnull(tags_text):
        return ""
    # Splitting on '|' and re-joining with spaces is the same as replacing
    # each '|' with a single space.
    spaced_tags = " ".join(tags_text.split("|"))
    return preprocess_text(spaced_tags)

def preprocess(input_path, output_path):
    """Read a CSV, clean its text columns and write the combined result.

    The ``title`` and ``body`` columns are cleaned with ``preprocess_text`` and
    the ``tags`` column with ``preprocess_tags``. The three cleaned strings are
    joined with spaces, whitespace runs are collapsed, and the ``id`` and
    ``concatenated_text`` columns are written to ``output_path`` without the
    index.

    Args:
        input_path: Location of the input CSV (needs ``id``, ``title``,
            ``body`` and ``tags`` columns).
        output_path: Location the output CSV is written to.
    """
    def _squash_whitespace(value):
        # Collapse every whitespace run to one space and trim both ends.
        return re.sub(r'\s+', ' ', value).strip()

    frame = pd.read_csv(input_path)

    frame['processed_title'] = frame['title'].swifter.apply(preprocess_text)
    frame['processed_body'] = frame['body'].swifter.apply(preprocess_text)
    frame['processed_tags'] = frame['tags'].swifter.apply(preprocess_tags)

    joined = (
        frame['processed_title']
        + ' '
        + frame['processed_body']
        + ' '
        + frame['processed_tags']
    )
    frame['concatenated_text'] = joined.apply(_squash_whitespace)

    result = frame[['id', 'concatenated_text']]
    result.to_csv(output_path, index=False)




In [1]:
# Example usage:
# preprocess('input_file.csv', 'output_file.csv')