In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Fetch the NLTK resources this script relies on: the Punkt models back
# word_tokenize, and the stopwords corpus backs stopwords.words('english').
# Without the second download the stop-word step raises LookupError on a
# machine that has never fetched the corpus.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

# Load CSV 
file_path = "news_sample.csv"
textpd = pd.read_csv(file_path, encoding="utf-8")

# Define the clean_text function
# Patterns are compiled once at import time because clean_text is applied to
# every cell of several DataFrame columns.
_WHITESPACE_RE = re.compile(r'\s+')
_ISO_DATE_RE = re.compile(r'\d{4}-\d{2}-\d{2}')
_NUMERIC_DATE_RE = re.compile(r'\d{1,2}[./-]\d{1,2}[./-]\d{2,4}')
_MONTH_DATE_RE = re.compile(
    r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?'
    r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?'
    r'|dec(?:ember)?)\.? \d{1,2},? \d{4}'
)
_EMAIL_RE = re.compile(r'[\w._%+-]+@[\w.-]+\.[a-zA-Z]{2,}')
_URL_RE = re.compile(r'http[s]?://[^\s]+')
_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')


def clean_text(data):
    """Normalise a raw text cell for tokenisation.

    The text is lowercased, whitespace runs are collapsed to single spaces
    (and trimmed at both ends), and dates, e-mail addresses, URLs and numbers
    are replaced by the placeholders ``<DATE>``, ``<EMAIL>``, ``<URL>`` and
    ``<NUM>``.

    Args:
        data: The value to clean. Anything that is not a ``str`` (for
            example ``None`` or a float NaN) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``data`` is not a string.
    """
    if not isinstance(data, str):  # Handle NaN values safely
        return ""

    # Lowercase first so the month-name pattern only needs lowercase forms;
    # the uppercase placeholders are inserted afterwards and stay intact.
    data = data.lower()

    # Collapse whitespace runs; the month pattern relies on single spaces.
    data = _WHITESPACE_RE.sub(" ", data).strip()

    # ISO dates must be handled before the generic d/m/y pattern, which would
    # otherwise consume "18-01-02" out of "2018-01-02" and leave a stray "20".
    data = _ISO_DATE_RE.sub("<DATE>", data)
    data = _NUMERIC_DATE_RE.sub("<DATE>", data)
    data = _MONTH_DATE_RE.sub("<DATE>", data)

    # Replace emails
    data = _EMAIL_RE.sub("<EMAIL>", data)

    # Replace URLs
    data = _URL_RE.sub("<URL>", data)

    # Numbers go last so digits inside dates, e-mails and URLs are already
    # covered by their own placeholders.
    data = _NUMBER_RE.sub("<NUM>", data)

    return data

# Columns whose text is cleaned and pooled into a single corpus
columns_to_clean = ["id", "domain", "type", "url", "content", "title", "authors", "keywords", "meta_keywords", "meta_description", "tags", "summary"]

# Restrict to the columns actually present to avoid a KeyError
present_columns = [col for col in columns_to_clean if col in textpd.columns]


def _reduction_percent(before, after):
    """Return how much smaller *after* is than *before*, in percent (0.0 if *before* is 0)."""
    return (1 - after / before) * 100 if before else 0.0


# Clean each column. Missing values are mapped to "" before any string
# conversion: astype(str) would turn NaN into the literal text "nan", which
# would then be counted as a real word in the corpus.
for col in present_columns:
    textpd[col] = textpd[col].map(
        lambda value: "" if pd.isna(value) else clean_text(str(value))
    )

# Combine all cleaned text from the DataFrame columns (every cell is a str now)
full_text = " ".join(" ".join(textpd[col]) for col in present_columns)

# Tokenize the cleaned text
tokens = word_tokenize(full_text)
print("Tokens:",len(tokens))

# Fetch the English stop-word list
stop_words = set(stopwords.words('english'))

# Remove stop words
filtered_tokens = [w for w in tokens if w.lower() not in stop_words]
print("Tokens - remowed stop words:",len(filtered_tokens))
# Name kept for compatibility; at this point only stop words have been removed.
Reductionrate_after_stemming_and_stopwords = _reduction_percent(len(tokens), len(filtered_tokens))
print("Reduction rate after remowing stop words:",Reductionrate_after_stemming_and_stopwords)

ps = PorterStemmer()
stemmed_tokens = [ps.stem(w) for w in filtered_tokens]
print("Tokens - stemmed:",len(stemmed_tokens))

# Stemming maps each token to exactly one stem, so the token count cannot
# change; its effect is only visible in the number of distinct tokens.
vocabulary_filtered = set(filtered_tokens)
vocabulary_stemmed = set(stemmed_tokens)
print("Vocabulary - stop words removed:",len(vocabulary_filtered))
print("Vocabulary - stemmed:",len(vocabulary_stemmed))
Reductionrate_after_stemming = _reduction_percent(len(vocabulary_filtered), len(vocabulary_stemmed))
print("Vocabulary reduction rate after stemming:",Reductionrate_after_stemming)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emilv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Tokens: 218398
Tokens - remowed stop words: 140592
Reduction rate after stemming and stop words: 35.625784118902196
Tokens - stemmed: 140592
Reduction rate after stemming: 0.0
