In [2]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize

# Ensure nltk punkt tokenizer is downloaded
nltk.download('punkt')

# Load CSV (Ensure correct column names)
file_path = "news_sample.csv"
textpd = pd.read_csv(file_path, encoding="utf-8")

with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Define the clean_text function
def clean_text(data):
    if not isinstance(data, str):  # Handle NaN values safely
        return ""

    # Convert text to lowercase
    data = data.lower()

    # Remove extra whitespace
    data = re.sub(r'\s+', " ", data)

    # Replace dates
    data = re.sub(r'\d{1,2}[./-]\d{1,2}[./-]\d{2,4}', "<DATE>", data)
    data = re.sub(r'(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec).? \d{1,2},? \d{4}', "<DATE>", data)
    data = re.sub(r'\d{4}-\d{2}-\d{2}', "<DATE>", data)

    # Replace emails
    data = re.sub(r'[\w._%+-]+@[\w.-]+\.[a-zA-Z]{2,}', "<EMAIL>", data)

    # Replace URLs
    data = re.sub(r'http[s]?://[^\s]+', "<URL>", data)

    # Replace numbers
    data = re.sub(r'\d+(\.\d+)?', "<NUM>", data)

    return data

# Define relevant text columns
columns_to_clean = ["content", "title", "authors", "tags", "summary", "meta_description", "keywords", "meta_keywords"]

# Apply cleaning to each column
for col in columns_to_clean:
    if col in textpd.columns:  # Avoid KeyError if column is missing
        textpd[col] = textpd[col].astype(str).apply(clean_text)

# Print DataFrame preview after cleaning
print(textpd.head())

# Clean the raw file text separately
cleaned_text = clean_text(text)

# Combine all cleaned text from DataFrame columns
full_text = " ".join(textpd[col].dropna().astype(str).str.cat(sep=" ") for col in columns_to_clean if col in textpd.columns)

# Print sample of cleaned text
print("\n===== FULL TEXT (from DataFrame) =====\n")
print(full_text[:1000])  # Print first 1000 characters

print("\n===== CLEANED RAW FILE TEXT =====\n")
print(cleaned_text[:1000])  # Print first 1000 characters

# Tokenize the cleaned text
tokens_cleaned = word_tokenize(cleaned_text)
tokens_full = word_tokenize(full_text)

# Print token samples
print("\n===== TOKENIZED CLEANED TEXT SAMPLE =====\n", tokens_cleaned[:50])
print("\n===== TOKENIZED FULL TEXT SAMPLE =====\n", tokens_full[:50])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simonhvidtfeldt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


   Unnamed: 0   id                domain        type  \
0           0  141               awm.com  unreliable   
1           1  256     beforeitsnews.com        fake   
2           2  700           cnnnext.com  unreliable   
3           3  768               awm.com  unreliable   
4           4  791  bipartisanreport.com   clickbait   

                                                 url  \
0  http://awm.com/church-congregation-brings-gift...   
1  http://beforeitsnews.com/awakening-start-here/...   
2  http://www.cnnnext.com/video/18526/never-hike-...   
3  http://awm.com/elusive-alien-of-the-sea-caught...   
4  http://bipartisanreport.com/2018/01/21/trumps-...   

                                             content  \
0  sometimes the power of christmas will make you...   
1  awakening of <NUM> strands of dna – “reconnect...   
2  never hike alone: a friday the <NUM>th fan fil...   
3  when a rare shark was caught, scientists were ...   
4  donald trump has the unnerving ability to a

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/simonhvidtfeldt/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.12/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.12/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.12/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
def virker_lortet():
    print("Lortet virker")

virker_lortet()

For better comparison has frequency analysis been done for both my own cleaning function and the clean-text module function. The results from these functions are very similar. 