In [None]:
#============ TASK 1 ============

import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter

# Ensure the NLTK resources used by this notebook are available locally.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer used by word_tokenize from the
# 'punkt_tab' resource instead of 'punkt'; fetch both so the notebook runs on
# either generation of the library.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the sample CSV used for Task 1
file_path = "news_sample.csv"
textpd = pd.read_csv(file_path, encoding="utf-8")

# Define the clean_text function
def clean_text(data):
    """Normalise raw text and replace dates, e-mails, URLs and numbers with tags.

    The text is lower-cased and every run of whitespace is collapsed to a
    single space before the substitutions run. Placeholders are inserted in
    upper case: ``<DATE>``, ``<EMAIL>``, ``<URL>`` and ``<NUM>``.

    Parameters
    ----------
    data : object
        Text to clean. Any value that is not a ``str`` (for example a float
        NaN coming from pandas) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``data`` is not a string.
    """
    if not isinstance(data, str):  # Handle NaN values safely
        return ""

    data = data.lower()
    data = re.sub(r'\s+', " ", data)

    # Year-first dates (2020-01-15, 2020/1/15) must be handled before the
    # day-first pattern; otherwise the day-first pattern matches the tail
    # "20-01-15" and leaves a stray "20" in front of the tag.
    data = re.sub(r'(?<!\d)\d{4}[./-]\d{1,2}[./-]\d{1,2}(?!\d)', "<DATE>", data)

    # Day/month-first numeric dates (12.01.2020, 1/2/20). The look-arounds
    # keep the pattern from firing in the middle of a longer digit run.
    data = re.sub(r'(?<!\d)\d{1,2}[./-]\d{1,2}[./-]\d{2,4}(?!\d)', "<DATE>", data)

    # Written dates: abbreviated or full month name, optional dot, day,
    # optional comma, four-digit year. \b stops matches inside other words
    # (e.g. the "jan" in "trojan").
    data = re.sub(
        r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?'
        r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
        r'\.? \d{1,2},? \d{4}(?!\d)',
        "<DATE>",
        data,
    )

    data = re.sub(r'[\w._%+-]+@[\w.-]+\.[a-zA-Z]{2,}', "<EMAIL>", data)
    data = re.sub(r'http[s]?://[^\s]+', "<URL>", data)

    # Runs last so that digits belonging to dates are already consumed.
    data = re.sub(r'\d+(\.\d+)?', "<NUM>", data)
    return data

# Columns whose text is cleaned and pooled into the Task 1 corpus
columns_to_clean1 = ["id", "domain", "type", "url", "content", "title", "authors", "keywords", "meta_keywords", "meta_description", "tags", "summary"]


def _as_text(value):
    """Return ``value`` as a string, mapping missing values (NaN/None) to ``""``."""
    if isinstance(value, str):
        return value
    if pd.isna(value):
        return ""
    return str(value)


# Apply cleaning to each column
for col in columns_to_clean1:
    if col in textpd.columns:  # Avoid KeyError if column is missing
        # A plain astype(str) would turn every NaN into the literal word
        # "nan", which would then be tokenized and counted as real text.
        # Convert value by value so missing cells become empty strings while
        # non-string cells (e.g. numeric ids) are still stringified.
        textpd[col] = textpd[col].map(_as_text).apply(clean_text)


# Combine all cleaned text from the DataFrame columns into one string
full_text = " ".join(
    textpd[col].astype(str).str.cat(sep=" ")
    for col in columns_to_clean1
    if col in textpd.columns
)

# Tokenize the cleaned text
tokens1 = word_tokenize(full_text)
print("Tokens:",len(tokens1))

# Fetch the English stop-word list
stop_words = set(stopwords.words('english'))

# Remove stop words (compared case-insensitively)
filtered_tokens1 = [w for w in tokens1 if w.lower() not in stop_words]
print("Tokens - remowed stop words:",len(filtered_tokens1))
# Percentage of tokens dropped by stop-word removal. Despite its name this
# value does not involve stemming; the name is kept so other cells that read
# it keep working. The guard avoids a ZeroDivisionError on an empty corpus.
Reductionrate_after_stemming_and_stopwords = (1-len(filtered_tokens1)/len(tokens1))*100 if tokens1 else 0.0
print("Reduction rate after remowing stop words:",Reductionrate_after_stemming_and_stopwords)

ps = PorterStemmer()
stemmed_tokens1 = [ps.stem(w) for w in filtered_tokens1]
print("Tokens - stemmed:",len(stemmed_tokens1))

# Distinct tokens at each stage
vocabulary1 = set(tokens1)
filtered_vocabulary1 = set(filtered_tokens1)
stemmed_vocabulary1 = set(stemmed_tokens1)

# Stemming maps each token to exactly one stem, so the token count never
# changes and a count-based rate is always 0. The effect of stemming shows up
# in the number of distinct tokens, so the rate is computed on vocabulary size.
Reductionrate_after_stemming = (1-len(stemmed_vocabulary1)/len(filtered_vocabulary1))*100 if filtered_vocabulary1 else 0.0
print("Vocabulary reduction rate after stemming:",Reductionrate_after_stemming)

print("Unique tokens:",len(vocabulary1))
print("Unique tokens after remowing stopwords:",len(filtered_vocabulary1))
print("Unique tokens after stemming:",len(stemmed_vocabulary1))



In [None]:
#============ TASK 2 ============

# Task 2 input: the large corpus, read in fixed-size batches of rows.
# Note: this rebinds file_path, which the Task 1 cell set to the sample file.
file_path = "995,000_rows.csv"
chunksize = 25_000  # rows per chunk handed to pd.read_csv

# Define function to tokenize, remove stopwords, and stem
def tokenize_and_stem(text):
    """Tokenize ``text``, drop stop words and non-alphabetic tokens, and stem the rest.

    Relies on the module-level ``ps`` (a ``PorterStemmer``) and ``stop_words``
    (a set of English stop words) created in the Task 1 cell, so that cell
    must have been run first.

    Parameters
    ----------
    text : str
        Text to process. Non-string values are treated as missing, matching
        the behaviour of ``clean_text``.

    Returns
    -------
    list of str
        The stems of the kept tokens, in their original order.
    """
    if not isinstance(text, str):
        return []
    # NOTE(review): word_tokenize presumably splits placeholders such as
    # "<NUM>" into "<", "NUM", ">", so only the bare word passes isalpha()
    # and is then stemmed like ordinary text - confirm this is intended.
    return [
        ps.stem(word)
        for word in word_tokenize(text)
        # Compare in lower case, as the Task 1 stop-word filter does; the
        # stop-word set is lower case, so "The" would otherwise slip through.
        if word.isalpha() and word.lower() not in stop_words
    ]

# Columns that are passed through clean_text in every chunk
columns_to_clean = [
    "id", "domain", "type", "url",
    "content", "title", "authors", "keywords",
    "meta_keywords", "meta_description", "tags", "summary",
]


def _preprocess_chunk(chunk):
    """Clean the configured columns of ``chunk`` in place and tokenize 'content'.

    Every column named in ``columns_to_clean`` that exists in the chunk is
    replaced by its ``clean_text`` result. The 'content' column, when
    present, is then replaced by the token lists from ``tokenize_and_stem``.
    The same (mutated) DataFrame is returned.
    """
    # NOTE(review): clean_text returns "" for non-string values, so a column
    # parsed as numeric (possibly "id") would be blanked here - confirm.
    for col in columns_to_clean:
        if col in chunk.columns:
            chunk[col] = chunk[col].apply(clean_text)

    # Only 'content' is tokenized, stop-word filtered and stemmed
    if 'content' in chunk.columns:
        chunk['content'] = chunk['content'].astype(str).apply(tokenize_and_stem)

    return chunk


# Processed chunks are collected in a list and concatenated once at the end
preprocessed_data = []

chunk_reader = pd.read_csv(file_path, chunksize=chunksize, low_memory=False)
for chunk_number, chunk in enumerate(chunk_reader):
    preprocessed_data.append(_preprocess_chunk(chunk))
    # Print progress
    print(f"Processed chunk {chunk_number + 1}")

# Combine all processed chunks and write them to a single CSV file
final_df = pd.concat(preprocessed_data, ignore_index=True)
final_df.to_csv("cleaned_file.csv", index=False)





In [None]:
#============ TASK 4 ============

from sklearn.model_selection import train_test_split

# No cell in this notebook assigns `cleaned_file`, so on a fresh kernel the
# split would fail with a NameError. Task 2 writes its result to
# "cleaned_file.csv"; load that file unless a frame with this name is already
# present in the kernel.
if "cleaned_file" not in globals():
    cleaned_file = pd.read_csv("cleaned_file.csv", low_memory=False)

# Split dataset in 80% train and 10% test and 10% validation.
# The whole frame (including the 'type' column) is passed as X, so the saved
# split files still carry their labels.
X_train, X_rest, y_train, y_rest = train_test_split(cleaned_file, cleaned_file['type'] ,test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)


# Check correct split: every row must land in exactly one of the three parts
assert len(X_train) + len(X_val) + len(X_test) == len(cleaned_file)
print("Training data shape:", len(X_train))
print("Validation data shape:", len(X_val))
print("Testing data shape:", len(X_test))


# Save each split to its own CSV file
X_train.to_csv("X_train.csv", index=False)
X_val.to_csv("X_val.csv", index=False)
X_test.to_csv("X_test.csv", index=False)