In [55]:
import re
import sys
import warnings
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt_tab')

# Adjust this path to where your NLTK data is located
nltk.data.path.append('C:\\Users\\Tobias E\\AppData\\Roaming\\nltk_data')

# Suppress all warning messages
if not sys.warnoptions:
    warnings.simplefilter("ignore")



[nltk_data] Downloading package punkt_tab to C:\Users\Tobias
[nltk_data]     E\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [56]:
# 1. Load the data
data_path = "C:\\workspace\\book1.csv"
data_raw = pd.read_csv(data_path, encoding='latin1')

In [57]:
# 2. Shuffle the data
data_raw = data_raw.sample(frac=1)

In [58]:
# 3. Identify category columns
categories = list(data_raw.columns.values)
categories = categories[2:]  # Adjust if your CSV structure changes

In [59]:
# 4. Basic cleaning on the "Heading" column
data_raw['Heading'] = (
    data_raw['Heading']
    .str.lower()
    .str.replace('[^\w\s]', '', regex=True)     # remove punctuation
    .str.replace('\d+', '', regex=True)         # remove digits
    .str.replace('<.*?>', '', regex=True)       # remove HTML tags
)

In [60]:
# 5. Download stopwords and define your stop-word removal function
nltk.download('stopwords')
stop_words = set(stopwords.words('swedish'))

def removeStopWords(sentence):
    return " ".join(
        [word for word in nltk.word_tokenize(sentence) 
         if word not in stop_words]
    )

data_raw['Heading'] = data_raw['Heading'].apply(removeStopWords)

[nltk_data] Downloading package stopwords to C:\Users\Tobias
[nltk_data]     E\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [61]:
# 6. (Optional) Stemming
stemmer = SnowballStemmer("swedish")

def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stemSentence += stemmer.stem(word) + " "
    return stemSentence.strip()

# If you want to apply stemming, uncomment:
# data_raw['Heading'] = data_raw['Heading'].apply(stemming)

In [62]:
# 7. Split the data
train, test = train_test_split(data_raw, random_state=42, test_size=0.30, shuffle=True)

train_text = train['Heading']
test_text = test['Heading']

In [63]:
# 8. Vectorize using TF-IDF
vectorizer = TfidfVectorizer(strip_accents='unicode', 
                             analyzer='word', 
                             ngram_range=(1,3), 
                             norm='l2')
vectorizer.fit(train_text)

x_train = vectorizer.transform(train_text)
y_train = train.drop(labels=['Id', 'Heading'], axis=1)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels=['Id', 'Heading'], axis=1)

In [64]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [65]:
# Fit the model for each category separately
for category in y_train.columns:
	clf.fit(x_train, y_train[category])
#	print(f"Fitted model for category: {category}")

In [66]:
y_pred = clf.predict(x_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 9. Train and evaluate Random Forest with OneVsRestClassifier
rf_model = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
rf_model.fit(x_train, y_train)

# Predict on the test data
rf_predictions = rf_model.predict(x_test)

# Evaluate the model
print("Random Forest Model Evaluation")
print(classification_report(y_test, rf_predictions, target_names=categories))
print(f"Accuracy: {accuracy_score(y_test, rf_predictions):.2f}")


Random Forest Model Evaluation
                    precision    recall  f1-score   support

           Politik       0.77      0.21      0.33       155
        Utbildning       1.00      0.04      0.07        26
          Religion       0.00      0.00      0.00         2
             Miljo       0.86      0.14      0.24        42
           Ekonomi       0.62      0.28      0.39       178
     LivsstilFritt       0.86      0.14      0.24        87
SamhalleKonflikter       0.77      0.31      0.44       235
             Halsa       0.90      0.11      0.20        79
            Idrott       1.00      0.04      0.08        51
   VetenskapTeknik       0.80      0.11      0.19        38

         micro avg       0.74      0.21      0.33       893
         macro avg       0.76      0.14      0.22       893
      weighted avg       0.78      0.21      0.32       893
       samples avg       0.24      0.23      0.23       893

Accuracy: 0.17
