In [41]:
import re
import sys
import warnings
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt_tab')

# Adjust this path to where your NLTK data is located
nltk.data.path.append('C:\\Users\\Tobias E\\AppData\\Roaming\\nltk_data')

# Suppress all warning messages
if not sys.warnoptions:
    warnings.simplefilter("ignore")



[nltk_data] Downloading package punkt_tab to C:\Users\Tobias
[nltk_data]     E\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [42]:
# 1. Load the data
data_path = "C:\\workspace\\book1.csv"
data_raw = pd.read_csv(data_path, encoding='latin1')

In [43]:
# 2. Shuffle the data
data_raw = data_raw.sample(frac=1)

In [44]:
# 3. Identify category columns
categories = list(data_raw.columns.values)
categories = categories[2:]  # Adjust if your CSV structure changes

In [45]:
# 4. Basic cleaning on the "Heading" column
data_raw['Heading'] = (
    data_raw['Heading']
    .str.lower()
    .str.replace('[^\w\s]', '', regex=True)     # remove punctuation
    .str.replace('\d+', '', regex=True)         # remove digits
    .str.replace('<.*?>', '', regex=True)       # remove HTML tags
)

In [46]:
# 5. Download stopwords and define your stop-word removal function
nltk.download('stopwords')
stop_words = set(stopwords.words('swedish'))

def removeStopWords(sentence):
    return " ".join(
        [word for word in nltk.word_tokenize(sentence) 
         if word not in stop_words]
    )

data_raw['Heading'] = data_raw['Heading'].apply(removeStopWords)

[nltk_data] Downloading package stopwords to C:\Users\Tobias
[nltk_data]     E\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
# 6. (Optional) Stemming
stemmer = SnowballStemmer("swedish")

def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stemSentence += stemmer.stem(word) + " "
    return stemSentence.strip()

# If you want to apply stemming, uncomment:
data_raw['Heading'] = data_raw['Heading'].apply(stemming)

In [48]:
# 7. Split the data
train, test = train_test_split(data_raw, random_state=42, test_size=0.30, shuffle=True)

train_text = train['Heading']
test_text = test['Heading']

In [49]:
# 8. Vectorize using TF-IDF
vectorizer = TfidfVectorizer(strip_accents='unicode', 
                             analyzer='word', 
                             ngram_range=(1,3), 
                             norm='l2')
vectorizer.fit(train_text)

x_train = vectorizer.transform(train_text)
y_train = train.drop(labels=['Id', 'Heading'], axis=1)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels=['Id', 'Heading'], axis=1)

In [50]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [51]:
# Fit the model for each category separately
for category in y_train.columns:
	clf.fit(x_train, y_train[category])
#	print(f"Fitted model for category: {category}")

In [52]:
y_pred = clf.predict(x_test)

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 9. Train the logistic regression model with OneVsRestClassifier
logistic_model = LogisticRegression(max_iter=5000, random_state=42)
one_vs_rest_classifier = OneVsRestClassifier(logistic_model)

# Train the model
one_vs_rest_classifier.fit(x_train, y_train)

# 10. Make predictions
y_pred = one_vs_rest_classifier.predict(x_test)

# 11. Evaluate the model
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=categories))


Accuracy: 11.13%

Classification Report:
                    precision    recall  f1-score   support

           Politik       0.68      0.18      0.28       152
        Utbildning       0.00      0.00      0.00        15
          Religion       0.00      0.00      0.00         5
             Miljo       0.00      0.00      0.00        41
           Ekonomi       0.81      0.21      0.33       165
     LivsstilFritt       0.00      0.00      0.00        89
SamhalleKonflikter       0.80      0.26      0.40       231
             Halsa       1.00      0.03      0.05        78
            Idrott       0.00      0.00      0.00        58
   VetenskapTeknik       0.00      0.00      0.00        34

         micro avg       0.78      0.14      0.24       868
         macro avg       0.33      0.07      0.11       868
      weighted avg       0.58      0.14      0.22       868
       samples avg       0.16      0.15      0.15       868

