# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Importing dataset

In [2]:
# Read the comments export; the file is semicolon-delimited, hence sep=';'.
# NOTE(review): the path sits under /content/drive, which presumably requires
# Google Drive to be mounted in Colab first - no mount call is visible in this
# part of the notebook; confirm it happens before this cell runs.
df_path = "/content/drive/MyDrive/Colab Notebooks/BSc AI Thesis/comments.csv"
df = pd.read_csv(df_path, sep=';')

## Using a smaller version of the dataset for now


In [3]:
# Work on the first 10 rows only while the pipeline is being developed.
# .copy() makes df_new an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning (head() alone returns a
# slice of df).
df_new = df.head(10).copy()

In [4]:
# Show the working subset (rendered as a table by the notebook).
df_new

Unnamed: 0,article_id,comment_id,comment_parent_id,user_id,text,created_at,status,reply_count,respect_count,is_featured
0,6022089,e4b16b91be890d02645d916e0510fc3aeaf0ff4611fcf1...,,4a4185520266364333cc860c3600137451df43fed472d2...,200 mln Ton koolstof verbrand! Daar kan de hel...,2020-01-06 20:26:17.955000,REJECTED,0,0,False
1,6022089,2186fd4e53e9f28a7869bcacd9704ccc2e9a23ed5ff7bd...,,64198bfd09320a5d2cec1bb0cec4c45e2f3e41df8546bb...,Als we Australië teruggeven zoals midnight oil...,2020-01-06 20:29:29.129000,ACCEPTED,0,4,False
2,6022089,47aff4e79476df437522230620f51c914ba92655e8e15c...,,3c294720e416b3ffbd4d2c773d55a8f349fdff1983b199...,Nu maar hopen dat dit soort rampen klimaatscep...,2020-01-06 20:31:13.092000,REJECTED,0,0,False
3,6022089,61b426c5d10c3bc59a58e524a9fcbe3c99179d9927b683...,,3a3d43abb7428e442b3dcb2a8762b917f1f88a3fc0fe70...,Sterkte voor Australië en ik ben ontzettend bl...,2020-01-06 20:35:37.316000,ACCEPTED,0,3,False
4,6022089,b437305d79667335c923229f08a7554fab8da67d8f87d3...,,2b0605d35ebfc132355ee8055b838cded67e4507292ab5...,Goede uitleg. In het Engels vertalen en aan de...,2020-01-06 20:39:02.719000,REJECTED,0,0,False
5,6022089,2363be9fd56e9f52fbd683fbce014534d68628b38f095c...,,4f8f2451461ffff99458329404571e738645edcf576f18...,"Het is vrij eenvoudig, volgens de politie en b...",2020-01-06 20:40:04.248000,ACCEPTED,6,25,False
6,6022089,e8161dca66bb39df4281ca94cd1b10f60878e54696e484...,,59b8093d7edc83c8f9f930864bc42cb5d3569751647ac4...,Ik vraag me af hoe veel signalen sommige polit...,2020-01-06 20:41:11.588000,REJECTED,0,0,False
7,6022089,bd9c88fdff8ebb297eead1eb99d631e5f325e6a7c69bb5...,,4c4ddaa15423e3e6aa99b5c28d40fbf143b928512c6956...,Dit artikel geeft alleen aan waarom de branden...,2020-01-06 20:43:28.236000,ACCEPTED,3,7,False
8,6022089,6f767d597bea4668a41a05a0452365eb0aa470491d7793...,,146970a0357d7d8bd036bdb12ca288acfe8d4ff5bd0d0d...,Mijn liefde voor dieren hart huilt als ik beel...,2020-01-06 20:43:36.391000,REJECTED,0,0,False
9,6022089,8e5be0cbd00915b5b21489b5a9ad20cedf6c6d3f6ed4c4...,,d9928331997d0b4f30d9e460a0682b1b5ceecd77f41b16...,Abel Tasman sprak in zijn reisverslagen al ove...,2020-01-06 20:46:21.174000,ACCEPTED,11,42,False


# Pre-processing

In [5]:
# getting Dutch stop words (stored as a set for fast membership tests)
stop_words = set(stopwords.words('dutch'))

# NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it
# presumably leaves most Dutch tokens unchanged - the cleaned output further
# down still contains plural forms such as "rampen" and "branden". Confirm
# whether a Dutch-aware lemmatizer or stemmer should be used instead.
lemmatizer = WordNetLemmatizer()

## Cleaning the comments

In [6]:
def clean_text(text):
    """Normalise one raw comment into a space-separated string of tokens.

    Steps: lowercase, strip digits and punctuation, tokenise with NLTK,
    drop tokens found in the module-level ``stop_words`` set and pass the
    remaining tokens through the module-level ``lemmatizer``.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for missing cells) is
            treated as an empty comment.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when ``text`` is
        not a string.
    """
    # Missing values would otherwise crash on .lower(); treat them as empty
    # so the function can be applied to a column containing gaps.
    if not isinstance(text, str):
        return ''

    # converting comments to lowercase
    text = text.lower()

    # removing numbers and special characters
    # (\w also matches "_", so underscores survive the second substitution)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    # tokenisation
    tokens = nltk.word_tokenize(text)

    # lemmatising the tokens and removing the stop words
    # (the stop-word check runs on the raw token, before lemmatisation)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Join tokens into a single string. The result is returned directly rather
    # than stored in a local named ``clean_text``, which would shadow this
    # function inside its own body.
    return ' '.join(tokens)

# new column with cleaned comments: apply clean_text to every entry of 'text'
df_new['clean_text'] = df_new['text'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['clean_text'] = df_new['text'].apply(clean_text)


In [7]:
# Inspect the cleaned comments column.
df_new['clean_text']

0    mln ton koolstof verbrand hele wereld komende ...
1    we australië teruggeven zoals midnight oil ooi...
2    hopen soort rampen klimaatsceptici onze eigen ...
3    sterkte australië ontzettend blij regering mil...
4    goede uitleg engels vertalen australische prem...
5    vrij eenvoudig volgens politie brandweer zon a...
6    vraag af signalen sommige politici nodig snapp...
7    artikel geeft alleen waarom branden lang blijv...
8    liefde dieren hart huilt beelden zie ellende o...
9    abel tasman sprak reisverslagen rookpluimen we...
Name: clean_text, dtype: object

# Feature extraction

## TF-IDF BoW representation

In [8]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned comments and
# transform them in the same step; the result is a sparse matrix, which is
# why later cells call .toarray() / Sparse2Corpus on it.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_new['clean_text'])

In [9]:
# Densify the TF-IDF matrix for a quick look; the printout is abbreviated with "...".
print(tfidf_matrix.toarray())

[[0.         0.         0.19126392 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.25167057 0.25167057 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.110266  ]]


## Topic modelling for extractive summary

In [10]:
# Build the term-id -> term lookup passed to LdaModel by inverting the
# vectorizer's term -> column-index vocabulary.
word2id = vectorizer.vocabulary_
id2word = dict((idx, term) for term, idx in word2id.items())

# Wrap the sparse TF-IDF matrix as a gensim corpus; documents_columns=False
# because the documents are stored along the rows of the matrix.
corpus = Sparse2Corpus(tfidf_matrix, documents_columns=False)

# Train the LDA topic model; random_state pins the seed.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    passes=10,
    iterations=100,
    chunksize=1000,
    alpha='auto',
    eta='auto',
    random_state=42,
)

# Abstractive summarization

In [11]:
# Print each topic as a (topic_id, weighted-terms string) pair, one per line.
for topic in lda_model.print_topics():
    print(topic)

(0, '0.004*"we" + 0.004*"onze" + 0.004*"hopen" + 0.004*"brengen" + 0.004*"branden" + 0.004*"natuur" + 0.004*"tijd" + 0.004*"australië" + 0.004*"soort" + 0.004*"klimaatverandering"')
(1, '0.012*"vuurwerk" + 0.012*"ondertussen" + 0.012*"allen" + 0.012*"dieren" + 0.012*"ellende" + 0.012*"liefde" + 0.012*"zie" + 0.012*"zaniken" + 0.012*"beelden" + 0.012*"hart"')
(2, '0.015*"natuur" + 0.009*"herstellen" + 0.009*"miljard" + 0.009*"infrastructuur" + 0.009*"ontzettend" + 0.009*"gelegd" + 0.009*"niks" + 0.009*"hersteld" + 0.009*"opzij" + 0.009*"regering"')
(3, '0.009*"hand" + 0.009*"broeikaseffect" + 0.009*"kost" + 0.009*"sommige" + 0.009*"problemen" + 0.009*"politici" + 0.008*"jaren" + 0.008*"klimaatverandering" + 0.006*"vraag" + 0.006*"aanpak"')
(4, '0.019*"branden" + 0.017*"oorzaak" + 0.008*"nergens" + 0.008*"vreemde" + 0.008*"alleen" + 0.008*"vuur" + 0.008*"veroorzaker" + 0.008*"sigaret" + 0.008*"geblust" + 0.008*"uitgemaakt"')
(5, '0.009*"klimaatverandering" + 0.009*"australische" + 0.009*