# Importing libraries


In [2]:
# !python3 -m pip install pattern

In [5]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

import pandas as pd
from nltk.cluster import KMeansClusterer
from pattern.nl import sentiment
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Importing dataset

In [6]:
# Make Google Drive available to this Colab runtime at the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Read the comments CSV from the Drive folder into the master DataFrame.
COMMENTS_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/BSc AI Thesis/comments_new.csv'
df_master = pd.read_csv(COMMENTS_CSV_PATH)

  df_master = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/BSc AI Thesis/comments_new.csv')


In [17]:
# Restrict the analysis to the first SAMPLE_SIZE rows of the master frame.
SAMPLE_SIZE = 2000
df = df_master.iloc[:SAMPLE_SIZE]

# Data pre-processing

In [19]:
# Normalise every comment for the bag-of-words models: lower-case it, split it
# with WordPunctTokenizer, keep purely alphabetic tokens and drop Dutch stop words.
tokenizer = WordPunctTokenizer()

# The stop-word set gets its own name. Rebinding `stopwords` would replace the
# imported `nltk.corpus.stopwords` object with a set, so running this cell a
# second time would fail on `stopwords.words(...)`.
dutch_stopwords = set(stopwords.words('dutch'))


def preprocess_comment(comment):
    """Return `comment` as a space-joined string of cleaned tokens.

    The text is lower-cased and tokenised; only alphabetic tokens that are not
    Dutch stop words are kept. Values that are not strings (for example missing
    text read as NaN) yield an empty string instead of raising, so the result
    list stays aligned one-to-one with the rows of `df`.
    """
    if not isinstance(comment, str):
        return ''
    tokens = tokenizer.tokenize(comment.lower())
    return ' '.join(
        token for token in tokens
        if token.isalpha() and token not in dutch_stopwords
    )


# One cleaned string per row of df['text'], in row order.
preprocessed_comments = [preprocess_comment(comment) for comment in df['text']]

# Objective vs. subjective classification

In [20]:
# Learn the TF-IDF vocabulary and weights from the cleaned comments, then
# encode those same comments as one TF-IDF row per comment.
vectorizer = TfidfVectorizer()
vectorizer.fit(preprocessed_comments)
comment_vectors = vectorizer.transform(preprocessed_comments)

# Add sentiment analysis check to improve clustering

In [22]:
def _subjectivity_label(comment):
    """Return 1 (subjective) if the comment's subjectivity score exceeds 0.5, else 0 (objective)."""
    _polarity, subjectivity = sentiment(comment)
    return 1 if subjectivity > 0.5 else 0


# One 0/1 label per raw (un-preprocessed) comment, in row order.
cluster_labels = [_subjectivity_label(comment) for comment in df['text']]

# Thread structure analysis

In [23]:
# Group comments into threads keyed by the id of the thread's root comment.
# A comment starts a new thread when it has no parent id, or when its parent id
# is not (yet) a key of `threads`; otherwise it is appended to that parent's replies.
threads = {}  # comment_id -> {'comment', 'replies', 'type', 'subjective_count', 'objective_count'}

# enumerate() supplies the row's position. `cluster_labels` and
# `preprocessed_comments` are plain lists built in row order, so they have to be
# indexed by position; the DataFrame index label only equals the position when
# the frame carries a default 0..n-1 index.
for position, (_, row) in enumerate(df.iterrows()):
    comment_id = row['comment_id']
    parent_id = row['comment_parent_id']
    comment_type = cluster_labels[position]  # 1 = subjective, 0 = objective
    comment_text = preprocessed_comments[position]

    # Thread records are dicts, so None here means "no known parent thread".
    parent_thread = None if pd.isnull(parent_id) else threads.get(parent_id)

    if parent_thread is None:
        threads[comment_id] = {
            'comment': comment_text,
            'replies': [],
            'type': comment_type,
            'subjective_count': 0,
            'objective_count': 0,
        }
    else:
        parent_thread['replies'].append({
            'comment': comment_text,
            'comment_id': comment_id,
            'type': comment_type,
        })

# Calculate the counts of subjective and objective replies for each thread
for thread in threads.values():
    reply_types = [reply['type'] for reply in thread['replies']]
    thread['subjective_count'] = reply_types.count(1)
    thread['objective_count'] = reply_types.count(0)

# Topic Modeling

In [24]:
# Fit a gensim LDA topic model on bag-of-words counts of the cleaned comments.
num_topics = 5

# Split each cleaned comment on whitespace once and reuse the token lists.
tokenized_comments = [comment.split() for comment in preprocessed_comments]

dictionary = gensim.corpora.Dictionary(tokenized_comments)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_comments]

# random_state is fixed so repeated runs use the same seed.
lda = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=0,
)



# Evaluating topic modeling

In [25]:
# Score the fitted LDA model with the 'u_mass' coherence measure and print it.
coherence_texts = [comment.split() for comment in preprocessed_comments]

coherence_model = CoherenceModel(
    model=lda,
    texts=coherence_texts,
    dictionary=dictionary,
    coherence='u_mass',
)
umass_coherence_score = coherence_model.get_coherence()

print(f"UMass Coherence Score: {umass_coherence_score}")

UMass Coherence Score: -2.311116580870453


# Semi-supervised learning

In [26]:
# Hold out 20% of the comments, then fit a kNN-kernel LabelPropagation model
# on the remaining 80%.
# NOTE(review): the labels come from `cluster_labels`, which only ever holds 0
# or 1, so no training sample is marked unlabeled (-1). As written the model is
# trained fully supervised — confirm whether some labels should be masked.
dense_vectors = comment_vectors.toarray()
X_train, X_test, y_train, y_test = train_test_split(
    dense_vectors,
    cluster_labels,
    test_size=0.2,
    random_state=42,
)

label_prop_model = LabelPropagation(kernel='knn', n_neighbors=10, max_iter=1000).fit(X_train, y_train)

# Evaluating semi-supervised learning

In [27]:
# Predict labels for the held-out comments and compare them with the
# sentiment-derived labels using four standard classification metrics.
y_pred = label_prop_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Evaluation Metrics:")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{metric_name}: {metric_value}")

Evaluation Metrics:
Accuracy: 0.48
Precision: 0.7358490566037735
Recall: 0.16738197424892703
F1-score: 0.27272727272727276


# Summary

In [None]:
# Assemble a plain-text report: label totals, the reply breakdown of every
# thread, a sentiment value per comment and the top words of each LDA topic.


def _type_name(label):
    """Map a 0/1 comment label to the name printed in the report."""
    return 'Subjective' if label == 1 else 'Objective'


num_subjective = sum(cluster_labels)
num_objective = len(cluster_labels) - num_subjective

# Collect the pieces in a list and join once instead of growing a string with +=.
parts = [
    "Discussion Summary:\n\n",
    f"Number of Subjective Opinions: {num_subjective}\n",
    f"Number of Objective Opinions: {num_objective}\n\n",
    "Thread Analysis:\n",
]

for thread in threads.values():
    parts.append(f"Parent Comment: {thread['comment']} (Type: {_type_name(thread['type'])})\n")
    parts.append(f"\tSubjective Replies: {thread['subjective_count']}\n")
    parts.append(f"\tObjective Replies: {thread['objective_count']}\n")
    parts.extend(
        f"\tReply Comment: {reply['comment']} (Type: {_type_name(reply['type'])})\n"
        for reply in thread['replies']
    )

# No cell in this notebook defines `sentiments`, so it is computed here to keep
# the cell runnable on a fresh kernel.
# NOTE(review): the polarity score from pattern.nl's sentiment() is used as the
# per-comment sentiment value — confirm this is what the report should show.
sentiments = [polarity for polarity, _subjectivity in map(sentiment, df['text'])]

parts.append("\nSentiment Analysis:\n")
# The loop variable is deliberately not called `sentiment`: that name is the
# imported pattern.nl function and must stay callable for other cells.
parts.extend(
    f"Comment {number} - Sentiment: {polarity}\n"
    for number, polarity in enumerate(sentiments, start=1)
)

parts.append("\nTopic Modeling:\n")
for topic_id in range(lda.num_topics):
    # Resolve word ids through the LDA model's own gensim dictionary. The TF-IDF
    # vectorizer has a separate vocabulary with different column ids, so it
    # cannot be used to look up these topic terms.
    top_words = [word for word, _weight in lda.show_topic(topic_id, topn=5)]
    parts.append(f"Topic {topic_id + 1} - Top Words: {', '.join(top_words)}\n")

summary = ''.join(parts)

print(summary)

Discussion Summary:

Number of Subjective Opinions: 158
Number of Objective Opinions: 842

Thread Analysis:
Parent Comment: mln ton koolstof verbrand hele wereld komende jaar alle auto s volgas snelwegen mee rijden vechten bierkaai komt rutte eraan km per uur aarde klein landje stikstof probleem gaat oplossen (Type: Objective)
	Subjective Replies: 0
	Objective Replies: 0
Parent Comment: we australië teruggeven zoals midnight oil ooit zong weet zeker gaat afkoelen waar zouden aussies ver willen gaan (Type: Objective)
	Subjective Replies: 0
	Objective Replies: 0
Parent Comment: hopen soort rampen klimaatsceptici onze eigen thierry onze vriend donald inkeer brengen hopen t begrijpt bedoel (Type: Objective)
	Subjective Replies: 0
	Objective Replies: 0
Parent Comment: sterkte australië ontzettend blij regering miljard opzij gelegd infrastructuur natuur herstellen tijd duren voordat hersteld niks allang optie (Type: Subjective)
	Subjective Replies: 0
	Objective Replies: 0
Parent Comment: goe