In [None]:
# Mount Google Drive inside the Colab runtime so files under it can be read.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd

In [None]:
# Load the dataset CSV from the mounted Drive folder into a DataFrame.
csv_path = '/content/drive/MyDrive/undergrad_thesis/hn.csv'
df = pd.read_csv(csv_path)

In [None]:
# Number of rows with a non-null title per keyword, most frequent keyword first.
# Selecting the column before counting avoids counting every other column too.
df.groupby('keyw')['title'].count().sort_values(ascending=False)

keyw
privacy             2061
open-source         1473
blockchain          1201
competition          348
offline              327
fake news            326
cybersecurity        312
democracy            286
ethical              262
personal data        189
discrimination       173
justice              142
global warming       103
climate crisis        88
inclusive             72
sustainability        61
smart city            47
right to repair       29
digital id            21
filter bubble         19
public space          18
e-id                  14
decentralisation       8
data governance        6
Name: title, dtype: int64

In [None]:
# Keep only the rows tagged with the 'privacy' keyword that actually have text.
is_privacy = df['keyw'].eq('privacy')
has_text = df['text'].notna()
df_priv = df[is_privacy & has_text]

In [None]:
# Plain Python list of the raw document texts, one entry per selected row.
docs = df_priv['text'].to_list()

In [None]:
# Sanity check: how many documents were selected.
doc_count = len(docs)
print(doc_count)

2034


In [None]:
# Sanity check: show the first raw document.
first_doc = docs[0]
print(first_doc)

Anonymous Data Let's pretend we're analysts at a small college, looking at anonymous survey data about plagiarism. We've gotten responses from the entire student body, reporting if they've ever plagiarized or not. To encourage them to respond honestly, names were not collected. The data here has been randomly generated

On the survey students also report several bits of information about themselves, like their age...

...and what state they're from. This additional information is critical to finding potential patterns in the data—why have so many first-years from New Hampshire plagiarized?

Revealed Information But granular information comes with a cost. One student has a unique age/home state combination. By searching another student database for a 19-year old from Vermont we can identify one of the plagiarists from supposedly anonymous survey data.

Increasing granularity exacerbates the problem. If the students reported slightly more about their ages by including what season they we

In [None]:
# Tokenize the documents.
import nltk
nltk.download("stopwords")

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Split on runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')


def _keep_token(token):
    """Return True for tokens worth keeping.

    A token is dropped when it is purely numeric (words that merely contain
    digits are kept), when it is a single character, or when it is an English
    stop word.
    """
    return not token.isnumeric() and len(token) > 1 and token not in stop_words


# Rebuild `docs` from the raw text column instead of overwriting the list
# element by element. The previous loop replaced each string in `docs` with a
# token list, so running the cell a second time failed on `list.lower()`.
# Sourcing from `df_priv['text']` (the same data `docs` was created from)
# makes the cell safe to re-run and does the filtering in a single pass.
docs = [
    [token for token in tokenizer.tokenize(text.lower()) if _keep_token(token)]
    for text in df_priv['text']
]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Sanity check: first ten tokens of the first document after tokenization.
first_doc_tokens = docs[0]
first_doc_tokens[:10]

['anonymous',
 'data',
 'let',
 'pretend',
 'analysts',
 'small',
 'college',
 'looking',
 'anonymous',
 'survey']

In [None]:
# Lemmatize the documents.
import nltk
nltk.download("wordnet")
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatization can shorten a token to a single character (presumably
# 'us' -> 'u'; a 'u' token shows up among the top topic terms when this step
# is not followed by a length check). The one-character filter applied during
# tokenization runs *before* this step, so re-apply it to the lemmas here.
docs = [
    [lemma for lemma in map(lemmatizer.lemmatize, doc) if len(lemma) > 1]
    for doc in docs
]

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Detect frequent bigrams and add them to the documents.
from gensim.models import Phrases

# Train the phrase model on the tokenized documents; min_count=20 restricts it
# to pairs that appear 20 times or more. Only a single Phrases pass is run, so
# this yields bigrams (no trigrams).
bigram = Phrases(docs, min_count=20)

# NOTE: this extends each document in place, so re-running the cell appends
# the same bigrams again.
for doc in docs:
    # Tokens containing '_' are treated as merged bigrams. Collect them first,
    # then append, so the document is not modified while it is being scanned.
    merged = [token for token in bigram[doc] if '_' in token]
    doc.extend(merged)

In [None]:
# Sanity check: the tail of the second document, where the bigrams were appended.
second_doc_tokens = docs[1]
second_doc_tokens[-30:]

['people',
 'safe',
 'online',
 'addressing',
 'hate',
 'crime',
 'community',
 'announcement',
 'mark',
 'significant',
 'step',
 'forward',
 'making',
 'privacy',
 'security',
 'reality',
 'everyone',
 'excited',
 'team',
 'munich',
 'leading',
 'way',
 'last_week',
 'developer_conference',
 'around_world',
 'million_people',
 'every_day',
 'browse_web',
 'chrome_browser',
 'first_time']

In [None]:
# Build the vocabulary, then prune rare and very common tokens.
from gensim.corpora import Dictionary

# Dictionary representation of the tokenized documents.
dictionary = Dictionary(documents=docs)

# Drop tokens that occur in fewer than 20 documents or in more than 50% of
# the documents.
dictionary.filter_extremes(no_above=0.5, no_below=20)

In [None]:
# Convert every tokenized document to its bag-of-words representation using
# the filtered dictionary.
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Report the vocabulary size and the number of documents in the corpus.
vocab_size = len(dictionary)
corpus_size = len(corpus)
print('Number of unique tokens: %d' % vocab_size)
print('Number of documents: %d' % corpus_size)

Number of unique tokens: 4493
Number of documents: 2034


In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42  # Fixed seed: LDA training is stochastic, so without it
                   # topics and coherence scores change on every run.

# Make an index-to-word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [None]:
from pprint import pprint

# Each entry of top_topics is a (topic terms, coherence score) pair.
top_topics = model.top_topics(corpus)

# Average topic coherence: the coherence scores of all topics summed up and
# divided by the number of topics.
coherence_total = sum(coherence for _terms, coherence in top_topics)
avg_topic_coherence = coherence_total / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -1.2834.
[([(0.0054431236, 'even'),
   (0.0052470244, 'social'),
   (0.0051557743, 'facebook'),
   (0.005116725, 'think'),
   (0.0046177516, 'u'),
   (0.004416071, 'medium'),
   (0.00436699, 'technology'),
   (0.004242177, 'year'),
   (0.004178721, 'say'),
   (0.0040002507, 'public'),
   (0.0038898592, 'access'),
   (0.0038826123, 'model'),
   (0.0038804316, 'thing'),
   (0.0038435108, 'way'),
   (0.0037941996, 'know'),
   (0.003731327, 'used'),
   (0.0035217637, 'get'),
   (0.0034731256, 'could'),
   (0.0032635634, 'tech'),
   (0.0031933726, 'right')],
  -0.7213205706867131),
 ([(0.051179577, 'facebook'),
   (0.02449236, 'said'),
   (0.014673753, 'policy'),
   (0.013106008, 'email'),
   (0.010103438, 'gdpr'),
   (0.009434107, 'privacy_policy'),
   (0.0093204025, 'post'),
   (0.0072754435, 'year'),
   (0.0072665387, 'zuckerberg'),
   (0.0068681315, 'cambridge'),
   (0.0068246517, 'firm'),
   (0.006458606, 'last'),
   (0.0056378325, 'update'),
   (0.0055667786, 

In [None]:
!pip install pyLDAvis

import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(model, corpus, dictionary)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.0-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
Collecting funcy
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 KB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: funcy, joblib, pyLDAvis
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.1
    Uninstalling joblib-1.1.1:
      Successfully uninstalled joblib-1.1.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.2.0 requires joblib~=1.1.0, but 

  default_term_info = default_term_info.sort_values(
