# Baseline - model applying clustering to complete web64 dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import time
#libraries for preprocessing
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.util import ngrams

import string

#libraries for clustering
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram

from sklearn.decomposition import PCA

In [7]:
df = pd.read_csv('finalnouns.csv', index_col=0)

In [8]:
df.head()

Unnamed: 0_level_0,url,og_url,domain,title,description,content,entities,authors,paywall,fb_total,...,is_homepage,is_document,http_code,feeds,cat_id,cat_score,outbound_link_ids,inbound_link_ids,tagss,Nounss
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
KQe1r321ZaJY,https://www.abcnyheter.no/nyheter/norge/2021/1...,https://www.abcnyheter.no/a/195798411/,www.abcnyheter.no,Færre vil møte klimakrisen med tiltak som i ko...,Andelen som vil ha like kraftige tiltak mot kl...,"Samtidig som klimatoppmøtet i Glasgow starter,...","['Glasgow', 'Fokus', 'Norge', 'Klimatoppmøtet'...",,,0,...,,False,200,,154.0,0.963856,,,[],"['Færre', 'klimakrisen', 'tiltak', 'koronapand..."
jnegLMrLkawZ,https://www.ao.no/oslo-fergenes-siste-dag-pa-f...,https://www.ao.no/5-128-204411,www.ao.no,(+) Oslo-Fergenes siste dag på fjorden: – Det ...,Siden 1917 har Oslo-Fergene fraktet turister o...,"Øyene, Ferger | Oslo-Fergenes siste dag på fjo...",,['Sevda Barazesh'],1.0,1,...,,False,200,,,,,['LDdwm7PORa1Y'],[],"['Oslo-Fergenes', 'dag', 'fjorden']"
Jrb2vD34MbWL,https://www.abcnyheter.no/nyheter/verden/2021/...,https://www.abcnyheter.no/a/195798410/,www.abcnyheter.no,25 drept i raid mot bankrøvere i Brasil,25 antatte medlemmer av en bande bankrøvere bl...,50 politifolk deltok i operasjonen mot to gård...,"['bandemedlemmene', 'Rodolfo Morotti Fernandes...",,,0,...,,False,200,,65.0,1.000018,,,"[('Brasil', False)]","['raid', 'bankrøvere', 'Brasil']"
xkazKYRPqbJ0,https://norgeogverdensnytt.blogg.no/sandra-lyn...,,norgeogverdensnytt.blogg.no,Sandra Lyng (34) har fått korona,AV/ NorgeOgVerdensNytt Foto: Mauricio E...,AV/ NorgeOgVerdensNytt Foto: Mauricio Evensen ...,"['Mauricio Evensen', 'Sandra Lyng', 'Lyngs', '...",['norgeogverdensnytt'],,0,...,,False,200,,127.0,0.714142,,,"[('Sandra Lyng', True)]","['Sandra', 'Lyng']"
Vyb86KpPrevA,https://www.vg.no/nyheter/utenriks/i/V959MJ/ab...,https://www.vg.no/i/V959MJ,www.vg.no,Aborttilgangen tørker inn – nå skal Texas-love...,MEMPHIS (VG) I sørstatene skyves nå kvinner fr...,FORBEREDELSER: Joy Evans er såkalt «patient ed...,"['Texas', 'Roe', 'Wade', 'Memphis', 'Webb', 'H...",,,0,...,,False,200,,168.0,0.683313,['nXe0VD5vNbxr'],['lNbW7oZOEbyg'],"[('Texas-loven', False), ('Høyesterett', False)]","['Aborttilgangen', 'Texas-loven', 'Høyesterett']"


In [15]:
print(df.shape)# convert published date to datetime
date_format = '%Y-%m-%dT%H:%M:%S'

# make features for hour and day
df['pub_day'] = pd.to_datetime(df.pub_date, format=date_format).dt.date
df['pub_hour'] = pd.to_datetime(df.pub_date, format=date_format).dt.hour



(87638, 34)


# Preprocessing

In [16]:
#initialize tokenizer and stemmer
tokenizer = RegexpTokenizer('[^\W\d_]+')
stemmer = SnowballStemmer("norwegian", ignore_stopwords=True)

def clean(doc):   
    doc_ = doc

    # make all lower case
    doc_ = doc_.lower()
    
    # tokenize text. regex includes alphabetic characters, including letters with accents. 
    # excludes punctuation and numbers.
    words = tokenizer.tokenize(doc_)
    
    
    # do not stem stopwords
    stems = list(map(stemmer.stem, words))

    doc_ = " ".join(stems)
    
    # count number of words 
    n_words = len(words)

    return doc_#, title_words, n_words

In [21]:
#clean Nounss
df['Nounss_clean'] = df.Nounss.apply(clean)

# read stopwords from file
sw = [line.strip("\n") for line in open("/Users/nimrarana/Downloads/norwegian_stopwords.txt", "r")]

# initialize vectorizer to count words, ignoring norwegian stopwords
vectorizer = CountVectorizer(stop_words=sw)

# initialize TF-IDF counter
tfidf = TfidfTransformer()


# Clustering of full dataset

In [20]:
# cluster per day
dates = df.pub_day.unique()

cluster_id_list = []
for dt in dates:
    print(dt)
    df_sub = df[df.pub_day == dt]
    print('%d articles included in clustering' % df_sub.shape[0])

   

2021-10-31
4612 articles included in clustering
2021-11-01
6782 articles included in clustering
2021-11-02
7064 articles included in clustering
2021-11-03
7012 articles included in clustering
2021-11-04
7308 articles included in clustering
2021-11-05
6896 articles included in clustering
2021-11-06
4479 articles included in clustering
2021-11-07
4677 articles included in clustering
2021-11-08
6869 articles included in clustering
2021-11-09
6628 articles included in clustering
2021-11-10
6999 articles included in clustering
2021-11-11
6969 articles included in clustering
2021-11-12
6870 articles included in clustering
2021-11-13
4473 articles included in clustering


In [22]:
# cluster per day
dates = df.pub_day.unique()

cluster_id_list = []
for dt in dates:
    print(dt)
    df_sub = df[df.pub_day == dt]
    print('%d articles included in clustering' % df_sub.shape[0])

    # fit vectorizer to corpus of titles
    vectorizer.fit(df_sub.Nounss_clean.values)

    # convert data to word counts
    word_count_matrix = vectorizer.transform(df_sub.Nounss_clean.values)
    n_articles = word_count_matrix.shape[0]
    print('Word count vectorization has %d entries and %d words' % word_count_matrix.shape)

    # fit and transform to titles corpus
    word_term_frequency_matrix = tfidf.fit_transform(word_count_matrix)
    
    # create dataframe from TF-IDF values
    tf = pd.DataFrame(word_term_frequency_matrix.toarray())
    
    # run PCA
    all_pca = PCA(n_components=int(n_articles*0.9), svd_solver='randomized')

    bt = time.time()
    all_pcs = all_pca.fit_transform(tf) #tf.iloc[:5000]
    et = time.time()
    print("time spent: %.2f s" % (et-bt))
    
    pc_95 = np.where(np.cumsum(all_pca.explained_variance_ratio_)>0.95)[0][0]

    ac = AgglomerativeClustering(n_clusters=pc_95, linkage='complete')
    clusters = ac.fit_predict(tf)
    
    # create cluster id from day of year and cluster id
    df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)

    cluster_id_list.append(df_sub.pub_day.apply(lambda x: x.strftime('%j')) + '_' + df_sub.cluster_no.astype(str))

2021-10-31
4612 articles included in clustering
Word count vectorization has 4612 entries and 5258 words
time spent: 73.73 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-01
6782 articles included in clustering
Word count vectorization has 6782 entries and 7101 words
time spent: 212.48 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-02
7064 articles included in clustering
Word count vectorization has 7064 entries and 6867 words
time spent: 247.47 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-03
7012 articles included in clustering
Word count vectorization has 7012 entries and 7043 words
time spent: 211.25 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-04
7308 articles included in clustering
Word count vectorization has 7308 entries and 7270 words
time spent: 236.17 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-05
6896 articles included in clustering
Word count vectorization has 6896 entries and 6818 words
time spent: 358.73 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-06
4479 articles included in clustering
Word count vectorization has 4479 entries and 4986 words
time spent: 57.20 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-07
4677 articles included in clustering
Word count vectorization has 4677 entries and 5017 words
time spent: 62.38 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-08
6869 articles included in clustering
Word count vectorization has 6869 entries and 6850 words
time spent: 1319.35 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-09
6628 articles included in clustering
Word count vectorization has 6628 entries and 6734 words
time spent: 226.86 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-10
6999 articles included in clustering
Word count vectorization has 6999 entries and 6982 words
time spent: 247.93 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-11
6969 articles included in clustering
Word count vectorization has 6969 entries and 7133 words
time spent: 291.72 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-12
6870 articles included in clustering
Word count vectorization has 6870 entries and 6971 words
time spent: 228.12 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


2021-11-13
4473 articles included in clustering
Word count vectorization has 4473 entries and 4908 words
time spent: 86.99 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster_no'] = pd.Series(clusters,index=df_sub.index)


In [23]:
np.cumsum(all_pca.explained_variance_ratio_)

array([0.0099121 , 0.01933457, 0.02725868, ..., 1.        , 1.        ,
       1.        ])

In [29]:
df['cluster_id'] = pd.concat(cluster_id_list)

In [30]:
#save data
#df.to_csv('cop26_clust_noun.csv')

## new dataset with a new feature with the assigned cluster id

In [2]:
#load dataset
df = pd.read_csv('cop26_clust_noun.csv', index_col=0)
df.head()

Unnamed: 0_level_0,url,og_url,domain,title,description,content,entities,authors,paywall,fb_total,...,cat_score,outbound_link_ids,inbound_link_ids,tagss,Nounss,title_clean,pub_day,pub_hour,Nounss_clean,cluster_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
KQe1r321ZaJY,https://www.abcnyheter.no/nyheter/norge/2021/1...,https://www.abcnyheter.no/a/195798411/,www.abcnyheter.no,Færre vil møte klimakrisen med tiltak som i ko...,Andelen som vil ha like kraftige tiltak mot kl...,"Samtidig som klimatoppmøtet i Glasgow starter,...","['Glasgow', 'Fokus', 'Norge', 'Klimatoppmøtet'...",,,0,...,0.963856,,,[],"['Færre', 'klimakrisen', 'tiltak', 'koronapand...",færr klimakris tiltak koronapandemi,2021-10-31,23,færr klimakris tiltak koronapandemi,304_1940
jnegLMrLkawZ,https://www.ao.no/oslo-fergenes-siste-dag-pa-f...,https://www.ao.no/5-128-204411,www.ao.no,(+) Oslo-Fergenes siste dag på fjorden: – Det ...,Siden 1917 har Oslo-Fergene fraktet turister o...,"Øyene, Ferger | Oslo-Fergenes siste dag på fjo...",,['Sevda Barazesh'],1.0,1,...,,,['LDdwm7PORa1Y'],[],"['Oslo-Fergenes', 'dag', 'fjorden']",oslo ferg dag fjord,2021-10-31,23,oslo ferg dag fjord,304_1097
Jrb2vD34MbWL,https://www.abcnyheter.no/nyheter/verden/2021/...,https://www.abcnyheter.no/a/195798410/,www.abcnyheter.no,25 drept i raid mot bankrøvere i Brasil,25 antatte medlemmer av en bande bankrøvere bl...,50 politifolk deltok i operasjonen mot to gård...,"['bandemedlemmene', 'Rodolfo Morotti Fernandes...",,,0,...,1.000018,,,"[('Brasil', False)]","['raid', 'bankrøvere', 'Brasil']",raid bankrøver brasil,2021-10-31,23,raid bankrøver brasil,304_1870
xkazKYRPqbJ0,https://norgeogverdensnytt.blogg.no/sandra-lyn...,,norgeogverdensnytt.blogg.no,Sandra Lyng (34) har fått korona,AV/ NorgeOgVerdensNytt Foto: Mauricio E...,AV/ NorgeOgVerdensNytt Foto: Mauricio Evensen ...,"['Mauricio Evensen', 'Sandra Lyng', 'Lyngs', '...",['norgeogverdensnytt'],,0,...,0.714142,,,"[('Sandra Lyng', True)]","['Sandra', 'Lyng']",sandr lyng,2021-10-31,23,sandr lyng,304_2001
Vyb86KpPrevA,https://www.vg.no/nyheter/utenriks/i/V959MJ/ab...,https://www.vg.no/i/V959MJ,www.vg.no,Aborttilgangen tørker inn – nå skal Texas-love...,MEMPHIS (VG) I sørstatene skyves nå kvinner fr...,FORBEREDELSER: Joy Evans er såkalt «patient ed...,"['Texas', 'Roe', 'Wade', 'Memphis', 'Webb', 'H...",,,0,...,0.683313,['nXe0VD5vNbxr'],['lNbW7oZOEbyg'],"[('Texas-loven', False), ('Høyesterett', False)]","['Aborttilgangen', 'Texas-loven', 'Høyesterett']",aborttilgang tex lov høyesterett,2021-10-31,23,aborttilgang tex lov høyesterett,304_1814


In [52]:
clust_size = df.groupby('cluster_id').Nounss_clean.count()
print('%d single-article stories' % np.sum(clust_size>1))

16412 single-article stories


In [53]:
print('%d datapoint grouped into %d clusters' % (df.shape[0], df.cluster_id.nunique()))

87638 datapoint grouped into 39942 clusters


## Testing how well clusters are working

In [40]:
clust_size[clust_size>1].sort_values(ascending=False).tail()

cluster_id
312_1166    2
312_1164    2
312_1160    2
312_116     2
310_925     2
Name: title_clean, dtype: int64

In [54]:
#Observing Cluster number 304_106
for Nounss in df[df.cluster_id=='304_106'].Nounss:
    print(Nounss)


['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Viking', 'Sarpsborg']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['Sarpsborg', 'formlaget', 'Viking', 'kamp']
['nedturen', 'Viking', 'Sarpsborg']
['Live', 'Viking', 'Sarpsborg']
['Viking-Sarpsborg', 'chat', 'Ingve', 'Bøe']
['Sarpsborg']
['Viking', 'Sarpsborg']


In [55]:
#Extracting domains of the articles in cluster 304_106
df[df.cluster_id=='304_106'].domain

id
MYerRVjGweOB      www.altaposten.no
l9avoLn6rbG1         www.adressa.no
WZdPZWnylaKg         www.itromso.no
ELe32Vzgrd69    www.framtidinord.no
Jrb2vDyPJbWL          www.rbnett.no
y1aKZ9gqzeQG              www.ht.no
4QbYvqzw0dzq             www.smp.no
KQe1r30O3aJY     www.folkebladet.no
Vyb86KGNmevA             www.nrk.no
y5eVPnwz1bEP      www.nettavisen.no
pmbkZXYN5azJ    www.framtidinord.no
4QbYvqzJpdzq         www.itromso.no
olejq6XwWdjN              www.ht.no
N1aMQWk65aWm      www.tipsbladet.no
Vyb86KGxLevA     www.aftenbladet.no
N1aMQWLnPaWm      www.dagsavisen.no
y5eVPnX2MbEP     www.aftenbladet.no
LDdwm7ovRa1Y              www.sa.no
WZdPZWBllaKg              www.sa.no
Name: domain, dtype: object

In [56]:
#Extracting titles of the articles in cluster 304_106
df[df.cluster_id=='304_106'].title

id
MYerRVjGweOB    Sarpsborg stoppet formlaget Viking – vant sin ...
l9avoLn6rbG1    Sarpsborg stoppet formlaget Viking – vant sin ...
WZdPZWnylaKg    Sarpsborg stoppet formlaget Viking – vant sin ...
ELe32Vzgrd69    Sarpsborg stoppet formlaget Viking – vant sin ...
Jrb2vDyPJbWL    Sarpsborg stoppet formlaget Viking – vant sin ...
y1aKZ9gqzeQG    Sarpsborg stoppet formlaget Viking – vant sin ...
4QbYvqzw0dzq    Sarpsborg stoppet formlaget Viking – vant sin ...
KQe1r30O3aJY    Sarpsborg stoppet formlaget Viking – vant sin ...
Vyb86KGNmevA                           Viking tapte mot Sarpsborg
y5eVPnwz1bEP    Sarpsborg stoppet formlaget Viking – vant sin ...
pmbkZXYN5azJ    Sarpsborg stoppet formlaget Viking - vant sin ...
4QbYvqzJpdzq    Sarpsborg stoppet formlaget Viking - vant sin ...
olejq6XwWdjN    Sarpsborg stoppet formlaget Viking - vant sin ...
N1aMQWk65aWm    Sarpsborg stoppet formlaget Viking - vant sin ...
Vyb86KGxLevA     Så kom nedturen for Viking - tapte mot Sarpsborg
N1aMQWL

In [30]:
#Observing Cluster number 304_1
for Nounss in df[df.cluster_id=='304_1'].Nounss:
    print(Nounss)

['helga', 'spøkelsesjakt']
['helgen']
['helga']
['sykehusavdeling', 'helgen']
['poeng', 'helgen']
['helga']


In [46]:
#Observing Cluster number 304_1036
for Nounss in df[df.cluster_id=='304_1036'].Nounss:
    print(Nounss)

['gass', 'Norges', 'kjøttanlegg', 'ribbe', 'jul', 'Synd']
['gass', 'Norges', 'kjøttanlegg', 'ribbe', 'jul', 'Synd']
['gass', 'Norges', 'kjøttanlegg', 'ribbe', 'jul', 'Synd']
['gass', 'Norges', 'kjøttanlegg', 'ribbe', 'jul', 'Synd']
['gass', 'Norges', 'kjøttanlegg', 'ribbe', 'jul', 'Synd']
['gass', 'Norges', 'kjøttanlegg', 'ribbe', 'jul', 'Synd']
['gass', 'Norges', 'kjøttanlegg', 'ribbe', 'jul', 'Synd']


In [50]:
df[df.cluster_id=='304_1036'].domain

id
YRdG5W40QaDz       www.nidaros.no
BDbD9W4Ondl2        www.h-avis.no
N1aMQW4XGaWm            www.sb.no
46dBL94n2a79          www.amta.no
N1aMQW41PaWm            www.ta.no
openrJ5O5e7A    www.nettavisen.no
y1aKZ9nnGeQG    www.nettavisen.no
Name: domain, dtype: object

In [51]:
#Observing Cluster number 304_136
df[df.cluster_id=='304_1036'].title

id
YRdG5W40QaDz    Bånn gass på Norges største kjøttanlegg, men d...
BDbD9W4Ondl2    (+) Bånn gass på Norges største kjøttanlegg, m...
N1aMQW4XGaWm    (+) Bånn gass på Norges største kjøttanlegg – ...
46dBL94n2a79    Bånn gass på Norges største kjøttanlegg – men ...
N1aMQW41PaWm    Bånn gass på Norges største kjøttanlegg – men ...
openrJ5O5e7A    Bånn gass på Norges største kjøttanlegg – men ...
y1aKZ9nnGeQG    Bånn gass på Norges største kjøttanlegg – men ...
Name: title, dtype: object

In [31]:
df[df.cluster_id=='304_1'].domain

id
olejq6zPWdjN               www.nrk.no
YqaQ0gPXlenj    mammapaahjul.blogg.no
BDbD9W31xdl2                www.tb.no
5xe7LV4QOa7r               www.nrk.no
pmbkZXp2NazJ      geriatriks.blogg.no
APdRoj9XEeGy     dinside.dagbladet.no
Name: domain, dtype: object

In [None]:
for Nounss in df[df.cluster_id=='304_1'].Nounss:
    print(Nounss)

In [34]:
for Nounss in df[df.cluster_id=='304_1017'].Nounss:
    print(Nounss)


['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['Abo', 'brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene']
['brusen', 'butikkhyllene', '\u2060', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', '\u2060', 'folk']
['brusen', 'butikkhyllene', '\u2060', 'folk']
['brusen', 'butikkhyllene', '\u2060', 'folk']
['brusen', 'butikkhyllene', '\u2060', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'butikkhyllene', 'folk']
['brusen', 'but

In [37]:
dff=df[df['Nounss'].str.contains('Øst-Europa', regex=False, case=False, na=False)]
dff

Unnamed: 0_level_0,url,og_url,domain,title,description,content,entities,authors,paywall,fb_total,...,cat_score,outbound_link_ids,inbound_link_ids,tagss,Nounss,title_clean,pub_day,pub_hour,Nounss_clean,cluster_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MvbmQNK63bYA,https://www.rb.no/smitten-sprer-seg-i-ost-euro...,https://www.rb.no/5-43-1673392,www.rb.no,Smitten sprer seg i Øst-Europa,Koronasmitten stiger raskt i land som Slovakia...,Koronasmitten stiger raskt i land som Slovakia...,"['Polen', 'Slovakia', 'Tsjekkia', 'Visehelsemi...",['NTB'],,0,...,0.314579,,,"[('Øst-Europa', False)]","['Smitten', 'Øst-Europa']",smitt øst europ,2021-10-31,21,smitt øst europ,304_650
YqaQ0gY5qenj,https://www.nrk.no/nyheter/okt-koronasmitte-i-...,,www.nrk.no,Økt koronasmitte i Øst-Europa,Tsjekkia registrerte søndag mer enn 5.000 smit...,Økt koronasmitte i Øst-Europa Tsjekkia registr...,"['Øst-Europa Tsjekkia', 'Polen', 'Slovakia', '...",['NRK'],,0,...,0.483529,,,"[('Øst-Europa', False)]",['Øst-Europa'],øst europ,2021-10-31,21,øst europ,304_650
zPdyr2ZJweQr,https://www.nettavisen.no/nyheter/utenriks/smi...,https://www.nettavisen.no/12-95-3424199010,www.nettavisen.no,Smitten sprer seg i Øst-Europa,Koronasmitten stiger raskt i land som Slovakia...,Koronasmitten stiger raskt i land som Slovakia...,"['Polen', 'Slovakia', 'Tsjekkia', 'Visehelsemi...","['NTB', 'NTB Tips meg']",,0,...,0.309345,,,"[('Øst-Europa', False)]","['Smitten', 'Øst-Europa']",smitt øst europ,2021-10-31,20,smitt øst europ,304_650
4QbYvqjrOdzq,https://www.rbnett.no/ntb/utenriks/2021/10/31/...,,www.rbnett.no,Smitten sprer seg i Øst-Europa,Koronasmitten stiger raskt i land som Slovakia...,Utenriks Tsjekkia registrerte søndag mer enn 5...,"['Polen', 'Utenriks Tsjekkia', 'Visehelseminis...",['(NTB) NTB-DPA'],,0,...,0.60551,,,"[('Øst-Europa', False)]","['Smitten', 'Øst-Europa']",smitt øst europ,2021-10-31,20,smitt øst europ,304_650
y5eVPn4lBbEP,https://www.itromso.no/ntb/uriks/2021/10/31/Sm...,,www.itromso.no,Smitten sprer seg i Øst-Europa,Koronasmitten stiger raskt i land som Slovakia...,uriks Tsjekkia registrerte søndag mer enn 5.00...,"['Polen', 'Tsjekkia', 'Visehelseminister Walde...",['(NTB) NTB-DPA'],,0,...,0.409833,,,"[('Øst-Europa', False)]","['Smitten', 'Øst-Europa']",smitt øst europ,2021-10-31,20,smitt øst europ,304_650
oQeZVrkv2epZ,https://www.folkebladet.no/utenriks/2021/10/31...,,www.folkebladet.no,Smitten sprer seg i Øst-Europa,Koronasmitten stiger raskt i land som Slovakia...,utenriks Tsjekkia registrerte søndag mer enn 5...,"['Polen', 'Tsjekkia', 'Visehelseminister Walde...",['(NTB) NTB-DPA'],,0,...,0.60551,,,"[('Øst-Europa', False)]","['Smitten', 'Øst-Europa']",smitt øst europ,2021-10-31,20,smitt øst europ,304_650
YqaQ0gYvMenj,https://www.ht.no/ntb/utenriks/2021/10/31/Smit...,,www.ht.no,Smitten sprer seg i Øst-Europa,Koronasmitten stiger raskt i land som Slovakia...,utenriks Tsjekkia registrerte søndag mer enn 5...,"['Polen', 'Tsjekkia', 'Visehelseminister Walde...",['(NTB) NTB-DPA'],,0,...,0.60551,,,"[('Øst-Europa', False)]","['Smitten', 'Øst-Europa']",smitt øst europ,2021-10-31,20,smitt øst europ,304_650
K9b6RKnzQbEv,https://www.altaposten.no/lokalt/NTB/NTB_utenr...,,www.altaposten.no,Smitten sprer seg i Øst-Europa,Koronasmitten stiger raskt i land som Slovakia...,NTB utenriks Tsjekkia registrerte søndag mer e...,"['NTB', 'Polen', 'Tsjekkia', 'Visehelseministe...",['(NTB) NTB-DPA'],,0,...,0.804374,,,"[('Øst-Europa', False)]","['Smitten', 'Øst-Europa']",smitt øst europ,2021-10-31,20,smitt øst europ,304_650
olejq6EOzdjN,https://www.framtidinord.no/ntb/utenriks/2021/...,,www.framtidinord.no,Smitten sprer seg i Øst-Europa,Koronasmitten stiger raskt i land som Slovakia...,utenriks Tsjekkia registrerte søndag mer enn 5...,"['Polen', 'Tsjekkia', 'Visehelseminister Walde...",['(NTB) NTB-DPA'],,0,...,0.60551,,,"[('Øst-Europa', False)]","['Smitten', 'Øst-Europa']",smitt øst europ,2021-10-31,20,smitt øst europ,304_650
QBeXopgq8dyK,https://www.smp.no/ntb/utenriks/2021/10/31/Smi...,,www.smp.no,Smitten sprer seg i Øst-Europa,Koronasmitten stiger raskt i land som Slovakia...,Smitten sprer seg i Øst-Europa - Sunnmørsposte...,,['(NTB) NTB-DPA'],,0,...,,,,"[('Øst-Europa', False)]","['Smitten', 'Øst-Europa']",smitt øst europ,2021-10-31,20,smitt øst europ,304_650


In [39]:
df[df.cluster_id=='304_650'].domain

id
MvbmQNK63bYA              www.rb.no
YqaQ0gY5qenj             www.nrk.no
zPdyr2ZJweQr      www.nettavisen.no
4QbYvqjrOdzq          www.rbnett.no
y5eVPn4lBbEP         www.itromso.no
oQeZVrkv2epZ     www.folkebladet.no
YqaQ0gYvMenj              www.ht.no
K9b6RKnzQbEv      www.altaposten.no
olejq6EOzdjN    www.framtidinord.no
QBeXopgq8dyK             www.smp.no
J0dN9WV5KbLO         www.adressa.no
Name: domain, dtype: object

In [38]:
for Nounss in df[df.cluster_id=='304_650'].Nounss:
    print(Nounss)


['Smitten', 'Øst-Europa']
['Øst-Europa']
['Smitten', 'Øst-Europa']
['Smitten', 'Øst-Europa']
['Smitten', 'Øst-Europa']
['Smitten', 'Øst-Europa']
['Smitten', 'Øst-Europa']
['Smitten', 'Øst-Europa']
['Smitten', 'Øst-Europa']
['Smitten', 'Øst-Europa']
['Smitten', 'Øst-Europa']


# Dictionary of all the articles more than 1 domain

In [42]:
#stories with more than one article
df[df.cluster_id.isin(clust_size[clust_size>1].index)]

# make a dictionary for all clusters with size>1
# collect all unique domains within that cluster
clust_domain_dict = {}
for cluster in clust_size[clust_size>1].index[:60]:
    clust_domain_dict[cluster] = df[df.cluster_id==cluster].domain.unique()
 

In [43]:
clust_domain_dict

{'304_0': array(['www.framtidinord.no', 'www.ha-halden.no', 'www.fjt.no',
        'www.itromso.no'], dtype=object),
 '304_1': array(['www.nrk.no', 'mammapaahjul.blogg.no', 'www.tb.no',
        'geriatriks.blogg.no', 'dinside.dagbladet.no'], dtype=object),
 '304_10': array(['www.sandnesposten.no', 'www.nrk.no'], dtype=object),
 '304_100': array(['www.mitthammerfest.no', 'www.ifinnmark.no', 'www.sb.no'],
       dtype=object),
 '304_1000': array(['www.ta.no', 'www.adressa.no'], dtype=object),
 '304_101': array(['www.adressa.no', 'www.sb.no', 'www.indre.no'], dtype=object),
 '304_1015': array(['www.dagbladet.no', 'www.engelskeklubber.com'], dtype=object),
 '304_1016': array(['www.rbnett.no', 'www.ba.no', 'www.hardanger-folkeblad.no'],
       dtype=object),
 '304_1017': array(['www.pd.no', 'www.gd.no', 'www.tvedestrandsposten.no', 'www.ao.no',
        'www.ranablad.no', 'www.sa.no', 'www.h-a.no', 'www.amta.no',
        'www.op.no', 'www.ha-halden.no', 'www.moss-avis.no', 'www.oa.no',
      

In [3]:
import pandas as pd
df = pd.read_csv('cop26_clust_noun.csv', index_col=0)


In [5]:
df.groupby('cluster_id').size()

cluster_id
304_0       4
304_1       6
304_10      2
304_100     3
304_1000    2
           ..
317_995     2
317_996     2
317_997     6
317_998     2
317_999     1
Length: 39942, dtype: int64