In [334]:
#!pip install -U dhlab
import dhlab as dh

from dhlab import Corpus
from dhlab.api.dhlab_api import document_corpus
import requests
from sentiment import *
import pandas as pd

from dhlab.api.dhlab_api import urn_collocation, get_document_frequencies, collocation, word_variant, word_paradigm
from dhlab.text.utils import urnlist




# Sentimentanalyse i aviskorpus 

1. [Hent inn korpus](#hent-inn-korpusdefinisjon)
2. [Tell ordfrekvens av nøkkelord](#forekomster-per-avisutgave)
3. [Score sentiment for forekomster](#sentimentscore) 
4. [Lagre utdata](#lagre-data)

## Hent inn korpusdefinisjon

To muligheter: 

1. **Last inn** fil med korpusdefinisjon, med parametere:
    - Filsti
2. **Definer korpus**, med input-variabler:
    - aviser 
    - fra 2000-2022 
    - nøkkelord 
    - evt stedsnavn for avisutgivelser
    - evt. avistittel
    - evt. spesifikke datoer

**Valgmulighet 1:**
Last opp eksisterende korpus


In [39]:
# Fyll inn filsti til lokal csv- eller excel-fil med korpusdefinisjon

file_path = "FYLL INN"  

In [40]:
#FIXME: slett celle

file_path = "./digavis-1999-2022.xlsx"

In [41]:
# lagre korpusdefinisjonen som et dhlab.Corpus-objekt
if file_path.endswith(".xlsx"):
    corpus = dh.Corpus.from_df(pd.read_excel(file_path)) 
elif file_path.endswith(".csv"):
    corpus = dh.Corpus.from_csv(file_path)
else:
    print("file_path must be a .csv or .xlsx file: ", file_path)

**Valgmulighet 2**: Definer korpus med gitte kriterier

Fyll inn variablene nedenfor. 

In [42]:
# 1.2. Bygg korpus iterativt av aviser for perioden med et gitt nøkkelord
search_term = "barnevern"
from_year = 1999
to_year = 2000

# Valgfritt:
titles: str = None      # titler på aviser. OBS! små bokstaver, ingen mellomrom, uten norske tegn æ, ø, å
cities: str = None      # byer
docs_per_day = 10       # begrensning på antall aviser per dag


In [43]:

def specific_news_corpus(**params):
        r = requests.post("https://api.nb.no/dhlab/build_corpus", json=params)
        assert (r.status_code == 200) and (r.json())
        return r.json()

def corpus_generator(keyword: str = search_term,from_year=from_year, to_year=to_year, limit=10):
    """Generate a large corpus of daily newspapers where ``keyword`` appears."""
    for date in timestamp_generator(from_year, to_year):
        parameters = {
            "doctype": "digavis",
            #"title":title,
            #"city": city,
            "freetext": f"timestamp: {date}",
            "fulltext": keyword,
            "limit": limit
            }
        #subcorpus = document_corpus(doctype="digavis", fulltext=keyword, freetext=f"timestamp: {date}", limit=docs_per_day)
        try:
            json_obj = specific_news_corpus(**parameters)
            yield json_obj
        except AssertionError as e: 
            continue

In [None]:
# OBS! Ikke ferdig testet kodeblokk
clist = list(*corpus_generator(search_term, from_year=1999, to_year=2000, limit=docs_per_day))

df = pd.DataFrame(clist)
corpus = dh.Corpus.from_df(df)

In [52]:
corpus_df = corpus.frame.dropna(axis=1)

In [119]:
corpus_df.sort_values("timestamp")

Unnamed: 0,dhlabid,urn,title,timestamp,year,doctype
19790,200047839,URN:NBN:no-nb_digavis_agderposten_null_null_19...,agderposten,19990107,1999,digavis
1713,200590031,URN:NBN:no-nb_digavis_fredriksstadblad_null_nu...,fredriksstadblad,19990107,1999,digavis
16510,201457153,URN:NBN:no-nb_digavis_romerikesblad_null_null_...,romerikesblad,19990108,1999,digavis
10734,203740385,URN:NBN:no-nb_digavis_sarpsborgarbeiderblad_nu...,sarpsborgarbeiderblad,19990108,1999,digavis
3892,203872839,URN:NBN:no-nb_digavis_indresmaalenenesavis_nul...,indresmaalenenesavis,19990108,1999,digavis
...,...,...,...,...,...,...
1341,203590330,URN:NBN:no-nb_digavis_meraakerposten_null_null...,meraakerposten,20211223,2021,digavis
7531,203952148,URN:NBN:no-nb_digavis_gjengangeren_null_null_2...,gjengangeren,20211224,2021,digavis
13731,203859506,URN:NBN:no-nb_digavis_vestnytt_null_null_20211...,vestnytt,20211224,2021,digavis
14276,203589164,URN:NBN:no-nb_digavis_glaamdalen_null_null_202...,glaamdalen,20211227,2021,digavis


## Ordfrekvens av nøkkelord

**Tell ordfrekvensen** til nøkkelordet eller flere ord per utgivelse, med disse variablene:
   - søkeord
   - korpus
  
Fyll inn listen med søkeord etter eget ønske.

In [283]:
search_terms = """
barnevern
barnevernet
barnevernets
barneverntjeneste
barneverntjenesten
barneverntjenester
barnevernloven
barnevernsnemda
"""

In [424]:
search_terms = make_list(search_terms)          # Gjør om flerlinje-strengen til en liste
count_terms = corpus.count(search_terms)        # Tell opp forekomstene av søkeordene i korpuset
word_freqs = count_terms.frame.T                # Snu om på rader/kolonner


### Forekomster per avisutgave

`word_freqs` viser absolutt frekvens for hvert søkeord (`word`) per avisutgivelse (`urn`).

Begrepsforklaring:
- `urn`: Unique resource name. Nasjonalbibliotekets interne ID til det scannede bildet av et papirdokument. 
  - Scanningsdatoen er bakt inn i URN-en i ISO-8601-format, og samsvarer som regel med utgivelsesdatoen.
  - Avistittelen er også bakt inn i URN-en
  - Eksempel: `URN:NBN:no-nb_digavis_aamliavisa_null_null_20090319_3_11_1` er Åmliavisas avisutgave den 19.mars 2009
- `word`: eksakt ordform som forekommer i teksten.

In [425]:
word_freqs

word,barnevern,barnevernet,barnevernets,barnevernloven,barnevernsnemda,barneverntjeneste,barneverntjenesten,barneverntjenester
urn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
URN:NBN:no-nb_digavis_aamliavisa_null_null_20090319_3_11_1,1.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_aamliavisa_null_null_20100114_3_1_1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_aamliavisa_null_null_20110217_4_6_1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_aamliavisa_null_null_20110303_4_8_1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_aamliavisa_null_null_20110331_4_12_1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
URN:NBN:no-nb_digavis_ytringen_null_null_20151201_34_91_1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_ytringen_null_null_20160614_35_45_1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_ytringen_null_null_20160909_35_70_1,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_ytringen_null_null_20170407_36_28_1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Ordfrekvenser totalt i korpuset

Tabellen viser totalfrekvensene av hvert søkeord i korpuset.

In [426]:
word_freqs.sum()

word
barnevern             25568.0
barnevernet           28218.0
barnevernets           1780.0
barnevernloven          452.0
barnevernsnemda           9.0
barneverntjeneste       500.0
barneverntjenesten     1645.0
barneverntjenester      563.0
dtype: float64

## Sentimentscore

Beregn en sentimentscore til kontekstene der nøkkelordet forekommer.

1. **Hent ut kontekster** (kollokasjoner) for nøkkelordet
   - nøkkelord
   - urn --> gir samlet sentimentscore for utgivelsen
   - evt. konkordans (100) per forekomst? --> gir sentimentscore per forekomst.
2. **Beregn sentimentscore** for forekomstene av nøkkelordet
   - kollokasjon
   - positiv ordliste
   - negativ ordliste 
  

In [427]:
# Fyll inn antall ord som skal telles med i konteksten før og etter nøkkelordet.
before=50
after=50

In [428]:

def fetch_finegrained_collocations(urn, word, before, after):
    coll = urn_collocation(
        urns=[urn],
        word=word,
        before=before,
        after=after
    )
    coll = coll.loc[[x for x in coll.index if x.isalpha()]]
    coll.index = [x.lower() for x in coll.index]
    coll = coll.groupby(coll.index).sum()
    return coll


In [432]:

def score_sentiment(row, word, before, after):
    collocations = fetch_finegrained_collocations(row.loc["urn"], word, before, after)
    pos, neg = coll_sentiment(collocations, word, return_score_only=True)
    values = [word, pos, neg, pos-neg]

    columns = ["target_word", "positive", "negative", "sentiment_sum"]

    return dict(zip(columns, values))
                

In [433]:
corpus_df.apply(score_sentiment, axis=1, result_type="expand", args=(search_term, before, after))


----

In [431]:
# tester greier 

for urn in corpus_df.urn:
    freq = word_freqs.loc[urn]
    occurring_words = word_freqs.index[word_freqs > 0]
    x = word_freqs[word]
    for word in occurring_words:
        print(f"{word} forekommer {word_freqs.loc[word]} ganger i {corpus_df.title[corpus_df.urn == urn]} {corpus_df["timestamp"][corpus_df.urn == urn] }")

KeyError: 'URN:NBN:no-nb_digavis_nordhordland_null_null_20160604_43_43_1'

In [420]:
from dhlab.api.dhlab_api import concordance_counts, concordance

word =occurring_words[1]

conc = concordance(urns=[urn], words=word, window=100, limit=100)
conccount = concordance_counts(urns=[urn], words=word, window=100, limit=100)

In [421]:

example = conc.conc.values[0]
example


'... Om tysdag vedtok eit einstemmig formannskap å utvide bemanninga i <b>barnevernet</b> med ei full stilling .'

In [381]:
from dhlab.nbtokenizer import tokenize

tokenize(example)

['...',
 'Om',
 'tysdag',
 'vedtok',
 'eit',
 'einstemmig',
 'formannskap',
 'å',
 'utvide',
 'bemanninga',
 'i',
 '<',
 'b',
 '>',
 'barnevernet',
 '<',
 '/',
 'b',
 '>',
 'med',
 'ei',
 'full',
 'stilling',
 '.']

In [333]:

coll

Unnamed: 0,counts
advokat,1
akutte,4
allereie,1
andre,6
arbeidsgjevaren,1
...,...
å,11
åmli,20
år,10
åttande,4


##  Lagre data 

**Skriv utdata** til en CSV-fil på ditt lokale filsystem.

Kolonner:
-  URN
-  dato
-  avistittel
-  sted
-  nøkkelord
-  ordfrekvens
-  postiv score
-  negativ score
-  sum sentimentscore


In [None]:
outputfile = f"data/sentimentanalyse_{search_term}_aviskorpus_{from_year}-{to_year}.csv"
corpus_df.to_csv(outputfile)