In [1]:
#!pip install -U dhlab
import dhlab as dh
from sentiment import *
import pandas as pd


wordcloud er ikke installert, kan ikke lage ordskyer


# Sentimentanalyse i aviskorpus 

1. [Hent inn korpus](#korpusdefinisjon)
2. [Tell ordfrekvens av nøkkelord](#forekomster-per-avisutgave)
3. [Score sentiment for forekomster](#sentimentscore) 
4. [Lagre utdata](#lagre-data)

## Korpusdefinisjon

**Fyll inn filsti** til lokal csv- eller excel-fil med korpusdefinisjon:


In [458]:
# INPUT REQUIRED: path to a local CSV or Excel file holding the corpus definition.
# While the placeholder value is left unchanged, the corpus is instead built from
# the search parameters defined further down.
file_path = "FYLL INN"  

Eventuelt **definer et korpus** direkte med parametere:

In [459]:
# INPUT REQUIRED: search parameters, used when no corpus file has been given.
word = "barnevernet"                # keyword passed to the full-text search
city = "Kristiansand"               # value for the "city:" metadata filter
from_year, to_year = 2000, 2022     # publication period
number_of_docs = 10_000             # upper limit on the number of documents


In [460]:
# The corpus definition is kept in the `corpus` variable.
# With the placeholder path still in place, a dhlab.Corpus is built from the search
# parameters; otherwise the corpus is read from the given file
# (presumably also a dhlab.Corpus -- TODO confirm against load_corpus_from_file).
if file_path == "FYLL INN":
    corpus = dh.Corpus(
        doctype="digavis",
        fulltext=word,
        freetext=f"city: {city}",
        from_year=from_year,
        to_year=to_year,
        limit=number_of_docs,
    )
else:
    corpus = load_corpus_from_file(file_path)

In [456]:
# Remove columns that hold no data; the result is the table that later cells
# sample from.
news_df = strip_empty_cols(corpus)

## Ordfrekvens av nøkkelord

**Tell ordfrekvensen** til nøkkelordet eller flere ord per utgivelse.
  
Fyll inn listen med søkeord etter eget ønske.

In [464]:
# INPUT REQUIRED: the search terms, one word form per line.
# The string is turned into a list with make_list() before counting.
search_terms = """
barnevern
barnevernet
barnevernets
barneverntjeneste
barneverntjenesten
barneverntjenester
barnevernloven
barnevernsnemda
"""

In [465]:
# Keep the parsed list under its own name instead of overwriting `search_terms`:
# the cell can then be re-run safely, and later cells that call
# make_list(search_terms) still receive the raw multi-line string.
search_term_list = make_list(search_terms)      # turn the multi-line string into a list
count_terms = corpus.count(search_term_list)    # count occurrences of the search terms in the corpus
word_freqs = count_terms.frame.T                # transpose: one row per urn, one column per word

### Forekomster per avisutgave

`word_freqs` viser absolutt frekvens for hvert søkeord (`word`) per avisutgivelse (`urn`).

Begrepsforklaring:
- `urn`: Unique resource name. Nasjonalbibliotekets interne ID til det scannede bildet av et papirdokument. 
  - Scanningsdatoen er bakt inn i URN-en i ISO-8601-format, og samsvarer som regel med utgivelsesdatoen.
  - Avistittelen er også bakt inn i URN-en
  - Eksempel: `URN:NBN:no-nb_digavis_aamliavisa_null_null_20090319_3_11_1` er Åmliavisas avisutgave den 19. mars 2009
- `word`: eksakt ordform som forekommer i teksten.

In [466]:
word_freqs

word,barnevern,barnevernet,barnevernets,barnevernloven,barneverntjeneste,barneverntjenesten,barneverntjenester
urn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
URN:NBN:no-nb_digavis_budstikkasoegne_null_null_20080220_10_7_1,2.0,1.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_budstikkasoegne_null_null_20081105_10_44_1,0.0,1.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_budstikkasoegne_null_null_20090401_11_13_1,0.0,2.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_budstikkasoegne_null_null_20090513_11_18_1,1.0,7.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_budstikkasoegne_null_null_20091021_11_39_1,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
URN:NBN:no-nb_digavis_sorlandsavisenkr_null_null_20180418_2_11_1,0.0,2.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_sorlandsavisenkr_null_null_20181121_2_42_1,0.0,1.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_sorlandsavisenkr_null_null_20190314_3_0_1,0.0,2.0,0.0,0.0,0.0,0.0,0.0
URN:NBN:no-nb_digavis_sorlandsavisenkr_null_null_20190328_3_0_1,0.0,2.0,0.0,0.0,0.0,0.0,0.0


### Ordfrekvenser totalt i korpuset

Tabellen viser totalfrekvensene av hvert søkeord i korpuset.

In [467]:
word_freqs.sum()

word
barnevern             1398.0
barnevernet           5298.0
barnevernets           396.0
barnevernloven          66.0
barneverntjeneste       20.0
barneverntjenesten     231.0
barneverntjenester      39.0
dtype: float64

## Sentimentscore

Beregn en sentimentscore til kontekstene der nøkkelordet forekommer.
  

In [432]:
# INPUT REQUIRED
# Number of words of context to include before and after the keyword.
before = 20
after = 20

In [434]:
# Work on a small sample: keep only the first six rows of the corpus table.
df = news_df.iloc[0:6]
# Build a corpus object from the sampled rows; the step-by-step cells below use it.
c = dh.Corpus.from_df(df)

In [472]:
import numpy as np


def unpivot(frame):
    """Melt a wide table into long format, one row per (row label, column label) pair.

    The row labels of ``frame`` end up in the ``word`` column, its column labels
    in the ``urn`` column and the cell values in the ``frequency`` column. Rows
    are emitted column by column: every row label for the first column of
    ``frame``, then every row label for the second column, and so on.

    Adapted from the ``unpivot`` helper in the pandas reshaping guide:
    https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
    """
    n_rows, n_cols = frame.shape
    row_labels = np.asarray(frame.index)
    col_labels = np.asarray(frame.columns)

    # Column-major ("F") flattening lines the values up with the repeated
    # column labels and the tiled row labels.
    long_format = {
        "word": np.tile(row_labels, n_cols),
        "urn": col_labels.repeat(n_rows),
        "frequency": frame.to_numpy().ravel("F"),
    }
    return pd.DataFrame(long_format, columns=["word", "urn", "frequency"])




def score_sentiment(urn, word, before, after):
    """Score the contexts of ``word`` in the publication identified by ``urn``.

    The collocations of ``word`` are fetched with a window of ``before`` and
    ``after`` (forwarded unchanged to ``fetch_finegrained_collocations``) and
    scored with ``coll_sentiment``.

    Returns a dict with the keys ``positive``, ``negative`` and
    ``sentimentscore``, where ``sentimentscore`` is positive minus negative.
    """
    context_words = fetch_finegrained_collocations(urn, word, before, after)
    positive, negative = coll_sentiment(context_words, word, return_score_only=True)
    return {
        "positive": positive,
        "negative": negative,
        "sentimentscore": positive - negative,
    }

def count_and_score_target_word(
    corpus: dh.Corpus,
    search_terms: "str | list[str]",
    before: "int | None" = None,
    after: "int | None" = None,
):
    """Count the search terms in every document of ``corpus`` and score their contexts.

    Parameters
    ----------
    corpus
        Corpus whose documents are searched.
    search_terms
        Either the raw multi-line string of search terms, or a list of terms
        that has already been split.
    before, after
        Context window forwarded to ``score_sentiment``. When left as ``None``
        the notebook-level variables ``before`` / ``after`` are used, which is
        what the function did before these parameters existed.

    Returns
    -------
    pandas.DataFrame
        The corpus table from ``strip_empty_cols(corpus)`` inner-joined on
        ``urn`` with one row per (word, urn) pair, carrying the columns
        ``word``, ``frequency``, ``positive``, ``negative`` and
        ``sentimentscore``.
    """
    # Backward compatibility: fall back to the notebook-level settings when the
    # caller does not pass an explicit context window.
    if before is None:
        before = globals()["before"]
    if after is None:
        after = globals()["after"]

    # Accept a list that has already been split as well as the raw multi-line
    # string; make_list() is only applied to strings.
    if isinstance(search_terms, str):
        words = make_list(search_terms)
    else:
        words = list(search_terms)

    counts = corpus.count(words)
    term_counts = unpivot(counts.frame)

    # One score_sentiment() call per (word, urn) row; "expand" turns the returned
    # dicts into the columns positive / negative / sentimentscore.
    scored_terms = term_counts.apply(
        lambda row: score_sentiment(row.urn, row.word, before, after),
        axis=1,
        result_type="expand",
    )
    term_counts.loc[:, ["positive", "negative", "sentimentscore"]] = scored_terms

    df = strip_empty_cols(corpus)
    return df.merge(term_counts, how="inner", left_on="urn", right_on="urn")


In [471]:
# NOTE(review): in the recorded run this call on the full corpus ended in
# "ValueError: Expected object or value". The cells that follow repeat the same
# steps one at a time on the small sample corpus `c`.
result = count_and_score_target_word(corpus, search_terms)

ValueError: Expected object or value

In [435]:
# Count the search terms in the sample corpus `c`.
counts = c.count(make_list(search_terms))

In [437]:
# Reshape the count table to long format: one row per (word, urn) pair.
term_counts = unpivot(counts.frame)

In [438]:
term_counts

Unnamed: 0,word,urn,frequency
0,barnevern,URN:NBN:no-nb_digavis_faedrelandsvennen_null_n...,1.0
1,barnevernet,URN:NBN:no-nb_digavis_faedrelandsvennen_null_n...,17.0
2,barnevern,URN:NBN:no-nb_digavis_oestsida_null_null_20020...,1.0
3,barnevernet,URN:NBN:no-nb_digavis_oestsida_null_null_20020...,1.0
4,barnevern,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,0.0
5,barnevernet,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,2.0
6,barnevern,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,0.0
7,barnevernet,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,1.0
8,barnevern,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,1.0
9,barnevernet,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,2.0


In [439]:
# Score the context of every (word, urn) row and store the results in three
# new columns of `term_counts`.
term_counts.loc[:, ["positive", "negative", "sentimentscore"]] = term_counts.apply(
    lambda row: score_sentiment(row.urn, row.word, before, after),
    axis=1,
    result_type="expand",
)

In [440]:
term_counts

Unnamed: 0,word,urn,frequency,positive,negative,sentimentscore
0,barnevern,URN:NBN:no-nb_digavis_faedrelandsvennen_null_n...,1.0,3.0,3.0,0.0
1,barnevernet,URN:NBN:no-nb_digavis_faedrelandsvennen_null_n...,17.0,39.0,33.0,6.0
2,barnevern,URN:NBN:no-nb_digavis_oestsida_null_null_20020...,1.0,3.0,6.0,-3.0
3,barnevernet,URN:NBN:no-nb_digavis_oestsida_null_null_20020...,1.0,0.0,3.0,-3.0
4,barnevern,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,0.0,0.0,0.0,0.0
5,barnevernet,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,2.0,3.0,4.0,-1.0
6,barnevern,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,0.0,0.0,0.0,0.0
7,barnevernet,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,1.0,3.0,2.0,1.0
8,barnevern,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,1.0,0.0,1.0,-1.0
9,barnevernet,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,2.0,4.0,7.0,-3.0


In [441]:
# Attach the document metadata in `df` to the scored rows. The inner join on
# `urn` keeps only the URNs that occur in both tables.
newdf = df.merge(term_counts, how="inner", left_on="urn", right_on="urn")

In [442]:
newdf

Unnamed: 0,dhlabid,urn,title,city,timestamp,year,doctype,word,frequency,positive,negative,sentimentscore
0,201310892,URN:NBN:no-nb_digavis_oestsida_null_null_20020...,oestsida,Kristiansand,20020925,2002,digavis,barnevern,1.0,3.0,6.0,-3.0
1,201310892,URN:NBN:no-nb_digavis_oestsida_null_null_20020...,oestsida,Kristiansand,20020925,2002,digavis,barnevernet,1.0,0.0,3.0,-3.0
2,201310902,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,oestsida,Kristiansand,20030226,2003,digavis,barnevern,0.0,0.0,0.0,0.0
3,201310902,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,oestsida,Kristiansand,20030226,2003,digavis,barnevernet,2.0,3.0,4.0,-1.0
4,201310903,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,oestsida,Kristiansand,20030312,2003,digavis,barnevern,0.0,0.0,0.0,0.0
5,201310903,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,oestsida,Kristiansand,20030312,2003,digavis,barnevernet,1.0,3.0,2.0,1.0
6,201310907,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,oestsida,Kristiansand,20030813,2003,digavis,barnevern,1.0,0.0,1.0,-1.0
7,201310907,URN:NBN:no-nb_digavis_oestsida_null_null_20030...,oestsida,Kristiansand,20030813,2003,digavis,barnevernet,2.0,4.0,7.0,-3.0
8,201310915,URN:NBN:no-nb_digavis_oestsida_null_null_20040...,oestsida,Kristiansand,20040128,2004,digavis,barnevern,1.0,7.0,2.0,5.0
9,201310915,URN:NBN:no-nb_digavis_oestsida_null_null_20040...,oestsida,Kristiansand,20040128,2004,digavis,barnevernet,1.0,0.0,0.0,0.0


----

##  Lagre data 

**Skriv utdata** til en CSV-fil på ditt lokale filsystem.

Kolonner:
-  dhlab-ID (digitalt tekstobjekt)
-  URN (digitalt bilde av tekstdokument)
-  avistittel
-  sted
-  dato
-  år
-  dokumenttype
-  nøkkelord
-  ordfrekvens
-  positiv score
-  negativ score
-  sentimentscore (positiv score minus negativ score)


In [468]:
# The output file name records the period and the keyword of the search.
outputfile = f"sentimentanalyse_aviskorpus_{from_year}-{to_year}_term_{word}.csv"
# index=False: the row index is only a running number left over from the merge;
# writing it would add an unnamed first column that is not among the documented
# output columns.
newdf.to_csv(outputfile, index=False)