In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import json
import nltk
from statistics import mean, median, variance, quantiles
import re

In [2]:
path = "labMT.txt"
happiness_ranks = pd.read_csv(path, sep='\t', skiprows = lambda x: x in [0,1,2], index_col = False)
happiness_ranks.head()

Unnamed: 0,word,happiness_rank,happiness_average,happiness_standard_deviation,twitter_rank,google_rank,nyt_rank,lyrics_rank
0,laughter,1,8.5,0.9313,3600,--,--,1728
1,happiness,2,8.44,0.9723,1853,2458,--,1230
2,love,3,8.42,1.1082,25,317,328,23
3,happy,4,8.3,0.9949,65,1372,1313,375
4,laughed,5,8.26,1.1572,3334,3542,--,2332


In [3]:
with open("newsdata.json", "r", encoding="utf-8") as f:
        all_articles = json.load(f)

In [5]:
all_articles.pop('nextPage')

KeyError: 'nextPage'

In [6]:
N = len(all_articles)

In [6]:
def calculate_sentiment(text_string: str):
    # Tokenize page
    remove_punc = re.sub(r'[^\w\s]', ' ', text_string)
    clean_text = re.sub(r'\s+', ' ', remove_punc).strip()
    text_lower = clean_text.lower()
    content = [w for w in text_lower.split() if w.isalpha()] #extract words from string

    # Calculate frequency of words
    fdist = nltk.FreqDist(content)

    # Calculate sentiment
    weighted_sentiment_score = 0
    count = 0
    for w, f in fdist.items():
        try:
            idx = np.where(happiness_ranks.word == w.lower())
            idx = idx[0][0]
            weighted_sentiment_score += happiness_ranks['happiness_average'].iloc[idx]*f
            count += f
        except:
            continue
    
    if weighted_sentiment_score > 0:
        return weighted_sentiment_score/count, content
    else:
        return None, content

In [7]:
sentiments_title = {}
sentiments_description = {}
wl_title = {}
wl_description = {}


count_no_title = 0
count_no_description = 0

for key, article in all_articles.items():
    try:
        title = article['title']
        # List of words of each article title is saved in wl_titles
        sentiment, wl_title[key] = calculate_sentiment(title)
        if sentiment is None:
            count_no_title +=1
        else:
            sentiments_title[key] = sentiment
    except:
        print(f"Could not calculate sentiment for title of {key}")

    try:
        description = article['description']
        # List of words of each article description is saved in wl_description
        sentiment, wl_description[key] = calculate_sentiment(description)
        if sentiment is None:
            count_no_description +=1
        else:
            sentiments_description[key] = sentiment
    except:
        print(f"Could not calculate sentiment for title of {key}")


In [8]:
print(f"Could not calculate sentiment for {count_no_title} articles (or about {round(count_no_title/N*100,2)}%)")
print(f"Could not calculate sentiment for {count_no_description} articles (or about {round(count_no_description/N*100,2)}%)")

Could not calculate sentiment for 23 articles (or about 0.43%)
Could not calculate sentiment for 6 articles (or about 0.11%)


In [None]:
# Save data

with open("sentiments_title.json", "w") as f:
    json.dump(sentiments_title, f)

with open("sentiments_description.json", "w") as f:
    json.dump(sentiments_description, f)

with open("wl_title.json", "w") as f:
    json.dump(wl_title, f)

with open("wl_description.json", "w") as f:
    json.dump(wl_description, f)

In [7]:
# Load data
with open("sentiments_title.json", "r") as f:
    sentiments_title = json.load(f)

with open("sentiments_description.json", "r") as f:
    sentiments_description = json.load(f)

with open("wl_title.json", "r") as f:
    wl_title = json.load(f)

with open("wl_description.json", "r") as f:
    wl_description = json.load(f)

In [8]:
path_emotions = "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
emotion_scores = pd.read_csv(path_emotions, sep='\t', index_col = False, header= None, names = ['word', 'emotion', 'associated'])

In [9]:
emotion_scores

Unnamed: 0,word,emotion,associated
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0
...,...,...,...
141535,zoom,negative,0
141536,zoom,positive,0
141537,zoom,sadness,0
141538,zoom,surprise,0


In [10]:
articles_title_emotions = {}
articles_description_emotions = {}
articles_title_no_emotions = 0
articles_description_no_emotions = 0

for key, article in all_articles.items():
    content = wl_title[key]
    article_emotions = {}
    for word in content:
        if word in emotion_scores.word.values:
            try:
                subset = emotion_scores[(emotion_scores.word == word) & (emotion_scores.associated == 1) ]['emotion']
                for e in subset:
                    if e not in article_emotions:
                        article_emotions[e] = 1
                    else:
                        article_emotions[e] += 1
            except:
                continue

    if len(article_emotions)>0:
        articles_title_emotions[key] = article_emotions
    else:
        articles_title_no_emotions += 1
        print('No emotions')

    # content = wl_description[key]
    # article_emotions = {}
    # for word in content:
    #     if word in emotion_scores.word.values:
    #         try:
    #             subset = emotion_scores[(emotion_scores.word == word) & (emotion_scores.associated == 1) ]['emotion']
    #             for e in subset:
    #                 if e not in article_emotions:
    #                     article_emotions[e] = 1
    #                 else:
    #                     article_emotions[e] += 1
    #         except:
    #             continue
    # if len(article_emotions)>0:
    #     articles_description_emotions[key] = article_emotions
    # else:
    #     articles_description_no_emotions += 1
    #     print('No emotions')
        

No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No emotions
No e

In [13]:
print(f"Could not find emotions for {articles_title_no_emotions} articles (or about {round(articles_title_no_emotions/N*100,2)}%)")
#print(f"Could not find emotions for {articles_description_no_emotions} articles (or about {round(articles_description_no_emotions/N*100,2)}%)")

Could not find emotions for 1167 articles (or about 21.61%)


In [41]:
import requests
import re
import urllib.parse
import urllib.parse
import owid

In [15]:
# https://ourworldindata.org/grapher/gdp-per-capita-worldbank
url_gdp_capita = "https://ourworldindata.org/grapher/gdp-per-capita-worldbank.csv?v=1&csvType=full&useColumnShortNames=true"

# https://ourworldindata.org/grapher/inflation-of-consumer-prices?overlay=download-data
url_inflation = "https://ourworldindata.org/grapher/inflation-of-consumer-prices.csv?v=1&csvType=full&useColumnShortNames=true" 

# https://ourworldindata.org/grapher/consumer-price-index
url_consumer_price_index = "https://ourworldindata.org/grapher/consumer-price-index.csv?v=1&csvType=full&useColumnShortNames=true"


# https://ourworldindata.org/grapher/median-age?overlay=download-data
url_median_age = "https://ourworldindata.org/grapher/median-age.csv?v=1&csvType=full&useColumnShortNames=true"

#https://ourworldindata.org/grapher/average-years-of-schooling?overlay=download-data
url_average_schooling_years = "https://ourworldindata.org/grapher/average-years-of-schooling.csv?v=1&csvType=full&useColumnShortNames=true"


In [None]:
# GDP per capita data (2024)
gdp_capita = pd.read_csv(url_gdp_capita)
print(len(list(np.unique(gdp_capita.Entity))))
gdp_capita = gdp_capita[(gdp_capita.Year == 2024)][['Entity', 'ny_gdp_pcap_pp_kd']]
print(len(list(np.unique(gdp_capita.Entity))))

285
197


In [60]:
# Inflation (2024)
inflation = pd.read_csv(url_inflation)
print(len(list(np.unique(inflation.Entity))))
inflation = inflation[inflation.Year == 2024][['Entity', 'fp_cpi_totl_zg']]
print(len(list(np.unique(inflation.Entity))))


205
171


In [None]:
# Consumer price index 2024
consumer_price_index = pd.read_csv(url_consumer_price_index)
print(len(list(np.unique(consumer_price_index.Entity))))
consumer_price_index = consumer_price_index[consumer_price_index.Year == 2024]
print(len(list(np.unique(consumer_price_index.Entity))))

192
158


In [None]:
# Median age (2024)
median_age = pd.read_csv(url_median_age)
print(len(list(np.unique(median_age.Entity))))
median_age =median_age[median_age.Year == 2024]
print(len(list(np.unique(median_age.Entity))))

253
253


In [71]:
# Years of school (2023)
school_years = pd.read_csv(url_average_schooling_years)
print(len(list(np.unique(school_years.Entity))))
school_years = school_years[school_years.Year == 2023]
print(len(list(np.unique(school_years.Entity))))

294
294


In [69]:
school_years.Year.max()

2023

>- This next part tries to find the biggest import and export partners. It is not finished and I dont know if it is worth the trouble

In [24]:
# To get import and export partners
baseurl = "https://en.wikipedia.org/w/api.php?"
action = "action=query"
title = "titles=List_of_countries_by_leading_trade_partners"
content = "prop=revisions&rvprop=content"
dataformat ="format=json"
query = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)
print(query)

https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&titles=List_of_countries_by_leading_trade_partners&format=json


In [33]:
req = urllib.request.Request(query, headers = {'User-Agent': 's214205@dtu.dk'})
wikiresponse = urllib.request.urlopen(req)
wikidata = wikiresponse.read()
# wikitext = wikidata.decode('utf-8')
wikitext_long = json.loads(wikidata)

In [34]:
wikitext = wikitext_long['query']['pages']['30815978']['revisions'][0]['*']

In [37]:
tables = re.findall(r"{\|[\s\S]*?\|}", wikitext)
print(tables[0])


{| class="wikitable sortable static-row-numbers static-row-header-text col2center col3center"
! scope="col" | Country
! scope="col" | Leading export market 
! scope="col" | Leading import source
|-
| {{flag| Afghanistan}}||{{IND}}||{{IRN}}
|-
| {{flag| Albania}} || colspan="2"|{{EU}}
|-
| {{flag| Algeria}} || colspan="2"|{{EU}}
|-
| {{flag| Andorra}}
| colspan="2"  |{{EU}}
|-
| {{flag| Angola}}||{{CHN}}||{{EU}}
|-
| {{flag| Antigua and Barbuda}}||{{UAE}}||{{USA}}
|-
| {{flag| Argentina}}||{{BRA}}||{{CHN}}
|-
| {{flag| Armenia}}|| colspan="2"|{{RUS}}
|-
| {{Flag|Aruba}}
|{{COL}}
|{{USA}}
|-
| {{flag| Australia}}|| colspan="2"|{{CHN}}
|-
| {{flag| Austria}}|| colspan="2"|{{EU}}
|-
| {{flag| Azerbaijan}}|| colspan="2"|{{EU}}
|-
| {{flag| Bahamas}}|| colspan="2"|{{USA}}
|-
| {{flag| Bahrain}}|| colspan="2"|{{SAU}}
|-
| {{flag| Bangladesh}}||{{USA}}||{{CHN}}
|-
| {{flag| Barbados}}|| colspan="2"|{{USA}}
|-
| {{flag| Belarus}}|| colspan="2"|{{RUS}}
|-
| {{flag| Belgium}}|| colspan="2"|{{EU}}

In [38]:
def parse_wikitext_table(text):
    rows = []
    current_row = []

    lines = text.strip().split("\n")

    for line in lines:
        line = line.strip()

        # Row separator
        if line.startswith("|-"):
            if current_row:
                rows.append(current_row)
            current_row = []
            continue
        
        # Header row
        if line.startswith("!"):
            cells = [c.strip() for c in line[1:].split("!!")]
            rows.append(cells)
            continue

        # Normal row
        if line.startswith("|"):
            cells = [c.strip() for c in line[1:].split("||")]
            current_row.extend(cells)

    # Append last row
    if current_row:
        rows.append(current_row)

    return rows

In [40]:
parsed_table = parse_wikitext_table(tables[0])

for row in parsed_table:
    print(row)


['scope="col" | Country']
['scope="col" | Leading export market']
['scope="col" | Leading import source']
['{{flag| Afghanistan}}', '{{IND}}', '{{IRN}}']
['{{flag| Albania}}', 'colspan="2"|{{EU}}']
['{{flag| Algeria}}', 'colspan="2"|{{EU}}']
['{{flag| Andorra}}', 'colspan="2"  |{{EU}}']
['{{flag| Angola}}', '{{CHN}}', '{{EU}}']
['{{flag| Antigua and Barbuda}}', '{{UAE}}', '{{USA}}']
['{{flag| Argentina}}', '{{BRA}}', '{{CHN}}']
['{{flag| Armenia}}', 'colspan="2"|{{RUS}}']
['{{Flag|Aruba}}', '{{COL}}', '{{USA}}']
['{{flag| Australia}}', 'colspan="2"|{{CHN}}']
['{{flag| Austria}}', 'colspan="2"|{{EU}}']
['{{flag| Azerbaijan}}', 'colspan="2"|{{EU}}']
['{{flag| Bahamas}}', 'colspan="2"|{{USA}}']
['{{flag| Bahrain}}', 'colspan="2"|{{SAU}}']
['{{flag| Bangladesh}}', '{{USA}}', '{{CHN}}']
['{{flag| Barbados}}', 'colspan="2"|{{USA}}']
['{{flag| Belarus}}', 'colspan="2"|{{RUS}}']
['{{flag| Belgium}}', 'colspan="2"|{{EU}}']
['{{flag| Belize}}', '{{GBR}}', '{{USA}}']
['{{flag| Benin}}', '{{BGD}}'