# **Python & SnowNLP: Sentiment Analysis for the Chinese Language**

In [1]:
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup

def get_all_website_links(url):
    urls = set()
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            continue
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # cleaning URL
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if domain_name not in href:
            # external link
            urls.add(href)  # Add external links to the set
        else:
            # internal link
            urls.add(href)
    return urls

# Testing the function
url = "http://people.com.cn/"
all_links = get_all_website_links(url)
for link in all_links:
    print(link)


http://www.legaldaily.com.cn/
http://ent.people.com.cn/n1/2024/0326/c1012-40203315.html
mailto://kf@people.cn
http://lianghui.people.com.cn/2024/n1/2024/0311/c458609-40193833.html
https://app.people.cn/h5/topic/subject_normal/2303
http://spanish.peopledaily.com.cn/
http://pic.people.com.cn/n1/2024/0326/c1016-40203427.html
http://cpc.people.com.cn/20th/
http://society.people.com.cn/n1/2024/0326/c1008-40203252.html
http://leaders.people.com.cn/GB/178291/218130/458407/index.html
http://acftu.people.com.cn/
http://data.people.com.cn/
http://paper.people.com.cn/rmrbhwb/index.html
http://www.people.com.cn
http://society.people.com.cn/n1/2024/0326/c1008-40203280.html
http://lianghui.people.com.cn/2024/n1/2024/0312/c458561-40194554.html
http://ztjy.people.cn/n1/2023/0422/c457340-32670511.html
http://paper.people.com.cn/rmrb/html/2022-12/09/nw.D110000renmrb_20221209_2-05.htm
http://kf.people.com.cn/
http://finance.people.com.cn/n1/2024/0326/c1004-40203416.html
http://health.people.com.cn/n1/202

In [5]:
!pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.2-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
     ---------------------------------------- 0.0/7.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/7.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/7.4 MB 435.7 kB/s eta 0:00:17
     ---------------------------------------- 0.0/7.4 MB 4

In [6]:
from newspaper import Article

In [7]:
url = "http://www.china.com.cn/"
a = Article(url, language='zh') # Chinese
a.download()
a.parse()

Building prefix dict from C:\Users\sulayako\AppData\Local\Programs\Python\Python312\Lib\site-packages\jieba\dict.txt ...
Dumping model to file cache C:\Users\sulayako\AppData\Local\Temp\jieba.cache
Loading model cost 1.6050844192504883 seconds.
Prefix dict has been built succesfully.


initialization via the SnowNLP class

In [8]:
pip install snownlp

Collecting snownlp
  Using cached snownlp-0.12.3-py3-none-any.whl
Installing collected packages: snownlp
Successfully installed snownlp-0.12.3
Note: you may need to restart the kernel to use updated packages.


In [9]:
from snownlp import SnowNLP

# Initialize SnowNLP with a Chinese text
s = SnowNLP(u'我喜欢红包')  # Remember to prefix with 'u' to indicate a Unicode string

# Tokenize the text into words
words = s.words

# Print the tokenized words
print(words)


['我', '喜欢', '红包']


In [10]:
from snownlp import SnowNLP
s = SnowNLP(u'我喜欢红包')
list(s.tags)

[('我', 'r'), ('喜欢', 'v'), ('红包', 'n')]

*   r: refers to pronoun.
*   v: refers to verb.
*   n: refers to noun.
*   w: refers to punctuation.



Coming back to the task is, now we need to get the keywords

In [11]:
s.keywords(10)

['红包', '喜欢']

In [12]:
s.summary(3)

['我喜欢红包']

In [13]:
s.sentiments

0.5116350732054405

split the whole text into sentences

In [15]:
from snownlp import SnowNLP

# Replace the input text with your desired text
input_text = """Press release
The spokesperson of the Chinese Embassy in the Philippines responded to reporters’ questions on U.S. Secretary of State Blinken’s erroneous remarks on the South China Sea issue
Taiwan Affairs Office of the State Council: Firmly opposed to any form of official exchanges between countries that have diplomatic relations with China and Taiwan
Ministry of Foreign Affairs: Congratulations to Mustafa on his assumption of office as the new Palestinian Prime Minister
Ministry of Foreign Affairs: Urging Japan to take concrete actions to completely break away from militarism
On March 19, 2024, Foreign Ministry Spokesperson Lin Jian hosted a regular press conference
The spokesman of the National Development and Reform Commission discussed new opportunities for China's economy with the business community
On March 18, 2024, Foreign Ministry Spokesperson Lin Jian hosted a regular press conference
Taiwan Affairs Office of the State Council: Mutual assistance in dealing with emergencies at sea is the right way for both sides in the Taiwan Strait
Spokesperson of the Ministry of Foreign Affairs responded to the negative remarks of the US Ambassador to China regarding China
Ministry of Foreign Affairs: Will provide assistance to Cuba in overcoming current difficulties
On March 15, 2024, Foreign Ministry Spokesperson Wang Wenbin hosted a regular press conference
Taiwan Affairs Office of the State Council: Firmly supports the mainland coast guard department in carrying out law enforcement inspections in waters near Kinmen
Ministry of Foreign Affairs: Spreading false information cannot stop China’s progress
Ministry of National Defense: India should stop actions that are inconsistent with easing the border situation"""

# Create SnowNLP object with the input text
s = SnowNLP(input_text)

# Keywords
print("The keywords are:")
print(*s.keywords(5), sep=", ")

# Summary
print("The summary is:")
print(*s.summary(2), sep=", ")
print(" ")

# Sentiment analysis on individual sentences
sentences = s.sentences
for sentence in sentences:
    s_sentence = SnowNLP(sentence)
    print("Sentence:", sentence)
    print("Sentiment:", s_sentence.sentiments)


The keywords are:
Foreign, Ministry, State, Affairs:, actions
The summary is:
Ministry of National Defense: India should stop actions that are inconsistent with easing the border situation, Ministry of Foreign Affairs: Spreading false information cannot stop China’s progress
 
Sentence: Press release
Sentiment: 0.4737672181921908
Sentence: The spokesperson of the Chinese Embassy in the Philippines responded to reporters’ questions on U.S. Secretary of State Blinken’s erroneous remarks on the South China Sea issue
Sentiment: 0.07737095533983573
Sentence: Taiwan Affairs Office of the State Council: Firmly opposed to any form of official exchanges between countries that have diplomatic relations with China and Taiwan
Sentiment: 0.186883963356826
Sentence: Ministry of Foreign Affairs: Congratulations to Mustafa on his assumption of office as the new Palestinian Prime Minister
Sentiment: 0.21384155132474048
Sentence: Ministry of Foreign Affairs: Urging Japan to take concrete actions to comp

In [16]:
from snownlp import SnowNLP

# Replace the input text with your desired text
input_text = """新闻稿
中国驻菲律宾使馆发言人就美国国务卿布林肯在南海问题上的错误言论答记者问
国台办：坚决反对建交国与中国台湾进行任何形式的官方往来
外交部：祝贺穆斯塔法就任巴勒斯坦新总理
外交部：敦促日方采取切实行动彻底脱离军国主义
2024年3月19日，外交部发言人林坚主持例行记者会
国家发改委新闻发言人与工商界共商中国经济新机遇
2024年3月18日，外交部发言人林坚主持例行记者会
国台办：台海双方互助应对海上突发事件才是正确之道
外交部发言人回应美国驻华大使涉华负面言论
外交部：将向古巴克服当前困难提供帮助
2024年3月15日，外交部发言人汪文斌主持例行记者会
国台办：坚决支持大陆海警部门在金门附近海域开展执法检查
外交部：散布虚假信息阻挡不了中国前进的脚步
国防部：印度应停止与缓和边境局势相悖的行动"""

# Create SnowNLP object with the input text
s = SnowNLP(input_text)

# Keywords
print("The keywords are:")
print(*s.keywords(5), sep=", ")

# Summary
print("The summary is:")
print(*s.summary(2), sep=", ")
print(" ")

# Sentiment analysis on individual sentences
sentences = s.sentences
for sentence in sentences:
    s_sentence = SnowNLP(sentence)
    print("Sentence:", sentence)
    print("Sentiment:", s_sentence.sentiments)


The keywords are:
外交部, 中国, 发言人, 应, 美国
The summary is:
外交部发言人林坚主持例行记者会, 外交部发言人林坚主持例行记者会
 
Sentence: 新闻稿
Sentiment: 0.6645037616567715
Sentence: 中国驻菲律宾使馆发言人就美国国务卿布林肯在南海问题上的错误言论答记者问
Sentiment: 0.3577115446454615
Sentence: 国台办：坚决反对建交国与中国台湾进行任何形式的官方往来
Sentiment: 0.9858811227899732
Sentence: 外交部：祝贺穆斯塔法就任巴勒斯坦新总理
Sentiment: 0.6603689313443105
Sentence: 外交部：敦促日方采取切实行动彻底脱离军国主义
Sentiment: 0.6304746833989406
Sentence: 2024年3月19日
Sentiment: 0.07991033427032224
Sentence: 外交部发言人林坚主持例行记者会
Sentiment: 0.5625949974090925
Sentence: 国家发改委新闻发言人与工商界共商中国经济新机遇
Sentiment: 0.9864792047159534
Sentence: 2024年3月18日
Sentiment: 0.16811374790283784
Sentence: 外交部发言人林坚主持例行记者会
Sentiment: 0.5625949974090925
Sentence: 国台办：台海双方互助应对海上突发事件才是正确之道
Sentiment: 0.991949991808882
Sentence: 外交部发言人回应美国驻华大使涉华负面言论
Sentiment: 0.9879813696514639
Sentence: 外交部：将向古巴克服当前困难提供帮助
Sentiment: 0.5640603991776584
Sentence: 2024年3月15日
Sentiment: 0.09299983468069173
Sentence: 外交部发言人汪文斌主持例行记者会
Sentiment: 0.5218354548251938
Sentence: 国台办：坚决支持大陆海警部门在金门

In [23]:
import pickle
from snownlp import SnowNLP

# Assuming you have trained your model and named it 'model'
model = s

# Save the model as a pickle file
with open("s.pkl", "wb") as f:
    pickle.dump(model, f)