# 北京/东京奥运会开幕式数据可视化

- [数据清洗和观察](#data)

- [关键词提取和展示](#kw)

- [高流量视频信息展示](#spinfo)

- [时序情感分析](#sent)

In [1]:
import pandas as pd
import numpy as np
import re
from pyecharts.charts import WordCloud as wc
from pyecharts.charts import Bar, Line, Grid
import pyecharts.options as opts
from pyecharts.globals import ThemeType, SymbolType

# 提取关键词
import jieba
import jieba.analyse
from collections import Counter

# 情感分析
from snownlp import SnowNLP

import warnings
warnings.filterwarnings('ignore')

## <span id='data'> 数据清洗和观察 </span>

In [38]:
data_bj = pd.read_csv('data/bj_ceremony.csv')
data_dj = pd.read_csv('data/dj_ceremony.csv')
data_bj.drop('Unnamed: 0', axis=1, inplace=True)
data_dj.drop('Unnamed: 0', axis=1, inplace=True)

# 查看空值情况
print(data_bj.isnull().sum())
print()
print(data_dj.isnull().sum())

date     0
title    0
view     0
like     0
dtype: int64

date     0
title    0
view     0
like     0
dtype: int64


In [39]:
# 处理，除去view和like列中的“万”，转化成数值型数据

def turn_str_to_num(df):
    view = df.view
    like = df.like
    for i in range(len(df)):
        view[i] = re.sub('万', '', view[i])
        like[i] = re.sub('万', '', like[i])
    df['view'] = view.astype(float)
    df['like'] = like.astype(float)
    return df


data_bj = turn_str_to_num(data_bj)
data_dj = turn_str_to_num(data_dj)

In [40]:
# 处理date的格式
def process_date(df):
    dd = df.date.copy()
    for i in range(len(dd)):
        if re.search('小时前', dd[i]) is not None:
            '''爬虫是在2022-5-19完成的'''
            dd[i] = "2022-5-19"
        if re.search('昨天', dd[i]) is not None:
            dd[i] = "2022-5-18"
        if len(dd[i]) < 8:
            dd[i] = '2022-' + dd[i]
    df['date'] = dd
    return df


data_bj = process_date(data_bj)
data_dj = process_date(data_dj)

In [41]:
data_bj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 989 entries, 0 to 988
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    989 non-null    object 
 1   title   989 non-null    object 
 2   view    989 non-null    float64
 3   like    989 non-null    float64
dtypes: float64(2), object(2)
memory usage: 31.0+ KB


In [6]:
data_bj.head()[: 3]

Unnamed: 0,date,title,view,like
0,2021-5-18,2008年北京奥运会开幕式,998.7,75.0
1,2021-8-11,全网首发 4K修复《2008年北京奥运会开幕式》文艺演出及点火仪式,100.0,7.3
2,2021-7-23,【1080P 60帧 全回顾】2008北京奥运会开幕式,610.5,62.9


In [7]:
# 载入停用词
stopwords = pd.read_csv('./chinesestopword.txt', sep='\t\n', encoding='utf-8', names=['stopwords'], header=None, quoting=3)
stwlist = stopwords.stopwords.tolist()

## <span id='kw'> 关键词提取和展示 </span>

In [8]:
title_bj = data_bj['title'].tolist()
title_dj = data_dj['title'].tolist()
print(len(title_bj), '\n', title_bj[-1])
print(len(title_dj), '\n', title_dj[-1])

989 
 冬奥会开幕式
952 
 关于东京奥运会特别节目及开幕式...那些神吐槽们（包括日韩网友）...


In [9]:
def get_kw(content):
    pattern = re.compile('\d+')
    wordsCounter = Counter()

    for line in content:
        segs = jieba.lcut(line)
        for seg in segs:
            if len(seg) > 1 and seg not in ['\r\n', '奥运会', '开幕式'] and re.search(pattern, seg) is None and seg not in stwlist:
                wordsCounter[seg] += 1
    return wordsCounter.most_common(200)

In [10]:
kw_bj, kw_dj = get_kw(title_bj), get_kw(title_dj)
kw_bj[:5]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zhang\AppData\Local\Temp\jieba.cache
Loading model cost 0.546 seconds.
Prefix dict has been built successfully.


[('北京', 701), ('奥运', 85), ('回顾', 75), ('中国', 69), ('张艺谋', 55)]

In [11]:
cloudbj = wc()
cloudbj.add(' ', kw_bj, shape=SymbolType.RECT)
cloudbj.set_global_opts(title_opts=opts.TitleOpts(title="北京奥运会开幕式"))

clouddj = wc(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
clouddj.add(' ', kw_dj, shape=SymbolType.RECT)
clouddj.set_global_opts(title_opts=opts.TitleOpts(title="东京奥运会开幕式"))
cloudbj.render_notebook()

In [12]:
clouddj.render_notebook()

## <span id='spinfo'> 高流量视频信息展示 </span>

In [13]:
# 播放量排名前十的视频信息
def top_view_10(data_bb):
    top_view = data_bb.sort_values(by='view', ascending=False)
    top_view = top_view.reset_index(drop=True).iloc[: 10]
    return top_view


tvbj = top_view_10(data_bj)
tvdj = top_view_10(data_dj)
tvbj

Unnamed: 0,date,title,view,like
0,2021-7-24,夏季奥运会开幕式全集（1972-2012&1932）,9992.0,164.0
1,2022-2-5,日本滑雪运动员奥运村vr初体验,9647.0,240.0
2,2019-8-17,08年奥运开幕式科比享受全场最高欢呼声，由此可见篮球在国内地位！,9403.0,84.0
3,2018-8-8,【怀旧/北京奥运会十周年】2008年8月8日新闻联播cut,9387.0,33.0
4,2018-8-10,【北京奥运十周年】『奥运•十年记忆』 2008年北京奥运会开幕式前的垫场表演（国外国内都没有...,9331.0,58.0
5,2021-10-1,【九九八十一】千年华夏 高燃！五千年中华史 国庆贺翻调填词【乐正绫】,9250.0,765.0
6,2019-12-5,北京奥运会开幕式（超清无解说）,9034.0,35.0
7,2021-9-21,【奥运会】高燃｜混剪｜健儿须快马，意气趁年少,8930.0,916.0
8,2022-3-12,2008.8.8北京奥运会开幕式CCTV广告,8919.0,139.0
9,2018-10-22,【古筝】您的好友拉弦乐器古筝上线,8919.0,137.0


In [14]:
# 点赞数排名前十的视频信息
def top_like_10(data_bb):
    top_like = data_bb.sort_values(by='like', ascending=False)
    top_like = top_like.reset_index(drop=True).iloc[: 10]
    return top_like


tlbj = top_like_10(data_bj)
tldj = top_like_10(data_dj)
tlbj

Unnamed: 0,date,title,view,like
0,2021-8-8,13年前的今天，北京惊艳世界！,6.7,9495.0
1,2021-9-11,北京奥运会开幕式之万名演员妆容的背后,8.1,9431.0
2,2021-7-24,【北京奥运】YYDS 德国小哥回顾08北京奥运会开幕式,20.3,8711.0
3,2022-1-17,韩国SBS电视台偷拍泄密北京奥运会开幕式,12.8,8001.0
4,2021-6-20,畅谈曾孝濂+乌合麒麟作品《岛》：世界海洋日、人与地球、人类命运共同体的中国智慧；大格局下的真...,7.7,7520.0
5,2021-11-5,北京奥运会开幕式的神细节，效果至今无人能超越,6.0,7249.0
6,2021-9-21,《胡旋舞》 ｜ 花好月圆会 殷硕舞动西域风情,15.2,7240.0
7,2017-8-9,"【怀旧向】【满满的都是回忆】Sarah Brightman,刘欢 - 我和你 (2008年北...",26.7,7103.0
8,2021-7-19,【双语字幕全网首发】2008年NHK北京奥运会开幕式全程，极佳的日语学习材料！,24.2,6643.0
9,2020-10-18,这就是我日思夜想没有呈现的北京奥运会凤凰涅槃浴火重生的点火仪式,52.9,6443.0


In [15]:
# 获取标题关键词，每个标题提取3个词，使用TF-IDF算法
def get_title_kw(df):
    title = df.title
    kws = []
    pattern = re.compile('\d+')
    for each in title:
        segs = jieba.analyse.extract_tags(each, topK=3)
        segs = filter(lambda x: len(x) > 1, segs)
        segs = filter(lambda x: re.search(pattern, x) is None, segs)
        segs = filter(lambda x: x not in stwlist, segs)
        for seg in segs:
            kws.append(seg)
    return kws


kw_view_bj = get_title_kw(tvbj)
kw_like_bj = get_title_kw(tlbj)
kw_view_dj = get_title_kw(tvdj)
kw_like_dj = get_title_kw(tldj)

In [16]:
# 清点词频
def count_freq(lst):

    def takeSecond(elem):
        return elem[1]

    words = set(lst)
    wordfreq = []
    for w in words:
        wordfreq.append((w, lst.count(w)))

    wordfreq.sort(key=takeSecond)
    return wordfreq


kwfreq_view_bj = count_freq(kw_view_bj)
kwfreq_like_bj = count_freq(kw_like_bj)
kwfreq_view_dj = count_freq(kw_view_dj)
kwfreq_like_dj = count_freq(kw_like_dj)
kwfreq_view_bj

[('古筝', 1),
 ('初体验', 1),
 ('解说', 1),
 ('超清', 1),
 ('CCTV', 1),
 ('欢呼声', 1),
 ('vr', 1),
 ('科比', 1),
 ('弦乐器', 1),
 ('直播', 1),
 ('新闻联播', 1),
 ('混剪', 1),
 ('奥运村', 1),
 ('乐正', 1),
 ('cut', 1),
 ('健儿', 1),
 ('翻调', 1),
 ('好友', 1),
 ('高燃', 2),
 ('开幕式', 3)]

In [17]:
# 用条形图展示结果
viewbarbj = Bar()
viewbarbj.add_xaxis([x[0] for x in kwfreq_view_bj])
viewbarbj.add_yaxis('词频', [x[1] for x in kwfreq_view_bj], color="#d14a61")
viewbarbj.reversal_axis()
viewbarbj.set_series_opts(label_opts=opts.LabelOpts(position="right"))
viewbarbj.set_global_opts(title_opts=opts.TitleOpts(title="top播放量关键词-北京"),
                          legend_opts=opts.LegendOpts(pos_left="20%"),)

likebarbj = Bar()
likebarbj.add_xaxis([x[0] for x in kwfreq_like_bj])
likebarbj.add_yaxis('词频', [x[1] for x in kwfreq_like_bj], color="#5793f3")
likebarbj.reversal_axis()
likebarbj.set_series_opts(label_opts=opts.LabelOpts(position="right"))
likebarbj.set_global_opts(title_opts=opts.TitleOpts(title="top点赞数关键词-北京", pos_right="30%"),
                          legend_opts=opts.LegendOpts(pos_right="20%"),)

viewbardj = Bar()
viewbardj.add_xaxis([x[0] for x in kwfreq_view_dj])
viewbardj.add_yaxis('词频', [x[1] for x in kwfreq_view_dj], color="#d14a61")
viewbardj.reversal_axis()
viewbardj.set_series_opts(label_opts=opts.LabelOpts(position="right"))
viewbardj.set_global_opts(title_opts=opts.TitleOpts(title="top播放量关键词-东京", pos_top="50%"),
                          legend_opts=opts.LegendOpts(pos_left="20%"),)

likebardj = Bar()
likebardj.add_xaxis([x[0] for x in kwfreq_like_dj])
likebardj.add_yaxis('词频', [x[1] for x in kwfreq_like_dj], color="#5793f3")
likebardj.reversal_axis()
likebardj.set_series_opts(label_opts=opts.LabelOpts(position="right"))
likebardj.set_global_opts(title_opts=opts.TitleOpts(title="top点赞数关键词-东京", pos_right="30%", pos_top="50%"),
                          legend_opts=opts.LegendOpts(pos_right="20%"),)

<pyecharts.charts.basic_charts.bar.Bar at 0x25d63b02a00>

In [18]:
top_view_like = Grid(init_opts=opts.InitOpts(width="1100px", height="1200px"))
top_view_like.add(viewbarbj, grid_opts=opts.GridOpts(pos_bottom="60%", pos_left="55%"))
top_view_like.add(likebarbj, grid_opts=opts.GridOpts(pos_bottom="60%", pos_right="55%"))
top_view_like.add(viewbardj, grid_opts=opts.GridOpts(pos_top="55%", pos_left="55%"))
top_view_like.add(likebardj, grid_opts=opts.GridOpts(pos_top="55%", pos_right="55%"))
top_view_like.render_notebook()

## <span id='sent'> 时序情感分析 </span>
将时间划分为10个时间段，展示不同时间段内视频的情感变化

In [44]:
def get_time_sent(df):
    time_sent = []
    # 将时间划分为10个时间段
    date = []
    timestep = int(df.shape[0] / 10)  # 日期划分的步长
    l1, l2 = 0, timestep
    
    from datetime import datetime
    df['date'] = pd.to_datetime(df['date'])  # date转为时间格式
    df = df.sort_values(by='date').reset_index(drop=True)
    # 提取每段时间内的关键词
    for i in range(10):
        # 取第l1~l2条数据
        cur_data = df.iloc[l1: l2, :]

        # 记录该时间段内第一个视频的发表时间
        date.append(cur_data.date.tolist()[0].strftime('%Y-%m-%d'))

        # 往后一个时间步
        l1 = l2
        l2 += timestep

        # 对该时间段内的title进行情感分析
        sent = []
        cur_content = cur_data['title'].tolist()
        for line in cur_content:
            line = re.sub(re.compile('【.*】'), '', line)  # 除去【】和其中的内容
            if len(line) == 0 or re.search(re.compile('\D'), line) is None:
                continue
            s = SnowNLP(line)
            sent.append(s.sentiments)
        
        avgscore = sum(sent) / len(sent)
        time_sent.append(round(avgscore, 3))  # 保存情感的分的平均值

    return time_sent, date

In [45]:
sent_bj, date_bj = get_time_sent(data_bj)

data_dj2 = data_dj[data_dj.date > '2021']  # 选择2021年（东京奥运会开始）及以后的数据
sent_dj, date_dj = get_time_sent(data_dj2)

In [46]:
date_dj

['2021-02-17',
 '2021-07-22',
 '2021-07-23',
 '2021-07-24',
 '2021-07-24',
 '2021-07-25',
 '2021-07-26',
 '2021-07-29',
 '2021-08-06',
 '2021-08-27']

In [47]:
line1 = (
    Line()
    .add_xaxis(date_bj)
    .add_yaxis("", sent_bj, 
               markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(type_="average")]), 
               is_step=True)
    .set_global_opts(title_opts=opts.TitleOpts(title="北京"),
                     yaxis_opts=opts.AxisOpts(min_=0.72))
)

line2 = (
    Line()
    .add_xaxis(date_dj)
    .add_yaxis("", sent_dj,
               markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(type_="average")]), 
               is_step=True)
    .set_global_opts(title_opts=opts.TitleOpts(title="东京", pos_top="48%"),
                     legend_opts=opts.LegendOpts(pos_top="55%"),
                     yaxis_opts=opts.AxisOpts(min_=0.72))
)

In [48]:
grid = Grid(init_opts=opts.InitOpts(width="900px", height="900px"))
grid.add(line1, grid_opts=opts.GridOpts(pos_bottom="60%"))
grid.add(line2, grid_opts=opts.GridOpts(pos_top="60%"))
grid.render_notebook()