In [1]:
from pyecharts.charts import WordCloud
import pandas as pd
import numpy as np
import jieba
from collections import Counter
from pyecharts.globals import SymbolType
from pyecharts import options as opts
from pyecharts.globals import ThemeType
from pyecharts.charts import Bar, Page
from pyecharts.faker import Faker
from pyecharts.charts import Line
from pyecharts.charts import *
import os
import re
from pyecharts.commons.utils import JsCode

In [2]:
# Load the scraped Bilibili dataset and keep only the first row for every title.
df = (
    pd.read_csv('data/bilibili_data_analysis.csv')
    .drop_duplicates(subset=['标题'], keep='first')
)

In [3]:
# Outlier handling: discard rows whose '时间' value exceeds 100. A negated mask
# (rather than `<= 100`) keeps rows where '时间' is missing, exactly as before.
# NOTE(review): the original cell also evaluated `df[df.isnull().values==True]`
# in the middle of the cell. Its result was discarded (nothing was displayed
# and nothing was dropped), so the expression is removed; rows with missing
# values are still NOT filtered out here.
df = df[~(df['时间'] > 100)]
# Renumber the rows 0..n-1 after filtering.
df = df.reset_index(drop=True)

In [4]:
# Like-to-play ratio per video. Vectorised division replaces the element-wise
# map/lambda: it is faster, and a zero play count produces inf/NaN for that row
# instead of potentially aborting the whole cell with ZeroDivisionError.
df['点赞/播放比'] = df['点赞'] / df['播放']

# 标题分析

In [5]:
# Remove a stale title dump left behind by a previous run, if there is one.
# The file this notebook writes and reads is 'bilibili_title1.txt'; the
# original check targeted 'bilibili_title.txt', so it never touched the file
# that is actually used.
title_dump_path = 'bilibili_title1.txt'
if os.path.isfile(title_dump_path):
    os.remove(title_dump_path)

**标题写入TXT文档**

In [6]:
# Dump every title into one text file for the jieba segmentation step.
# - Missing titles (NaN) are skipped and values are coerced to str, so a
#   non-string cell cannot make fp.write() raise TypeError.
# - A single space is written after each title. The file is still one line
#   long (the counting step only reads the first line), but the last word of
#   one title can no longer fuse with the first word of the next one during
#   segmentation. The one-character ' ' tokens are dropped later by the
#   len(token) > 1 filter.
# - The `with` block closes the file; the extra fp.close() was redundant.
with open('bilibili_title1.txt', 'w', encoding="utf-8") as fp:
    for title in df['标题'].dropna().astype(str):
        fp.write(title + ' ')

**jieba分词**

In [7]:
# Segment the title dump with jieba: one token list per line of the file, with
# the line break removed before cutting. The file is closed by the `with` block.
with open('bilibili_title1.txt', 'r', encoding="utf-8") as fp:
    title_cut_list = [jieba.lcut(line.replace("\n", "")) for line in fp]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\70780\AppData\Local\Temp\jieba.cache
Loading model cost 0.605 seconds.
Prefix dict has been built successfully.


In [8]:
# Load the stop-word file into a set (each stripped line is one entry) so that
# membership tests are cheap. The `with` block guarantees the handle is closed;
# the original opened the file inline and never closed it.
with open('data/stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    content = [line.strip() for line in stopword_file]
stopwords = set(content)

**去掉一些停用词**

In [9]:
# Count how often each token occurs across ALL segmented lines. The original
# only walked title_cut_list[0], which silently ignores every line after the
# first one (and raises IndexError when the list is empty).
# Single-character tokens and stop words are skipped.
c = Counter(
    token
    for tokens in title_cut_list
    for token in tokens
    if len(token) > 1 and token not in stopwords
)

**绘制词云图**

In [10]:
# Word cloud built from the 200 most frequent title tokens.
word_counts_top200 = c.most_common(200)

cloud_canvas = opts.InitOpts(width='1350px', height='750px', theme=ThemeType.MACARONS)
word1 = WordCloud(init_opts=cloud_canvas)
word1.add(
    '词频',
    data_pair=word_counts_top200,
    word_size_range=[15, 108],
    textstyle_opts=opts.TextStyleOpts(font_family='cursive'),
    shape=SymbolType.DIAMOND,
)
word1.set_global_opts(title_opts=opts.TitleOpts('B站标题词云图'))

word1.render_notebook()

**对标题关键词进行分析**

In [11]:
# Split the 60 most frequent tokens into parallel label/count lists for the
# bar chart. most_common() is evaluated once (the original re-ranked the
# counter twice per loop iteration), and slicing copes with fewer than 60
# distinct tokens instead of raising IndexError.
top_60_words = c.most_common(200)[:60]
x_data = [word for word, _ in top_60_words]
y_data = [count for _, count in top_60_words]

In [12]:
# Bar chart of the most frequent title tokens (x: token, y: occurrences).
b = Bar(init_opts=opts.InitOpts(theme=ThemeType.CHALK, height='500px', width='1000px'))
b.add_xaxis(x_data)
b.add_yaxis(
    "出现频次",
    y_data,
    label_opts=opts.LabelOpts(is_show=False, position='top'),
    itemstyle_opts=opts.ItemStyleOpts(
        # Vertical colour gradient, evaluated by ECharts in the browser.
        color=JsCode("""new echarts.graphic.LinearGradient(0, 0, 0, 1, 
                                             [{
                                                 offset: 0,
                                                 color: 'rgb(255,99,71)'
                                             }, {
                                                 offset: 1,
                                                 color: 'rgb(32,178,170)'
                                             }])""")
    ),
)
b.set_global_opts(
    title_opts=opts.TitleOpts(title="标题词频"),
    toolbox_opts=opts.ToolboxOpts(),
    legend_opts=opts.LegendOpts(is_show=False),
    tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross'),
    # The categories are keywords. The original axis name '发布时间' (publish
    # time) was carried over from the time-based charts and mislabelled the axis.
    xaxis_opts=opts.AxisOpts(
        name='关键词',
        type_='category',
        axislabel_opts=opts.LabelOpts(rotate=45),
    ),
    yaxis_opts=opts.AxisOpts(
        name='',
        splitline_opts=opts.SplitLineOpts(
            is_show=True,
            linestyle_opts=opts.LineStyleOpts(type_='dash'),
        ),
    ),
)
b.render_notebook()

In [13]:
# Count titles that contain a full-width question mark, a full-width
# exclamation mark, both, or neither.
titles = df['标题'].astype(str)
has_query = titles.str.contains("？", regex=False)
has_exclamation = titles.str.contains("！", regex=False)

query_count = int(has_query.sum())
exclamation_count = int(has_exclamation.sum())
# Titles containing BOTH marks anywhere in the text. The original matched only
# the adjacent sequences "？！" / "！？", so a title with the two marks apart was
# subtracted twice below and `other_video` came out too small.
query_exclamation_count = int((has_query & has_exclamation).sum())
total_video = len(df)
# Inclusion-exclusion: total minus the titles containing at least one mark.
other_video = total_video + query_exclamation_count - query_count - exclamation_count

In [14]:
# Slice labels and their counts for the punctuation pie chart (same order in
# both lists).
table_list = [
    '含问号标题',
    '含感叹号标题',
    '两者都不含的标题',
]
data_list = [
    query_count,
    exclamation_count,
    other_video,
]

In [15]:
# c = (
#     Pie()
#     .add(
#         "",
#         [list(z) for z in zip(table_list,data_list)],
#         radius=["40%", "75%"],
#     )
#     .set_global_opts(
#         title_opts=opts.TitleOpts(title="各大分区播放量均值"),
#         legend_opts=opts.LegendOpts(orient="vertical", pos_top="6%", pos_left="2%"),
#     )
#     .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
# )
# c.render_notebook()

In [16]:
# Donut chart of the punctuation counts. Each label is a small rich-text box:
# series name on a grey header, a divider line, then "name: value  percent".
label_rich_styles = {
    "a": {"color": "#999", "lineHeight": 22, "align": "center"},
    "abg": {
        "backgroundColor": "#e3e3e3",
        "width": "100%",
        "align": "right",
        "height": 22,
        "borderRadius": [4, 4, 0, 0],
    },
    "hr": {
        "borderColor": "#aaa",
        "width": "100%",
        "borderWidth": 0.5,
        "height": 0,
    },
    "b": {"fontSize": 16, "lineHeight": 33},
    "per": {
        "color": "#eee",
        "backgroundColor": "#334455",
        "padding": [2, 4],
        "borderRadius": 2,
    },
}
boxed_label_opts = opts.LabelOpts(
    position="outside",
    formatter="{a|{a}}{abg|}\n{hr|}\n {b|{b}: }{c}  {per|{d}%}  ",
    background_color="#eee",
    border_color="#aaa",
    border_width=1,
    border_radius=4,
    rich=label_rich_styles,
)
punctuation_slices = [list(pair) for pair in zip(table_list, data_list)]

c = Pie()
c.add("", punctuation_slices, radius=["40%", "55%"], label_opts=boxed_label_opts)
c.set_global_opts(title_opts=opts.TitleOpts(title="含特殊符号标题分析"))
c.render_notebook()

## 结论

从标题中可以简单地得到一些信息：人们通常喜欢用带有“什么”等疑问词、或表示惊讶的句子来当标题，这样似乎更能吸引人。

# 各大分区播放量分析

In [17]:
# Remove the rows of the aggregate 'all' partition so that only real
# partitions remain, then renumber the rows.
all_partition_rows = df[df['分区'] == 'all'].index
df = df.drop(all_partition_rows).reset_index(drop=True)

In [18]:
# Videos of the 'animal' partition, most played first.
is_animal = df['分区'] == 'animal'
animal_df = df[is_animal].sort_values(by='播放', ascending=False)

In [19]:
# Mean play count of the 'animal' partition (shown as the cell output).
animal_view_mean = animal_df['播放'].mean()
animal_view_mean

640279.1279527559

In [20]:
# Group the videos by partition; the per-partition statistics are derived
# from this grouping.
partition_group_df = df.groupby(by='分区')

函数作用为将df中的数据统一保留两位小数

In [21]:
def data_normalization(a):
    """Round every value of ``a`` to two decimal places.

    Args:
        a: Array-like of numbers (anything ``np.asarray`` accepts).

    Returns:
        list: The rounded elements as NumPy scalars, in their original order.
    """
    return list(np.around(np.asarray(a), decimals=2))

把各大分区的一些播放量均值和最大值求出来

In [22]:
# Per-partition mean and maximum play counts, expressed in units of 10,000
# plays and rounded to two decimals for display.
plays_by_partition = partition_group_df['播放']
view_mean_list1 = list(plays_by_partition.mean().values / 10000)
view_max_list1 = list(plays_by_partition.max().values / 10000)
view_mean_list = data_normalization(view_mean_list1)
view_max_list = data_normalization(view_max_list1)

In [23]:
# Partition labels are taken from the same groupby that produced
# view_mean_list, so labels and values stay aligned even if the dataset gains
# or loses a partition. The original hard-coded 18 names in alphabetical
# order, which is the order a sorted groupby yields for those names.
partition_list = partition_group_df['播放'].mean().index.tolist()

# Bar chart: mean play count per partition, in units of 10,000 plays.
c = Bar(init_opts=opts.InitOpts(theme=ThemeType.CHALK, height='500px', width='1000px'))
c.add_xaxis(partition_list)
c.add_yaxis("各大分区播放量均值", view_mean_list)
c.set_global_opts(
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
    title_opts=opts.TitleOpts(title="B站播放量分布", subtitle="单位:万"),
    tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross'),
)
c.render_notebook()

绘制饼状图，分析各大分区均值情况

In [24]:
from pyecharts.charts import Pie
from pyecharts.faker import Faker

In [25]:
# c = (
#     Pie()
#     .add(
#         "",
#         [list(z) for z in zip(partition_list,view_mean_list )],
#         center=["50%", "56%"],
#     )
#     .set_global_opts(
#         title_opts=opts.TitleOpts(title="各大分区播放量均值"),
#         legend_opts=opts.LegendOpts(pos_top="5%", pos_left="1%"),
#     )
#     .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
# )
# c.render_notebook()

In [26]:
# Donut chart: share of each partition in the per-partition mean play counts.
mean_view_slices = [list(pair) for pair in zip(partition_list, view_mean_list)]

c = Pie()
c.add("", mean_view_slices, radius=["40%", "75%"])
c.set_global_opts(
    title_opts=opts.TitleOpts(title="各大分区播放量均值"),
    legend_opts=opts.LegendOpts(orient="vertical", pos_top="6%", pos_left="2%"),
)
# Label every slice as "name: value".
c.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
c.render_notebook()

In [27]:
# Bar chart: maximum play count per partition, in units of 10,000 plays.
# The x-axis reuses partition_list (defined with the mean chart) instead of
# repeating the 18-name literal, so both partition charts share one label list.
c = Bar(init_opts=opts.InitOpts(theme=ThemeType.CHALK, height='500px', width='1000px'))
c.add_xaxis(partition_list)
c.add_yaxis("各大分区最大播放量", view_max_list)
c.set_global_opts(
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
    title_opts=opts.TitleOpts(title="B站播放量分布", subtitle="单位:万"),
    tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross'),
)

c.render_notebook()

**各大分区播放量结论**

内容补充

# 播放量靠前的数据分析

In [28]:
# The 100 most played videos, most played first.
videos_by_views = df.sort_values(by='播放', ascending=False)
head_100_video_sort_by_view_df = videos_by_views.head(100)

In [29]:
# Re-order the top-100 videos by '时间' (ascending) so that time can serve as
# the x-axis and play count as the y-axis of the next chart.
top100_by_time = head_100_video_sort_by_view_df.sort_values(by='时间')
head_100_video_df = top100_by_time.head(100)

## top100播放量与发布时间的分析

In [30]:
# Chart-ready lists (rounded to two decimals) from the time-ordered top-100:
# raw '时间', plays divided by 10,000, likes and coins divided by 1,000.
top100_time = head_100_video_df['时间'].values
top100_views = head_100_video_df['播放'].values / 10000
top100_likes = head_100_video_df['点赞'].values / 1000
top100_coins = head_100_video_df['硬币'].values / 1000

head_100_video_time_list = data_normalization(list(top100_time))
head_100_video_view_list = data_normalization(list(top100_views))
head_100_video_like_list = data_normalization(list(top100_likes))
head_100_video_coin_list = data_normalization(list(top100_coins))

In [31]:
# Bar chart: play count (in units of 10,000) of every top-100 video, laid out
# along the x-axis in ascending '时间' order.
time_bar_fill = JsCode("""new echarts.graphic.LinearGradient(0, 0, 0, 1, 
                                             [{
                                                 offset: 0,
                                                 color: 'rgb(255,99,71)'
                                             }, {
                                                 offset: 1,
                                                 color: 'rgb(32,178,170)'
                                             }])""")
time_bar_split_line = opts.SplitLineOpts(
    is_show=True,
    linestyle_opts=opts.LineStyleOpts(type_='dash'),
)

b = Bar(init_opts=opts.InitOpts(theme=ThemeType.CHALK, height='500px', width='1000px'))
b.add_xaxis(head_100_video_time_list)
b.add_yaxis(
    "播放量/万",
    head_100_video_view_list,
    label_opts=opts.LabelOpts(is_show=False, position='top'),
    itemstyle_opts=opts.ItemStyleOpts(color=time_bar_fill),
)
b.set_global_opts(
    title_opts=opts.TitleOpts(title="时间与播放量关系"),
    toolbox_opts=opts.ToolboxOpts(),
    legend_opts=opts.LegendOpts(is_show=False),
    tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross'),
    xaxis_opts=opts.AxisOpts(
        name='发布时间',
        type_='category',
        axislabel_opts=opts.LabelOpts(rotate=45),
    ),
    yaxis_opts=opts.AxisOpts(name='', splitline_opts=time_bar_split_line),
)
b.render_notebook()

发布时间对视频播放量的影响不是很大，作为创作者应该在视频内容质量上下功夫

In [32]:
# Histogram of '时间': one bucket per unit interval [k, k+1) for k = 0..30,
# followed by one open-ended bucket for values >= 31.
time_values = df['时间'].values

# x-axis labels: '0-1', '1-2', ..., '30-31', '>=31'
xlable_time = [str(k) + '-' + str(k + 1) for k in range(0, 31)]
xlable_time.append('>=31')

# Number of videos falling into each bucket, in the same order as the labels.
video_number_list = [
    len(df[(time_values < k + 1) & (time_values >= k)])
    for k in range(0, 31)
]
video_number_list.append(len(df[time_values >= 31]))

In [33]:
# Bar chart: number of videos per '时间' bucket.
count_bar_fill = JsCode("""new echarts.graphic.LinearGradient(0, 0, 0, 1, 
                                             [{
                                                 offset: 0,
                                                 color: 'rgb(255,99,71)'
                                             }, {
                                                 offset: 1,
                                                 color: 'rgb(32,178,170)'
                                             }])""")
count_bar_split_line = opts.SplitLineOpts(
    is_show=True,
    linestyle_opts=opts.LineStyleOpts(type_='dash'),
)

b = Bar(init_opts=opts.InitOpts(theme=ThemeType.CHALK, height='500px', width='1000px'))
b.add_xaxis(xlable_time)
b.add_yaxis(
    "视频数量",
    video_number_list,
    label_opts=opts.LabelOpts(is_show=False, position='top'),
    itemstyle_opts=opts.ItemStyleOpts(color=count_bar_fill),
)
b.set_global_opts(
    title_opts=opts.TitleOpts(title="发布时间与上热门视频数量"),
    toolbox_opts=opts.ToolboxOpts(),
    legend_opts=opts.LegendOpts(is_show=False),
    tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross'),
    xaxis_opts=opts.AxisOpts(
        name='发布时间',
        type_='category',
        axislabel_opts=opts.LabelOpts(rotate=45),
    ),
    yaxis_opts=opts.AxisOpts(name='', splitline_opts=count_bar_split_line),
)
b.render_notebook()

发布越久的视频，再次上热门的概率就越小，除非那个视频真的很火；所以创作者只要观察视频发布后前一两天的热度，就可以大致知道视频的制作效果怎么样。

## top20视频中，播放量与各大因素的关系

In [34]:
# Switch head_100_video_df back to the play-count ordering (most played first).
# This rebinds the name to the same DataFrame object; no copy is made.
head_100_video_df = head_100_video_sort_by_view_df

In [35]:
# Chart-ready lists (rounded to two decimals) from the play-count-ordered
# top-100: '时间' divided by 24, plays and likes divided by 10,000, coins
# divided by 1,000.
# NOTE(review): '时间' is divided by 24 here but used undivided for the earlier
# time charts - confirm which unit is intended.
ranked_time = head_100_video_df['时间'].values / 24
ranked_views = head_100_video_df['播放'].values / 10000
ranked_likes = head_100_video_df['点赞'].values / 10000
ranked_coins = head_100_video_df['硬币'].values / 1000

head_100_video_time_list = data_normalization(list(ranked_time))
head_100_video_view_list = data_normalization(list(ranked_views))
head_100_video_like_list = data_normalization(list(ranked_likes))
head_100_video_coin_list = data_normalization(list(ranked_coins))

In [36]:
# Recompute the like-to-play ratio on the current (filtered) frame. Vectorised
# division replaces the element-wise map/lambda: it is faster, and a zero play
# count produces inf/NaN for that row instead of potentially aborting the cell
# with ZeroDivisionError.
df['点赞/播放比'] = df['点赞'] / df['播放']

In [37]:
# Combined chart for the 20 most played videos: plays and likes (both in units
# of 10,000) as stacked bars on the left axis, and the like-to-play ratio as a
# line on a second axis on the right.
top20_titles = head_100_video_df['标题'][:20].tolist()
# Only the first 20 ratios belong to the 20 categories. The original passed all
# 100 values (and a pandas Series as x data) and relied on them being truncated
# when paired with the 20 titles.
top20_like_ratio = head_100_video_df['点赞/播放比'].tolist()[:20]

bar = Bar(init_opts=opts.InitOpts(theme=ThemeType.CHALK, height='500px', width='1000px'))
bar.add_xaxis(top20_titles)
bar.add_yaxis(
    '播放量/万',
    y_axis=head_100_video_view_list[:20],
    yaxis_index=0,
    label_opts=opts.LabelOpts(is_show=False),
    stack='stack1',
    color="#d14a61",
)
bar.add_yaxis(
    '点赞/万',
    head_100_video_like_list[:20],
    yaxis_index=0,
    label_opts=opts.LabelOpts(is_show=False),
    stack='stack1',
    color="#5793f3",
)
# extend_axis takes the extra axis as `yaxis=`, not `yaxis_opts=`.
bar.extend_axis(
    yaxis=opts.AxisOpts(
        name='点赞/播放比',
        min_=-1,
        max_=0.5,
        position='right',
        axisline_opts=opts.AxisLineOpts(
            linestyle_opts=opts.LineStyleOpts(color="#675bba")
        ),
        # The plotted values are plain fractions (likes / plays), so the tick
        # labels no longer carry the misleading " %" suffix.
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
    )
)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='播放量Top20视频'),
    xaxis_opts=opts.AxisOpts(
        name='',
        type_='category',
        name_gap=35,
        axislabel_opts=opts.LabelOpts(interval=0, rotate=20),
    ),
    yaxis_opts=opts.AxisOpts(
        name='',
        splitline_opts=opts.SplitLineOpts(is_show=True),
        axislabel_opts=opts.LabelOpts(formatter="{value}万"),
    ),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
)

line = Line()
line.add_xaxis(list(top20_titles))
line.add_yaxis(
    '点赞/播放比',
    y_axis=top20_like_ratio,
    label_opts=opts.LabelOpts(is_show=False),
    symbol='emptyCircle',
    is_symbol_show=True,
    color="#675bba",
    yaxis_index=1,  # draw against the right-hand axis added above
)

bar.overlap(line).render_notebook()


In [38]:
# One funnel per video (plays, likes, coins, favourites, shares) for the 30
# most played videos, browsable through a timeline keyed by video title.
list_funnel = ['播放','点赞','硬币','收藏','分享']
most_played_30 = head_100_video_df.sort_values(by='播放', ascending=False).head(30)
list_tl = most_played_30['标题'].tolist()

tl = Timeline()
for video_title in list_tl:
    # Pull the funnel columns of the row(s) carrying this title as a flat list.
    title_mask = head_100_video_df['标题'] == video_title
    stage_values = np.array(head_100_video_df[title_mask][list_funnel]).flatten().tolist()
    stage_pairs = [list(pair) for pair in zip(list_funnel, stage_values)]

    funnel = Funnel(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    funnel.add("作品数据", stage_pairs, label_opts=opts.LabelOpts(position="inside"))
    funnel.set_global_opts(title_opts=opts.TitleOpts(title="播放量top30作品漏斗图"))
    tl.add(funnel, time_point=video_title)
tl.render_notebook()

In [39]:
def _add_gradient_area(chart, series_name, values, top_color, bottom_color):
    """Add one smoothed, stacked area series filled with a vertical gradient.

    Args:
        chart: The Line chart to extend.
        series_name: Name of the series (legend / tooltip).
        values: y values, one per x-axis category.
        top_color: CSS colour used at gradient offset 0.
        bottom_color: CSS colour used at gradient offset 1.
    """
    gradient = JsCode("""
                                new echarts.graphic.LinearGradient(
                                    0, 0, 0, 1,
                                    [{offset: 0, color: '%s'},
                                     {offset: 1, color: '%s'}],
                                    false)
                                """ % (top_color, bottom_color))
    chart.add_yaxis(
        series_name,
        values,
        stack='stack',
        is_smooth=True,
        is_symbol_show=False,
        # Zero-width outline: only the filled area is visible.
        linestyle_opts=opts.LineStyleOpts(width=0),
        areastyle_opts=opts.AreaStyleOpts(opacity=0.8, color=gradient),
        # Mark the minimum, maximum and average of the series.
        markpoint_opts=opts.MarkPointOpts(data=[
            opts.MarkPointItem(type_="min"),
            opts.MarkPointItem(type_="max"),
            opts.MarkPointItem(type_="average"),
        ]),
    )


# Stacked area chart for the 20 most played videos. The four series used to be
# four copy-pasted add_yaxis calls that differed only in the values below:
# (series name, source column, gradient colour at the top, colour at the bottom)
area_series = [
    ('点赞', '点赞', 'rgba(128, 255, 165)', 'rgba(1, 191, 236)'),
    ('投币', '硬币', 'rgba(255, 191, 0)', 'rgba(224, 62, 76)'),
    ('收藏', '收藏', 'rgba(255, 0, 135)', 'rgba(135, 0, 157)'),
    ('播放', '播放', 'rgba(255, 50, 120)', 'rgba(135, 0, 120)'),
]

line = Line()
line.add_xaxis(head_100_video_df['标题'].tolist()[:20])
for series_name, column, top_color, bottom_color in area_series:
    _add_gradient_area(
        line,
        series_name,
        head_100_video_df[column].tolist()[:20],
        top_color,
        bottom_color,
    )

line.set_global_opts(
    xaxis_opts=opts.AxisOpts(boundary_gap=False, is_show=False),
    yaxis_opts=opts.AxisOpts(
        is_show=False,
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
        splitline_opts=opts.SplitLineOpts(
            is_show=True,
            linestyle_opts=opts.LineStyleOpts(color='#E0E6F1'),
        ),
    ),
    tooltip_opts=opts.TooltipOpts(is_show=True, trigger='axis', axis_pointer_type='cross'),
    title_opts=opts.TitleOpts(title="top20视频的一键三连"),
    datazoom_opts=opts.DataZoomOpts(type_='inside'),
)

# Hide the per-point value labels on every series.
line.set_series_opts(opts.LabelOpts(is_show=False))
line.set_colors(colors=['#80FFA5', '#FF0087', '#FFBF00'])

grid = Grid(init_opts=opts.InitOpts(theme='white', width='1000px', height='600px'))
grid.add(line, grid_opts=opts.GridOpts(pos_left='3%', pos_right='4%', pos_bottom='3%'))
grid.render_notebook()

### 播放量靠前的视频中弹幕量与评论数的关系

In [40]:
# Overlay for the 30 most played videos: danmaku counts as bars on the left
# axis and comment counts as a smoothed line on an extra right-hand axis.
top30_titles = head_100_video_df['标题'].tolist()[:30]
faint_rule_color = "#ffffff1f"

# Tick-label formatters: both round the tick value down to an integer.
right_axis_label_js = JsCode(
                                                            """function (value)
                                                            {return Math.floor(value);}""")
left_axis_label_js = JsCode(
    """function (value) {return Math.floor(value);}""")

right_axis = opts.AxisOpts(
    type_="value",
    position="right",
    is_scale=True,
    axislabel_opts=opts.LabelOpts(margin=20, color="white", formatter=right_axis_label_js),
    axisline_opts=opts.AxisLineOpts(
        linestyle_opts=opts.LineStyleOpts(width=2, color="#fff")
    ),
    axistick_opts=opts.AxisTickOpts(
        is_show=True,
        length=15,
        linestyle_opts=opts.LineStyleOpts(color=faint_rule_color),
    ),
)
left_axis = opts.AxisOpts(
    type_="value",
    position="left",
    is_scale=True,
    axislabel_opts=opts.LabelOpts(margin=20, color="white", formatter=left_axis_label_js),
    axisline_opts=opts.AxisLineOpts(
        linestyle_opts=opts.LineStyleOpts(width=2, color="#fff")
    ),
    axistick_opts=opts.AxisTickOpts(
        is_show=True,
        length=15,
        linestyle_opts=opts.LineStyleOpts(color=faint_rule_color),
    ),
    splitline_opts=opts.SplitLineOpts(
        is_show=True,
        linestyle_opts=opts.LineStyleOpts(color=faint_rule_color),
    ),
)
category_axis = opts.AxisOpts(
    boundary_gap=False,
    is_show=False,
    axislabel_opts=opts.LabelOpts(margin=30, color="white"),
    axisline_opts=opts.AxisLineOpts(is_show=False),
    axistick_opts=opts.AxisTickOpts(
        is_show=True,
        length=10,
        linestyle_opts=opts.LineStyleOpts(color=faint_rule_color),
    ),
    splitline_opts=opts.SplitLineOpts(
        is_show=True,
        linestyle_opts=opts.LineStyleOpts(color=faint_rule_color),
    ),
)
chart_title = opts.TitleOpts(
    title="top20弹幕-评论量趋势图",
    pos_left="center",
    pos_top='1%',
    title_textstyle_opts=opts.TextStyleOpts(font_size=20, color='#00BFFF'),
)

bar = Bar(init_opts=opts.InitOpts(theme='dark', width='1000px', height='600px'))
bar.add_xaxis(top30_titles)
# Register the second (right-hand) y-axis before adding any series.
bar.extend_axis(yaxis=right_axis)
bar.add_yaxis(
    '弹幕量',
    head_100_video_df['弹幕'].tolist()[:30],
    yaxis_index=0,
    category_gap='30%',
    itemstyle_opts=opts.ItemStyleOpts(color='#66c18c', opacity=0.8),
    label_opts=opts.LabelOpts(is_show=False),
)
bar.set_global_opts(
    title_opts=chart_title,
    legend_opts=opts.LegendOpts(is_show=True, pos_top='6%'),
    xaxis_opts=category_axis,
    yaxis_opts=left_axis,
)

# Raw ECharts line-style options for the comment line (width, drop shadow, colour).
comment_line_style = {
    'normal': {
        'width': 3,
        'shadowColor': 'rgba(0, 0, 0, 0.5)',
        'shadowBlur': 5,
        'shadowOffsetY': 10,
        'shadowOffsetX': 10,
        'curve': 0.5,
        'color': 'red'
    }
}

line = Line(init_opts=opts.InitOpts(theme='light', width='1000px', height='600px'))
line.add_xaxis(list(top30_titles))
# yaxis_index=1 attaches the line to the right-hand axis registered above.
line.add_yaxis(
    '评论数',
    head_100_video_df['评论'][:30].tolist(),
    yaxis_index=1,
    is_smooth=True,
    symbol_size=8,
    color='red',
    z_level=1,
    label_opts=opts.LabelOpts(is_show=False),
    itemstyle_opts=opts.ItemStyleOpts(color='white'),
    linestyle_opts=comment_line_style,
)

bar.overlap(line)
bar.render_notebook()

### 热门视频的时长

In [41]:
# Load the dataset that carries video durations and de-duplicate it by title,
# keeping the last occurrence of each title.
df_duration = pd.read_csv('data/bilibili_data_analysis_duration.csv')
df_duration = df_duration.drop_duplicates(subset=['标题'], keep='last')
# Outlier handling: discard rows with '时间' above 100 or '时长' above 10000.
# Negated masks keep rows where the value is missing, exactly as before.
# NOTE(review): the original cell also evaluated
# `df_duration[df_duration.isnull().values==True]` mid-cell; its result was
# discarded, so it is removed. Rows with missing values are still NOT dropped.
df_duration = df_duration[~(df_duration['时间'] > 100)]
df_duration = df_duration[~(df_duration['时长'] > 10000)]
# Renumber the rows 0..n-1 after filtering.
df_duration = df_duration.reset_index(drop=True)

In [42]:
# Divide '时长' by 60 (presumably seconds -> minutes; the bucket labels used
# for this column later on are in '分钟').
# NOTE: the column is overwritten in place, so re-running this cell divides
# the values again.
df_duration['时长'] = df_duration['时长'] / 60

In [43]:
# Summary statistics of the '时长' column (shown as the cell output).
duration_summary = df_duration['时长'].describe()
duration_summary

count    2505.000000
mean        5.655210
std         9.864127
min         0.116667
25%         1.433333
50%         2.816667
75%         6.400000
max       154.033333
Name: 时长, dtype: float64

In [44]:
# Histogram of '时长': one bucket per unit interval [k, k+1) for k = 0..30,
# followed by one open-ended bucket for values >= 31.
duration_values = df_duration['时长'].values

# x-axis labels: '0-1分钟', '1-2分钟', ..., '30-31分钟', '>=31'
xlable_time = [str(k) + '-' + str(k + 1) + '分钟' for k in range(0, 31)]
xlable_time.append('>=31')

# Number of videos falling into each bucket, in the same order as the labels.
video_number_list = [
    len(df_duration[(duration_values < k + 1) & (duration_values >= k)])
    for k in range(0, 31)
]
video_number_list.append(len(df_duration[duration_values >= 31]))

In [45]:
# Bar chart: number of videos per '时长' bucket.
duration_bar_fill = JsCode("""new echarts.graphic.LinearGradient(0, 0, 0, 1, 
                                             [{
                                                 offset: 0,
                                                 color: 'rgb(255,99,71)'
                                             }, {
                                                 offset: 1,
                                                 color: 'rgb(32,178,170)'
                                             }])""")
duration_bar_split_line = opts.SplitLineOpts(
    is_show=True,
    linestyle_opts=opts.LineStyleOpts(type_='dash'),
)

b = Bar(init_opts=opts.InitOpts(theme=ThemeType.CHALK, height='500px', width='1000px'))
b.add_xaxis(xlable_time)
b.add_yaxis(
    "视频数量",
    video_number_list,
    label_opts=opts.LabelOpts(is_show=False, position='top'),
    itemstyle_opts=opts.ItemStyleOpts(color=duration_bar_fill),
)
b.set_global_opts(
    title_opts=opts.TitleOpts(title="视频时长分布"),
    toolbox_opts=opts.ToolboxOpts(),
    legend_opts=opts.LegendOpts(is_show=False),
    tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross'),
    xaxis_opts=opts.AxisOpts(
        name='视频时长',
        type_='category',
        axislabel_opts=opts.LabelOpts(rotate=45),
    ),
    yaxis_opts=opts.AxisOpts(name='', splitline_opts=duration_bar_split_line),
)
b.render_notebook()

## 抓词分析

In [46]:
# Keyword to look for in video titles; the search cell treats it as a regex.
pattern = "中国"

In [47]:
# Ten most played videos whose title matches `pattern` (regex search), showing
# author, title, play count and tags.
title_matches = df['标题'].astype(str).str.contains(pattern, regex=True)
shown_columns = ['作者', '标题', '播放', '标签']
df.loc[title_matches, shown_columns].sort_values(by='播放', ascending=False).head(10)

Unnamed: 0,作者,标题,播放,标签
888,陈花旦,外国消防员：早知道中国消防员这样，我就不来了,7846351,生活 日常
3442,新华社,久等了！B站首个8K视频来了，追寻最美中国星,3966424,科技 数码
726,恋爱像奶茶一样美味,轮椅冰壶世锦赛，中国队最后一投完成不可能完成的任务！逆转瑞典获得冠军！太不容易了！,3783542,运动 竞技体育
5684,一目十影,可能是中国最“干净”的电影了，当你看懂了，一切的苦难都会过去！《城南旧事》,3661372,影评 电影解说
325,马蛟龙Long,让这抹中国红，飞舞在伦敦塔桥。,3037974,舞蹈 舞蹈综合
7116,西柚剧好吃,日本人竟拿中国小孩做实验，吃了糖就会变傻，只有女孩一直在装傻！,2821698,影视 影视杂谈
3180,马蛟龙Long,让中国水袖飞舞在英国海德公园。,2670815,舞蹈 舞蹈综合
1496,青木的中国日记,我叫青木来自东京，来到中国之后发现太多不一样的地方了，就让我用视频来记录我在中国的生活吧,2570088,生活 日常
3714,李格法国UP主,我拿到了中国绿卡!,2199567,生活 日常
5174,大漠叔叔,中国人不骗中国人，边境发财日入过万,2133652,警察 搞笑
