In [1]:
import pandas as pd
from pyecharts.charts import Line, Bar, Map, Timeline
from pyecharts import options as opts
from pyecharts.globals import WarningType
WarningType.ShowWarning = False

from pyecharts.globals import CurrentConfig, NotebookType
# If you're using Jupyter notebook, please uncomment next line and comment the second line
# CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

In [2]:
df = pd.read_csv('data/daily_confirmed.csv')
df.set_index(pd.to_datetime(df['日期']), inplace=True)

In [3]:
print('Data Amount:', df.shape[0])
print('Data Time Range:', df.index.min().date(), df.index.max().date())

Data Amount: 247
Data Time Range: 2020-01-20 2020-09-22


In [4]:
df.head(3)

Unnamed: 0_level_0,日期,湖北,香港,广东,浙江,河南,湖南,安徽,上海,黑龙江,...,海南,甘肃,吉林,贵州,宁夏,澳门,青海,西藏,累计确诊,每日新增
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-20,2020-1-20,270,0,14,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,290,290
2020-01-21,2020-1-21,375,0,23,5,1,1,0,9,0,...,0,0,0,0,0,0,0,0,435,145
2020-01-22,2020-1-22,444,1,32,27,5,9,4,16,1,...,4,0,0,1,1,1,0,0,588,153


In [5]:
def addDailyMap(df) -> Map:
    province = df.index.values[1:-2].tolist()
    values = df.values[1:-2].astype('int').tolist()
    date = df.values[0]
    m = (
        Map(
            init_opts=opts.InitOpts()
        )
        .add('Cumculative Confirmed', [list(z) for z in zip(province, values)], 'china', is_map_symbol_show=False)
        .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
        .set_global_opts(
            title_opts=opts.TitleOpts(title='Cumulative number of confirmed COVID-19 in China over provinces', subtitle=date, pos_left='center'),
            legend_opts=opts.LegendOpts(is_show=False),
            visualmap_opts=opts.VisualMapOpts(
                is_piecewise=True,
                pieces=[
                    {"max": 100, "color": '#ffeead', 'label': 'less than 100'},
                    {"min": 100, "max": 500, 'color': '#f29c2b', 'label': '100-500'},
                    {"min": 500, "max": 1000, 'color': '#d9534f', 'label': '500-1000'},
                    {"min": 1000, "max": 2000, "color":'#de4307', 'label': '1000-2000'},
                    {"min": 2000, 'color': '#dd0a35', 'label': 'above 2000'}
                ]
            )
        )
    )
    return m

In [6]:
def getMapTimeline(df):
    t = (
        Timeline(init_opts=opts.InitOpts())
        .add_schema(play_interval=500)
    )
    for date in df.index:
        t.add(addDailyMap(df.loc[date]), date.date())
    return t

In [7]:
t = getMapTimeline(df)

In [8]:
t.load_javascript()

<pyecharts.render.display.Javascript at 0x1b30af96e08>

In [9]:
t.render_notebook()

# Findings through Interactive Analysis：

## Confirmed cases

- In around **20th January**, the epidemics begam to spread in Guangdong, Hubei, Beijing, Shanghai and other places. At this time, the number of confirmed cases in Hubei has exceeded 200.
- Since then, the epidemic has quickly spread from Hubei to other areas. The number of confirmed cases in Hubei has exceeded 1,000 on **26th January**. In addition to Hubei, the number of confirmed cases in Zhejiang and Guangdong has exceed 1,000; Globally, countries such as the United States, Australia, France and Thailand also started to have confirmed cases.
- In around **2nd February**, the number of confirmed cases in Hubei has exceeded five digits, while the number for other provinces continued to increase; Globally, the  epidemics spread slowly in Europe, Southeast Asia, and the Americas.
- In mid-February, Tibet became the first province in China that had zero confirmed case; China's epidemic ushered in an inflection point, and the number of existing confirmed cases flattened, no longerincreased, and began to slowly decrease. At this moment, the number of confirmed cases was about 50,000, and most of them came from Hubei. 
- At the end of **February** and early **March** the number of domestic confirmed cases gradually decreased. At this time, large-scale infections began to occur abroad and spread to many other countries.At this time, the epidemics in Italy and Iran were more serious; the epidemic began to spread to Africa and South America.
- After **mid-March**, the domestic epidemic has been basically under control, and most provinces have returned to single digits or cleared to zero. The vast majority of existing confirmed cases was still in in Hubei; while most countries in the world have reported confirmed cases, and many countries have exceeded five digits. The number of confirmed cases is more serious in Europe, Iran, and the United States. 
- In early **April**, the number of domestic confirmed cases continued to decrease, but some provinces showed a slight counterattack; the foreign epidemic has spread to almost all countries in the world, and the number of confirmed cases in the United States has exceeded 200,000. The most serious area in the world; 
- From **mid-April**, due to imported reasons, a relatively serious epidemic counter-attack occurred in Heilongjiang and Northeast China. 
- The number of confirmed cases was close to 500, but it was gradually controlled in early **May**. At this time, the number of confirmed cases in the U.S. exceeded one million at 5.8. However, some countries with severe early epidemics have taken effective control measures to relieve the epidemic.

## Death cases

- In early **February**, countries around the world begain to have death cases.
- The number of domestic deaths tended to flatten in **mid-February**.
- In **mid-March**, deaths in various countries around the world began to appear or began to increase significantly. 
- In **May**, United States and Europe has the most number of death cases, and this number for many other countries also has far exceeded that of China.



## Motraility rate：

The mortality rate can also reflect the degree of control over the epidemic. The lower the mortality rate, the greater the probability that the patient will be treated. In areas with sufficient medical resources, the mortality rate can be reduced to about 1%. However, in areas with insufficient medical resources, the mortality rate can be as high as 10%.

#### Domestic mortality data analysis: 
- In late January, the mortality rate in China was more serious in Heilongjiang, Hubei, Hunan and Henan. But, at the end of January and early February, other provinces except Hubei showed a downward trend; 
- The number of confirmed cases peaked in mid-February, and the average domestic mortality rate was 2.5%. Provinces such as Hubei, Heilongjiang, Hainan, Taiwan have relatively high mortality rate. Since then, the mortality rate in Hubei has continued to rise, possibly due to insufficient medical resources to give patients effective treatment. It is also possible due to the accumlated confirmed patients experienced death. Also, Xinjiang and Heilongjiang have also increased in mortality rate. 
- After mid-March, with the domestic epidemic basically under control, the national average death rate was 4%, and the number for Hubei was around 4.7%; 
- In mid-April, the death case total was recalculated. Finally, the death rate in Hubei was 6.6%, and the domestic average death rate was 5.5%. 

#### Global mortality data analysis: 
- From late March, with the large-scale spread of the epidemic, the mortality rate of the epidemic in various countries has gradually increased; 
- it is worth noting that the statistics of many countries indicate that the mortality rate is a peak in the first few days of the epidemic, which may indicate the initial stage Failure to conduct good tests and discoveries on patients can only obtain information from severely ill patients with the new crown; although some underdeveloped countries such as Africa have less data on the epidemic report, the mortality rate is higher; 
- it may indicate a mild disease for the new crown Patients do not have good testing capabilities; countries with higher mortality rates are significantly concentrated in Europe, indicating a relative shortage of medical resources; Mexico’s mortality rate is also high; 

Overall, China gradually controlled the epidemic in March and set an excellent example for the world's anti-epidemic movement. At this time, the epidemic in developed countries such as Europe and the Americas started to get worse. It may be that developed countries have a larger population of people moving around the world. But it is more likely that developed countries do not have effective testing measures ,and therefore cannot report confirmed cases in time. 

In [10]:
def addDailyBar(df) -> Bar:
    province = df.index[1:-2].values.tolist()
    values = df.values[1:-2].astype('int').tolist()
    data = [list(z) for z in zip(province, values)]
    data.sort(key=lambda x:x[1])
    
    bar = (
        Bar(
            init_opts=opts.InitOpts()
        )
        .add_xaxis([t[0] for t in data])
        .add_yaxis('累计确诊', [t[1] for t in data])
        .reversal_axis()
        .set_series_opts(label_opts=opts.LabelOpts(position='right'))
        .set_global_opts(
            title_opts=opts.TitleOpts(title='Cumulative number of confirmed COVID-19 in China over provinces', subtitle=df.values[0], pos_left='15%'),
            legend_opts=opts.LegendOpts(pos_right='10%', pos_bottom='10%'),
            tooltip_opts=opts.TooltipOpts(trigger='axis')
        )
    )
    return bar

In [11]:
t = (
    Timeline(init_opts=opts.InitOpts(width='1100px', height='800px'))
    .add_schema(play_interval=500)
)
for date in df.index:
    bar = addDailyBar(df.loc[date])
    t.add(bar, date.date())

In [12]:
t.load_javascript()

<pyecharts.render.display.Javascript at 0x1b30b661848>

In [13]:
t.render_notebook()

In [14]:
t.render('render/Accumulated diagnosis bar.html')

'D:\\编程\\lab\\Data Mining\\render\\Accumulated diagnosis bar.html'

In [15]:
def aggByDate() -> Bar:
    barDf = df.loc[:, '每日新增']
    x_data = df.loc['2020-01-20':'2020-05-01'].index.date.tolist()
    
    bar = (
        Bar()
        .add_xaxis(x_data)
        .add_yaxis(
            series_name='Daily new',
            y_axis=barDf.values.tolist(),
            yaxis_index=0
        )
        .extend_axis(yaxis=opts.AxisOpts())
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(
            title_opts=opts.TitleOpts('', pos_left='35%'),
            tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross'),
            legend_opts=opts.LegendOpts(pos_right='10%', pos_bottom='15%')
        )
    )
    
    lineDf = df.loc[:, '累计确诊']
    line = (
        Line()
        .add_xaxis(x_data)
        .add_yaxis(
            series_name='Cumulative confirmed', 
            y_axis=lineDf.values.tolist(),
            label_opts=opts.LabelOpts(is_show=False),
            yaxis_index=1
        )
    )
    
    bar.overlap(line)
    
    return bar

In [16]:
bar = aggByDate()

In [17]:
bar.load_javascript()

<pyecharts.render.display.Javascript at 0x1b30b14e888>

In [18]:
bar.render_notebook()

In [19]:
df = pd.read_csv('data/china-social-news.csv')

In [20]:
df.set_index(pd.to_datetime(df['时间']), inplace=True)
df.dropna(inplace=True)
df.sort_index(inplace=True)

In [21]:
print('Data Amount：', df.shape[0])
print('Data Time Range:', df.index.min().date(), df.index.max().date())

Data Amount： 1334
Data Time Range: 2020-01-26 2020-05-19


In [22]:
df.head(3)

Unnamed: 0_level_0,标题,时间,URL,正文内容,来源
时间,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-26,中国生物多样性保护与绿色发展基金会：向湖北紧急支援4万个口罩,2020-01-26,http://www.chinanpo.gov.cn/1944/123496/nextind...,大年三十，中国绿发会紧急采购4万个口罩，支援武汉市和襄阳市，今天首批12000个口罩已到达湖...,中国生物多样性保护与绿色发展基金会
2020-01-26,中国旅行社协会,2020-01-26,http://www.chinanpo.gov.cn/1944/123533/nextind...,：处理行前解约应注意这几点针对新型冠状病毒肺炎疫情，积极响应国家相关部门的通知要求，发挥法律...,中国旅行社协会
2020-01-26,中国旅行社协会,2020-01-26,http://www.chinanpo.gov.cn/1944/123530/nextind...,：致境内外旅游供应商、旅游业者的一封公开信,中国旅行社协会


In [23]:
from jieba import analyse

def get_keywords(news_list):
    keywords = []
    for keyword in analyse.extract_tags(''.join(news_list), topK=100, withWeight=True):
        keywords.append(keyword)
    return keywords

In [24]:
from pyecharts import options as opts
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType

from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

def render_wordcloud(data, title):
    c = (
        WordCloud()
        .add(series_name="", data_pair=data, word_size_range=[20, 100], shape=SymbolType.ROUND_RECT)
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title=title, title_textstyle_opts=opts.TextStyleOpts(font_size=23)
            ),
            tooltip_opts=opts.TooltipOpts(is_show=True),
        )
    )
    return c

In [25]:
from pyecharts.charts import Timeline

t2 = (
    Timeline(init_opts=opts.InitOpts(width='1100px', height='800px'))
    .add_schema(play_interval=500)
)

In [26]:
def add_to_timeline(data, date):
    keywords = get_keywords(data)
    c = render_wordcloud(keywords, '')
    t2.add(c, date.date())

In [27]:
def get_timeline():
    result = []
    count = 0
    date = pd.to_datetime('2020-01-26')
    for date in pd.date_range(start='2020-01-26', end='2020-05-19'):
        try:
            count = count + df.loc[date, '正文内容'].shape[0]
            for item in df.loc[date, '正文内容'].tolist():
                result.append(item)
            if count >= 80:
                add_to_timeline(result, date)
                result.clear()
                count = 0
        except Exception:
            continue
    if count > 0:
        add_to_timeline(result, date)

In [28]:
get_timeline()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\49518\AppData\Local\Temp\jieba.cache
Loading model cost 0.625 seconds.
Prefix dict has been built successfully.


# Word Cloud

- We used crawler technology to crawl the news of 'the Epidemic Prevention and Control Zone of the Public Service Platform of Chinese Social Organizations', and obtained a total of 1,400+ news between 26th Januaray and 19th May. 

- Word cloud is a visual display of the "keywords" that appear frequently in the text. The word cloud image filters out a large amount of low-frequency and low-quality text information, so that the viewer only needs to scan the text at a glance. Grasp the main point of the text.

- We use 'jieba' to segment all the articles published on the platform, get the topic words (take the top 100), and render the word cloud map.

In [29]:
t2.load_javascript()

<pyecharts.render.display.Javascript at 0x1b317457ec8>

In [30]:
t2.render_notebook()

#### Word Cloud Analysis

- In the early stage of the epidemic, the subject headings were mainly "prevention and control", "organization", "work", and "society", corresponding to the main anti-epidemic forces for government mobilization and social organization donations
- Since the end of February, words such as "enterprise" and "resumption of work" have become more and more important, corresponding to the main demand for resumption of work. 

## TF-IDF

TF-IDF (Term Frequency-InversDocument Frequency) is a weighting technique commonly used in information processing and data mining. This technology uses a statistical method to calculate the importance of a word in the entire corpus based on the number of times the word appears in the text and the document frequency in the entire corpus. Its advantage is that it can filter out some common but irrelevant words, while retaining important words that affect the entire text.

#### TF - IDF = TF * IDF

- TF (Term Frequency) represents the frequency of a certain keyword in the entire article. 
- IDF (InversDocument Frequency) represents the frequency of calculating inverted text. Text frequency refers to the number of times a certain keyword appears in all articles of the entire corpus. Inverse document frequency is also called inverse document frequency. It is the inverse of document frequency and is mainly used to reduce the effect of some common words in all documents that have little effect on the document.


In [31]:
from pyecharts.charts import Bar

def get_tf_bar(x_data, y_data):
    b = (
        Bar(
            init_opts=opts.InitOpts()
        )
        .add_xaxis(x_data)
        .add_yaxis(
            series_name='',
            y_axis=y_data
        )
        .add_dataset()
        .reversal_axis()
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(
            title_opts=opts.TitleOpts(title='TF-IDF Ranking', pos_left='35%'),
            legend_opts=opts.LegendOpts(pos_right='10%', pos_bottom='10%'),
            tooltip_opts=opts.TooltipOpts(trigger='axis')
        )
    )
    return b

In [32]:
keywords = get_keywords(df.loc[:, '正文内容'])
keywords[:10]

[('疫情', 0.22706395346709843),
 ('防控', 0.17074977030094585),
 ('组织', 0.0675057784424657),
 ('捐赠', 0.062243513190542804),
 ('社会', 0.057935903583603246),
 ('工作', 0.057022622823337914),
 ('协会', 0.054245645974525636),
 ('防疫', 0.04727786567173266),
 ('冠状病毒', 0.04383926161901604),
 ('肺炎', 0.04318461971307456)]

In [33]:
x_data = []
y_data = []

for keyword in keywords[:20]:
    x_data.append(keyword[0])
    y_data.append(keyword[1])
    
x_data.reverse()
y_data.reverse()

In [34]:
b = get_tf_bar(x_data, y_data)

In [35]:
b.load_javascript()

<pyecharts.render.display.Javascript at 0x1b3176a4b08>

In [36]:
b.render_notebook()

#### TF-IDF Analaysis:

The output result is as shown in the figure above. It can be seen that terms such "epidemic", "organization", "donation", "society", "association", "pneumonia", "materials" are all high-frequency words, which are also topics of general concern to the public