In [7]:
%matplotlib inline
import re
from datetime import timedelta
from math import isnan
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import scikits.bootstrap as boot
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import mannwhitneyu
from textblob import TextBlob
from wordcloud import WordCloud


# view all columns (None removes the limit on the number of displayed columns)
pd.set_option('display.max_columns', None)
# matplotlib settings for the figures produced in this notebook
plt.rcParams['ps.useafm'] = True
# NOTE(review): presumably the 'Spica Neue' font has to be installed locally,
# otherwise matplotlib falls back to another sans-serif font -- confirm.
plt.rcParams['font.sans-serif'] = 'Spica Neue'
#plt.rcParams['font.sans-serif'] = 'IPAexGothic'

In [None]:
####################
# network analysis #
####################

In [None]:
# build the paths of the daily pickle files
def dates2fpaths(pkl_dir, start_date_str, end_dates_str):
    """Return one ``YYYYMMDD.pkl.xz`` path per day of an inclusive date range.

    Parameters
    ----------
    pkl_dir : pathlib.Path
        Directory that holds the daily pickle files (only ``joinpath`` is used).
    start_date_str, end_dates_str : str or datetime-like
        First and last day of the range; any time-zone information is dropped.

    Returns
    -------
    list
        Paths ``pkl_dir/YYYYMMDD.pkl.xz`` in chronological order. The list is
        empty when the end date lies before the start date. The files are not
        checked for existence.
    """
    start_date = pd.to_datetime(start_date_str).tz_localize(None)
    end_date = pd.to_datetime(end_dates_str).tz_localize(None)
    # pd.date_range yields start, start + 1 day, ... up to and including the
    # end date, so no separate timedelta arithmetic is needed.
    days = pd.date_range(start_date, end_date, freq='D')
    return [pkl_dir.joinpath(day.strftime('%Y%m%d') + '.pkl.xz') for day in days]

# capture statistic information of vaccine texts
def dataset_summary_vaccine(pkl_dir, fpaths):
    """Summarise the English vaccine-related texts stored in daily pickle files.

    Parameters
    ----------
    pkl_dir : path-like
        Not used by the computation; kept so existing calls keep working.
    fpaths : iterable of pathlib.Path
        Daily pickle files. The first 8 characters of each file name are parsed
        as the date and printed as a progress message.

    Returns
    -------
    tuple
        ``(tweet_num, retweet_num, reply_num, favorite_count, user_name,
        user_hashtag)``: the number of texts that are neither retweets nor
        replies, the number of retweets, the number of replies, the summed
        ``favorite_count``, the screen name of every matching row (with
        repetitions) and every ``\\w+`` token found in the ``hashtags`` column.

    Notes
    -----
    A file that cannot be read or processed is reported and skipped as a
    whole, so it never contributes partial counts.
    """
    keyword_pattern = 'vaccines|vaccine|vaccinated|vaccination|vaccineoutside|vaccinate|vaccinologist|vaccinert|coronavirusvaccine'
    # same tokens as nltk.RegexpTokenizer(r"\w+"), compiled once
    hashtag_token = re.compile(r"\w+")

    # initialize variables
    tweet_num = 0
    retweet_num = 0
    reply_num = 0
    favorite_count = 0
    user_name = []
    user_hashtag = []

    for f in fpaths:
        try:
            dt = pd.to_datetime(str(f.name)[0:8])
            print(dt)
            # NOTE(review): unpickling can execute arbitrary code; only load
            # pickle files from a trusted source.
            df = pd.read_pickle(f)

            # English only
            df = df[(df['lang'] == 'en')]
            df = df[df['text'].str.contains(keyword_pattern, na=False)]

            is_retweet = df['retweeted_status_created_at'].notnull()
            is_reply = df['in_reply_to_user_id_str'].notnull()

            day_retweet = int(is_retweet.sum())
            day_reply = int(is_reply.sum())
            # count rows that are neither retweet nor reply, so a row flagged
            # as both is not subtracted twice
            day_tweet = int((~is_retweet & ~is_reply).sum())

            # Column-wise access: the filtered frame no longer has the labels
            # 0..n-1, so looking rows up by position through df[col][i] fails.
            day_favorite = df['favorite_count'].sum()
            day_users = df['user_screen_name'].tolist()
            day_hashtags = [token
                            for tags in df['hashtags'].dropna()
                            for token in hashtag_token.findall(str(tags))]
        except Exception as exc:
            # best effort: report the file and continue with the next one
            print('error: ', f, repr(exc))
            continue

        # update the totals only after the whole file was processed
        retweet_num += day_retweet
        reply_num += day_reply
        tweet_num += day_tweet
        favorite_count += day_favorite
        user_name.extend(day_users)
        user_hashtag.extend(day_hashtags)

    return tweet_num, retweet_num, reply_num, favorite_count, user_name, user_hashtag

In [None]:
# retweet network
# construct retweet network
def retweet_netwroks(pkl_dir, fpaths, filename):
    """Build the weighted, directed retweet network and save it as GEXF.

    Every English retweet whose text contains a vaccine keyword adds one unit
    of weight to the edge ``retweeting user -> retweeted user``.

    Parameters
    ----------
    pkl_dir : path-like
        Not used by the computation; kept so existing calls keep working.
    fpaths : iterable of pathlib.Path
        Daily pickle files. The first 8 characters of each file name are parsed
        as the date and printed as a progress message.
    filename : str or path-like
        Destination of the GEXF file.

    Notes
    -----
    A file that cannot be read or processed is reported and skipped.
    """
    keyword_pattern = 'vaccines|vaccine|vaccinated|vaccination|vaccineoutside|vaccinate|vaccinologist|vaccinert|coronavirusvaccine'

    # (source, target) -> number of retweets over all files. A DiGraph holds a
    # single edge per pair, so the daily counts have to be summed here;
    # adding the same pair again would only overwrite its weight.
    edge_weights = {}
    for f in fpaths:
        try:
            dt = pd.to_datetime(str(f.name)[0:8])
            print(dt)
            # NOTE(review): unpickling can execute arbitrary code; only load
            # pickle files from a trusted source.
            df = pd.read_pickle(f)

            # English only
            df = df[(df['lang'] == 'en')]

            # retweets only
            df = df[df['retweeted_status_created_at'].notnull()]

            # keep texts containing a vaccine keyword
            df = df[df['text'].str.contains(keyword_pattern, na=False)]

            # retweet edges
            # from (source) and to (target or user who post the original)
            day_counts = df.groupby(by=['user_screen_name', 'retweeted_status_screen_name']).size().to_dict()
        except Exception as exc:
            # best effort: report the file and continue with the next one
            print('error: ', f, repr(exc))
            continue

        for edge, count in day_counts.items():
            edge_weights[edge] = edge_weights.get(edge, 0) + int(count)

    e = [[source, target, {'weight': weight}]
         for (source, target), weight in edge_weights.items()]

    D = nx.from_edgelist(e, create_using=nx.DiGraph)
    nx.write_gexf(D, filename)

In [None]:
# Use the Louvain algorithm in Gephi to find the Pro and Anti clusters.
# Export the node table of the clustered network from Gephi as "RTnet_vacc_2021_06_30>20.csv";
# in that file, modularity_class '4' represents Pro and '0' represents Anti.

In [None]:
# Network measures (basic information of the network, network density, clustering coefficient and average distance)
def retweet_network_information(name, min_degree=20, nodes_csv='RTnet_vacc_2021_06_30>20.csv', filtered_gexf=None):
    """Print network measures for a retweet network and its Pro / Anti clusters.

    Parameters
    ----------
    name : str or path-like
        GEXF file of the retweet network.
    min_degree : int, optional
        Nodes whose degree in the loaded network is below this value are
        dropped before the measures are computed (default 20).
    nodes_csv : str or path-like, optional
        Node table with the columns ``Id`` and ``modularity_class``; class 4 is
        read as Pro and class 0 as Anti.
    filtered_gexf : str or path-like, optional
        When given, the degree-filtered network is also written to this file.

    Notes
    -----
    The average distance is computed on the undirected version of each
    cluster. If it is undefined (e.g. the cluster is empty or not connected),
    the reason is printed instead of a number.
    """
    def describe(graph):
        # nx.info() does not exist in NetworkX 3; fall back to a short summary.
        info = getattr(nx, 'info', None)
        if info is not None:
            return info(graph)
        return '{} with {} nodes and {} edges'.format(
            type(graph).__name__, graph.number_of_nodes(), graph.number_of_edges())

    def average_distance(graph):
        try:
            return nx.average_shortest_path_length(nx.to_undirected(graph))
        except nx.NetworkXException as exc:
            return 'undefined ({})'.format(exc)

    # RT network
    G = nx.read_gexf(name)
    print('All nodes:', describe(G))
    print('\n')

    # RT network >= min_degree
    # Decide on the degrees of the loaded network first and remove afterwards:
    # removing nodes while checking degrees lowers the degree of the remaining
    # neighbours and makes the outcome depend on the node order.
    low_degree_nodes = [node for node, node_degree in G.degree() if node_degree < min_degree]
    G.remove_nodes_from(low_degree_nodes)
    print('node degree>={}:'.format(min_degree), describe(G))

    # save gexf
    if filtered_gexf is not None:
        nx.write_gexf(G, filtered_gexf)

    # nodes.csv from Gephi
    nodes = pd.read_csv(nodes_csv)

    # read nodes for Anti and Pro
    # 4 represent for Pro, 0 represent for Anti
    pro_nodes = list(nodes[nodes['modularity_class'] == 4]['Id'])
    anti_nodes = list(nodes[nodes['modularity_class'] == 0]['Id'])

    print('\n')
    print('number of Pro: ', len(pro_nodes))
    print('number of Anti: ', len(anti_nodes))
    print('\n')

    # filter nodes for pro
    G_pro = G.subgraph(pro_nodes)
    print(describe(G_pro))

    print('\n')
    # filter nodes for anti
    G_anti = G.subgraph(anti_nodes)
    print(describe(G_anti))

    print('\n')
    # network density
    for label, graph in (('G_network_density: ', G),
                         ('G_pro_network_density: ', G_pro),
                         ('G_anti_network_density: ', G_anti)):
        print(label, nx.density(graph))
    print('\n')

    # average local clustering coefficient
    clustering = [
        ('average local clustering coefficient of entirety: ', nx.average_clustering(G)),
        ('average local clustering coefficient of Pro: ', nx.average_clustering(G_pro)),
        ('average local clustering coefficient of Anti: ', nx.average_clustering(G_anti)),
    ]
    for label, value in clustering:
        print(label, value)
    print('\n')

    # average distance
    G_pro_average_distance = average_distance(G_pro)
    G_anti_average_distance = average_distance(G_anti)

    print('Average distance of Pro: ', G_pro_average_distance)
    print('Average distance of Anti: ', G_anti_average_distance)

In [None]:
# Find Top-10 users
def find_top_users(name, nodes_csv='RTnet_vacc_2021_06_30>20.csv', clusters=(4, 0), top_n=10):
    """Print the highest-degree users of each cluster of a retweet network.

    Parameters
    ----------
    name : str or path-like
        GEXF file of the retweet network.
    nodes_csv : str or path-like, optional
        Node table with the columns ``Id`` and ``modularity_class``. It is read
        here so the function does not rely on a variable left over from
        another cell.
    clusters : iterable, optional
        ``modularity_class`` values to report, in this order (default: 4, 0).
    top_n : int, optional
        Number of users printed per cluster (default 10).
    """
    G = nx.read_gexf(name)
    nodes = pd.read_csv(nodes_csv)

    # (node, degree) pairs, highest degree first
    degree = sorted(G.degree(), key=lambda x: x[1], reverse=True)

    def find_top_users_for_cluster(number):
        # a set makes the membership test below constant time
        members = set(nodes[nodes['modularity_class'] == number]['Id'])

        # names and degrees of all nodes labeled with 'number'; `degree` is
        # already sorted, so the first entries are the top users
        cluster_degree = [pair for pair in degree if pair[0] in members]

        # print the top users
        print("Top {} of {}:".format(top_n, number))
        print(cluster_degree[:top_n])

    # print top users of each cluster
    for cluster in clusters:
        find_top_users_for_cluster(cluster)
        print('---------------------')

In [5]:
def Distribution_of_the_indegree(name, nodes_csv='RTnet_vacc_2021_06_30>20.csv'):
    """Plot the in-degree distribution of the Pro and Anti clusters (log-log).

    Parameters
    ----------
    name : str or path-like
        GEXF file of the retweet network.
    nodes_csv : str or path-like, optional
        Node table with the columns ``Id`` and ``modularity_class``; class 4 is
        read as Pro and class 0 as Anti.

    Notes
    -----
    The in-degree of each node is used when the network is directed.
    ``nx.degree_histogram`` would count in- plus out-degree, which does not
    match the axis labels. For an undirected network the plain degree is used.
    """
    def degree_fractions(graph):
        degree_view = graph.in_degree() if graph.is_directed() else graph.degree()
        degrees = [node_degree for _, node_degree in degree_view]
        if not degrees:
            # empty cluster: nothing to plot
            return np.array([])
        counts = np.bincount(degrees)
        return counts / counts.sum()

    # RT network
    G = nx.read_gexf(name)

    # nodes.csv from Gephi
    nodes = pd.read_csv(nodes_csv)

    # read nodes for Anti and Pro
    # 4 represent for Pro, 0 represent for Anti
    pro_nodes = list(nodes[nodes['modularity_class'] == 4]['Id'])
    anti_nodes = list(nodes[nodes['modularity_class'] == 0]['Id'])

    G_pro = G.subgraph(pro_nodes)
    G_anti = G.subgraph(anti_nodes)

    fig, axes = plt.subplots(2, 1)
    panels = ((axes[0], G_pro, 'In Degree of G_pro'),
              (axes[1], G_anti, 'In Degree of G_anti'))
    for ax, graph, xlabel in panels:
        # index k of the array is the fraction of nodes with in-degree k
        ax.loglog(degree_fractions(graph))
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Fraction of Nodes')

    # keep the x label of the upper panel from overlapping the lower panel
    fig.tight_layout()
    plt.show()

In [None]:
# Daily pickle files of the whole study period
# (alternative: fpaths = sorted(pkl_dir.glob('*.pkl.xz')) to take every file in the folder)
pkl_dir = Path('CoronaTweets_pkl_20210630')
fpaths = dates2fpaths(pkl_dir, '2020-02-20', '2021-06-30')

# Summary statistics of the vaccine-related texts
(
    tweet_num_vaccine,
    retweet_num_vaccine,
    reply_num_vaccine,
    favorite_count_vaccine,
    user_name_vaccine,
    user_hashtag_vaccine,
) = dataset_summary_vaccine(pkl_dir, fpaths)

# The four counts, then the number of distinct accounts and distinct hashtag tokens
print(
    tweet_num_vaccine,
    retweet_num_vaccine,
    reply_num_vaccine,
    favorite_count_vaccine,
    len(set(user_name_vaccine)),
    len(set(user_hashtag_vaccine)),
)

In [None]:
# save retweet networks

pkl_dir = Path('CoronaTweets_pkl_20210630')

# (first day, last day, output file) of each network: the whole period, then
# the periods before and after the start of mass vaccination (2020-12-01).
# The whole-period network is saved under the name the analysis cells read
# ('RTnet_vacc_2021_06_30.gexf'); the '>20' suffix is reserved for the
# degree-filtered network.
network_periods = [
    ('2020-02-20', '2021-06-30', 'RTnet_vacc_2021_06_30.gexf'),
    ('2020-02-20', '2020-12-01', 'RTnet_vacc_2020_02_20 - 2020_12_01.gexf'),
    ('2020-12-02', '2021-06-30', 'RTnet_vacc_2020_12_01 - 2021_06_30.gexf'),
]

# construct and save one retweet network per period
for period_start, period_end, gexf_name in network_periods:
    # alternative: fpaths = sorted(pkl_dir.glob('*.pkl.xz'))
    fpaths = dates2fpaths(pkl_dir, period_start, period_end)
    retweet_netwroks(pkl_dir, fpaths, gexf_name)

In [None]:
# Network measures and Top-10 users for each retweet network; 2020_12_01 is the
# time point taken as the start of mass vaccination.
# Each entry: (headline, network file for the measures, network file for the top users)
network_reports = [
    # 2020_02_20 To 2021_06_30
    ('Timeline: From 2020_02_20 To 2021_06_30',
     'RTnet_vacc_2021_06_30.gexf',
     'RTnet_vacc_2021_06_30>20.gexf'),
    # 2020_02_20 To 2020_12_01
    ('Timeline: From 2020_02_20 To 2020_12_01',
     'RTnet_vacc_2020_02_20 - 2020_12_01.gexf',
     'RTnet_vacc_2020_02_20 - 2020_12_01>20.gexf'),
    # 2020_12_01 To 2021_06_30
    ('Timeline: From 2020_12_01 To 2021_06_30',
     'RTnet_vacc_2020_12_01 - 2021_06_30.gexf',
     'RTnet_vacc_2020_12_01 - 2021_06_30>20.gexf'),
]

for headline, measures_gexf, top_users_gexf in network_reports:
    print(headline)
    retweet_network_information(measures_gexf)
    find_top_users(top_users_gexf)

In [None]:
# Distribution of the indegree
# Draws two log-log panels (Pro cluster and Anti cluster) for the network saved
# in 'RTnet_vacc_2021_06_30.gexf'.
Distribution_of_the_indegree('RTnet_vacc_2021_06_30.gexf')

In [None]:
#######################
# linguistic analysis #
#######################

In [None]:
# collect texts
def collect_texts(pkl_dir, fpaths, label, nodes_csv='RTnet_vacc_2021_06_30>20.csv'):
    """Collect the vaccine-related texts written by Pro and Anti accounts.

    Parameters
    ----------
    pkl_dir : path-like
        Not used by the computation; kept so existing calls keep working.
    fpaths : iterable of pathlib.Path
        Daily pickle files. The first 8 characters of each file name are parsed
        as the date and printed as a progress message.
    label : {'tweets', 'replies'}
        ``'tweets'`` keeps texts that are neither retweets nor replies,
        ``'replies'`` keeps replies.
    nodes_csv : str or path-like, optional
        Node table with the columns ``Id`` and ``modularity_class``; class 4 is
        read as Pro and class 0 as Anti.

    Returns
    -------
    (list, list)
        ``(df_pro, df_anti)``: one pandas Series per text with the entries
        ``created_at`` and ``text``, in file and row order. An account listed
        in both clusters is counted as Anti.

    Raises
    ------
    ValueError
        If ``label`` is neither ``'tweets'`` nor ``'replies'``.

    Notes
    -----
    A file that cannot be read or processed is reported and skipped.
    """
    if label not in ('tweets', 'replies'):
        raise ValueError("label must be 'tweets' or 'replies', got {!r}".format(label))

    keyword_pattern = 'vaccines|vaccine|vaccinated|vaccination|vaccineoutside|vaccinate|vaccinologist|vaccinert|coronavirusvaccine'

    # read the node table of anti and pro accounts once; sets make the
    # membership test cheap
    nodes = pd.read_csv(nodes_csv)
    nodes_of_pro = set(nodes[nodes['modularity_class'] == 4]['Id'])
    nodes_of_anti = set(nodes[nodes['modularity_class'] == 0]['Id'])

    df_pro = []
    df_anti = []
    for f in fpaths:
        try:
            dt = pd.to_datetime(str(f.name)[0:8])
            print(dt)
            # NOTE(review): unpickling can execute arbitrary code; only load
            # pickle files from a trusted source.
            df = pd.read_pickle(f)

            # English only
            df = df[(df['lang'] == 'en')]

            is_reply = df['in_reply_to_user_id_str'].notnull()
            if label == 'tweets':
                # tweets only: neither a retweet nor a reply
                df = df[df['retweeted_status_created_at'].isnull() & ~is_reply]
            else:
                # replies only
                df = df[is_reply]

            # find texts containing keywords
            df = df[df['text'].str.contains(keyword_pattern, na=False)]

            # sort texts for anti and pro (anti takes precedence)
            is_anti = df['user_screen_name'].isin(nodes_of_anti)
            is_pro = df['user_screen_name'].isin(nodes_of_pro) & ~is_anti

            # keep only time and text
            texts = df[['created_at', 'text']]
            day_anti = [row for _, row in texts[is_anti].iterrows()]
            day_pro = [row for _, row in texts[is_pro].iterrows()]
        except Exception as exc:
            # best effort: report the file and continue with the next one
            print('error: ', f, repr(exc))
            continue

        df_anti.extend(day_anti)
        df_pro.extend(day_pro)

    return df_pro, df_anti

In [None]:
pkl_dir = Path('CoronaTweets_pkl_20210630')
#fpaths = sorted(pkl_dir.glob('*.pkl.xz'))
fpaths = dates2fpaths(pkl_dir, '2020-02-20', '2021-06-30')
# collect tweets (collect_texts is the collector defined above; the `label`
# argument selects tweets or replies)
tweet_pro, tweet_anti = collect_texts(pkl_dir, fpaths, 'tweets')
# collect replies
reply_pro, reply_anti = collect_texts(pkl_dir, fpaths, 'replies')

# save texts
# NOTE(review): the files are written without an extension, while the LIWC
# result files read later are named after '*.csv' inputs -- confirm the
# intended file names.
tweet_pro = pd.DataFrame(tweet_pro)
tweet_pro.to_csv('tweet_pro')

tweet_anti = pd.DataFrame(tweet_anti)
tweet_anti.to_csv('tweet_anti')

reply_pro = pd.DataFrame(reply_pro)
reply_pro.to_csv('reply_pro')

reply_anti = pd.DataFrame(reply_anti)
reply_anti.to_csv('reply_anti')

# save tweets

In [None]:
# Then we use LIWC to analyze the texts of each group. LIWC gives every text a score for several categories (e.g. negative, positive, analytic).
# We use the texts' scores in each group to check for significant differences with an independent t-test.
# The scores of each group are stored in the files 'LIWC2015 Results (tweet_pro.csv).csv', 'LIWC2015 Results (tweet_anti.csv).csv',
# 'LIWC2015 Results (reply_pro.csv).csv', 'LIWC2015 Results (reply_anti.csv).csv'

In [None]:
# concatenate scores results for 'tweet'
tweet_pro = pd.read_csv('LIWC2015 Results (tweet_pro.csv).csv')
tweet_anti = pd.read_csv('LIWC2015 Results (tweet_anti.csv).csv')
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way (original row labels are kept).
tweet_vaccine = pd.concat([tweet_pro, tweet_anti]).dropna()

In [None]:
# concatenate scores results for 'replies'
reply_pro = pd.read_csv('LIWC2015 Results (reply_pro.csv).csv')
reply_anti = pd.read_csv('LIWC2015 Results (reply_anti.csv).csv')
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way (original row labels are kept).
reply_vaccine = pd.concat([reply_pro, reply_anti]).dropna()

In [None]:
def t_test(df1, df2, column, results=None):
    """Independent two-sample t-test of ``column`` between two data frames.

    Welch's test (unequal variances) is used unless both samples have exactly
    the same standard deviation; in that case the pooled-variance test is used
    and "Pooled" is printed. Both cases record a result, so the list of
    results always stays aligned with the list of tested columns.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        The two groups to compare.
    column : str
        Name of the column holding the scores.
    results : list, optional
        List that receives ``[mean, pvalue]``. When omitted, the pair is
        appended to the module-level list ``result`` if one exists, which is
        how the analysis cells of this notebook collect their results.

    Returns
    -------
    (float, float)
        ``mean`` is ``mean(df2[column]) - mean(df1[column])`` and ``pvalue`` is
        the two-sided p-value.
    """
    # local import: only `mannwhitneyu` is imported from scipy.stats at the top
    from scipy.stats import ttest_ind

    sample1 = df1[column]
    sample2 = df2[column]

    equal_var = bool(sample1.std() == sample2.std())
    if equal_var:
        print("Pooled")

    mean = sample2.mean() - sample1.mean()
    pvalue = ttest_ind(sample2, sample1, equal_var=equal_var).pvalue

    sink = results if results is not None else globals().get('result')
    if sink is not None:
        sink.append([mean, pvalue])
    return mean, pvalue

In [None]:
# Check whether the means of Anti and Pro differ significantly, first in the
# tweet texts and then in the reply texts.

# categories we want to compare within LIWC
categories = ['affect', 'posemo', 'negemo', 'Analytic', 'funct', 'pronoun']

# (pro scores, anti scores): tweets first, replies second
for pro_scores, anti_scores in ((tweet_pro, tweet_anti), (reply_pro, reply_anti)):
    # t_test() appends one [mean, p-value] entry per call to `result`
    result = []
    for category in categories:
        t_test(pro_scores, anti_scores, category)

    # print p-value for each category
    for position, category in enumerate(categories):
        print(category)
        print('p-value: ', result[position][1])

    # confidence intervals with bootstrap
    for category in categories:
        print(boot.ci(pro_scores[category]))
        print(boot.ci(anti_scores[category]))

In [None]:
# We also want to check whether negative emotion differs significantly between
# Pro and Anti, in tweet and in reply texts, before and after mass vaccination.

# Row offsets that split each LIWC table at 2020-12-01. They were found
# manually and are only valid for the exact result files loaded above.
TWEET_PRO_SPLIT = 77608
TWEET_ANTI_SPLIT = 25983
REPLY_PRO_SPLIT = 7945
REPLY_ANTI_SPLIT = 12888

tweet_pro_before = tweet_pro[0:TWEET_PRO_SPLIT]
tweet_pro_after = tweet_pro[TWEET_PRO_SPLIT:]

tweet_anti_before = tweet_anti[0:TWEET_ANTI_SPLIT]
tweet_anti_after = tweet_anti[TWEET_ANTI_SPLIT:]

reply_pro_before = reply_pro[0:REPLY_PRO_SPLIT]
reply_pro_after = reply_pro[REPLY_PRO_SPLIT:]

reply_anti_before = reply_anti[0:REPLY_ANTI_SPLIT]
reply_anti_after = reply_anti[REPLY_ANTI_SPLIT:]

# Compare the groups by negative emotion before and after 2020-12-01
negemo_comparisons = [
    ('tweet_pro_before VS tweet_anti_before', tweet_pro_before, tweet_anti_before),
    ('tweet_pro_after VS tweet_anti_after', tweet_pro_after, tweet_anti_after),
    ('reply_pro_before VS reply_anti_before', reply_pro_before, reply_anti_before),
    ('reply_pro_after VS reply_anti_after', reply_pro_after, reply_anti_after),
]

# start from an empty list so these results are not mixed with earlier ones
result = []
for description, pro_part, anti_part in negemo_comparisons:
    recorded = len(result)
    t_test(pro_part, anti_part, 'negemo')
    print(description)
    # t_test() reports through `result`; show the p-value it just recorded
    if len(result) > recorded:
        print('p-value: ', result[-1][1])
    else:
        print('p-value: not computed')

In [None]:
# Merge the tweet and reply texts of Pro and of Anti
# (DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way)
pro = pd.concat([tweet_pro, reply_pro])
anti = pd.concat([tweet_anti, reply_anti])

# compare moral perspectives in Pro and Anti
moral_name = ['HarmVirtue',
       'HarmVice',
       'FairnessVirtue',
       'FairnessVice',
       'IngroupVirtue',
       'IngroupVice',
       'AuthorityVirtue',
       'AuthorityVice',
       'PurityVirtue',
       'PurityVice',
       'MoralityGeneral']

# t-test (t_test() appends one [mean, p-value] entry per call to `result`)
result = []
for moral in moral_name:
    t_test(pro, anti, moral)

# print p-value for each category
for position, moral in enumerate(moral_name):
    print(moral)
    print('p-value: ', result[position][1])

In [None]:
# Wordcloud

In [8]:
from wordcloud import (WordCloud, get_single_color_func)
import matplotlib.pyplot as plt


class SimpleGroupedColorFunc(object):
    """Color function that gives each listed word the EXACT color of its group.

       Parameters
       ----------
       color_to_words : dict(str -> list(str))
         Maps a color to the words that should be drawn in that color. When a
         word is listed under several colors, the color listed last wins.

       default_color : str
         Color returned for every word that is not listed under any color.
    """

    def __init__(self, color_to_words, default_color):
        # invert the mapping: word -> color
        lookup = {}
        for color, words in color_to_words.items():
            for word in words:
                lookup[word] = color

        self.word_to_color = lookup
        self.default_color = default_color

    def __call__(self, word, **kwargs):
        # extra keyword arguments are accepted and ignored
        if word in self.word_to_color:
            return self.word_to_color[word]
        return self.default_color


class GroupedColorFunc(object):
    """Color function that gives listed words DIFFERENT SHADES of their group color.

       Every color is turned into a color function with
       wordcloud.get_single_color_func.

       Parameters
       ----------
       color_to_words : dict(str -> list(str))
         Maps a color to the words that should be drawn in shades of it.

       default_color : str
         Color used for every word that is not listed under any color.
    """

    def __init__(self, color_to_words, default_color):
        # one (color function, word set) pair per color, in dictionary order
        pairs = []
        for color, words in color_to_words.items():
            pairs.append((get_single_color_func(color), set(words)))
        self.color_func_to_words = pairs

        self.default_color_func = get_single_color_func(default_color)

    def get_color_func(self, word):
        """Returns the color function of the first group that contains the word"""
        for candidate, words in self.color_func_to_words:
            if word in words:
                return candidate
        # the word belongs to no group
        return self.default_color_func

    def __call__(self, word, **kwargs):
        chosen = self.get_color_func(word)
        return chosen(word, **kwargs)

In [10]:
def SpecificWordCloud(name, label):
    """Draw a word cloud of the LIWC dictionary words found in one group's texts.

    Parameters
    ----------
    name : sequence of str
        LIWC dictionary columns (categories) to use, at most six. They are
        colored, in order, green, blue, red, black, orange and purple.
    label : str
        ``'pro'`` reads the Pro result files; any other value reads the Anti
        result files.

    Raises
    ------
    ValueError
        If more than six categories are given (there are only six colors).

    Notes
    -----
    A dictionary word is counted once for every text that contains it as a
    case-sensitive substring, after removing the ``*`` wildcard from
    dictionary entries. Cells of the text column ``'C'`` that are not strings
    (e.g. missing values) are skipped.
    """
    palette = ['green', 'blue', 'red', 'black', 'orange', 'purple']
    name = list(name)
    if len(name) > len(palette):
        raise ValueError('at most {} categories can be colored, got {}'.format(len(palette), len(name)))

    # read text
    if label == 'pro':
        tweet_scores = pd.read_csv('LIWC2015 Results (tweet_pro_df.csv).csv', encoding = 'unicode_escape')
        reply_scores = pd.read_csv('LIWC2015 Results (reply_pro_df.csv).csv', encoding = 'unicode_escape')
    else:
        tweet_scores = pd.read_csv('LIWC2015 Results (tweet_anti_df.csv).csv', encoding = 'unicode_escape')
        reply_scores = pd.read_csv('LIWC2015 Results (reply_anti_df.csv).csv', encoding = 'unicode_escape')

    # read liwc dictionary
    liwc_dic = pd.read_csv('liwc dictionary.csv')

    # color -> words of that category, with missing entries dropped and every
    # '*' removed. Built directly from the columns, so no slice of the
    # dictionary frame is modified.
    color_to_words = {}
    for color, category in zip(palette, name):
        cleaned = [re.sub(r'\*', '', str(entry)) for entry in liwc_dic[category].dropna()]
        # an empty string would match every text without adding any word
        color_to_words[color] = [word for word in cleaned if word]

    vocabulary = [word for words in color_to_words.values() for word in words]

    # If a dictionary word occurs in a text, add it once for that text.
    # 'C' is the column of text
    matched_words = []
    for scores in (tweet_scores, reply_scores):
        for text in scores['C']:
            if not isinstance(text, str):
                continue
            matched_words.extend(word for word in vocabulary if word in text)

    cloud_text = ' '.join(matched_words)

    # Since the text is small collocations are turned off and text is lower-cased
    wc = WordCloud(collocations=False, background_color='white', min_word_length = 3, max_words = 50).generate(cloud_text.lower())

    # Words that are not in any of the color_to_words values
    # will be colored with a grey single color function
    default_color = 'grey'

    # Create a color function with multiple tones
    grouped_color_func = GroupedColorFunc(color_to_words, default_color)

    # Apply our color function
    wc.recolor(color_func=grouped_color_func)

    # Plot
    plt.figure(figsize=(8,6))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
def wordcloud(index1, index2, label):
    """Draw the word cloud for a range of LIWC dictionary columns.

    Parameters
    ----------
    index1, index2 : int
        Slice bounds into the columns of the module-level ``liwc_dic`` table;
        columns ``index1`` up to, but not including, ``index2`` are used.
    label : str
        Passed on to ``SpecificWordCloud`` (``'pro'`` selects the Pro texts).
    """
    # We can specify any pair of numbers correlated with columns
    columns_name = liwc_dic.columns
    name = list(columns_name[index1: index2])
    print(name)
    SpecificWordCloud(name, label)

In [None]:
# Load the LIWC dictionary table; wordcloud() reads this module-level `liwc_dic`.
liwc_dic = pd.read_csv('liwc dictionary.csv')
# Show the column names so that index ranges can be picked for wordcloud().
columns_name = liwc_dic.columns
print(columns_name)

In [None]:
# We can pass any pair of column indices to wordcloud() to choose the categories shown in the word cloud.
# For example, in LIWC, indices 0 to 5 are 'Insight', 'Cause', 'Discrep', 'Tentat', 'Certain', 'Differ' (Cognitive processes),
# so we use wordcloud(0, 6, 'pro') to see whether the Cognitive-processes subcategories differ in the word cloud