<h1 style="color:red; text-align:center">METHODOLOGY - Code</h1>

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import re, string
from collections import defaultdict
import csv

from unidecode import unidecode
from difflib import SequenceMatcher

import networkx as nx
import community as community_louvain
import networkx.algorithms.community as nxcom
from graphviz import Digraph, Graph
from networkx.algorithms.community.quality import modularity

import matplotlib.pyplot as plt
import matplotlib.cm as cm
from wordcloud import WordCloud


## Functions declaration

In [None]:
# PREPARE THE DATASET

# List of frequent and insignificant hashtags to remove.
# filter_words() looks hashtags up in this list with an exact (case-sensitive) match.
stopwords=["meme","memes","memeita", "repost", "photooftheday", "picoftheday", "moodoftheday", "webstagram", "tagstagramers", "bestoftheday",
           "photodaily", "tagsta", "igers",  "ootd", "shot", "shots", "pics", "instangram", "youtuber", "youtube", "boysofinstagram", "daily",
           "goodlookingguys", "onthisday", "post", "tagga", "taggaituoiamici", "picby",
           "follow", "follower", "followers", "followme", "followher", "alwaysfollowback", "pleasefollow", "followall", "followforfollow",
           "followbackalways", "follow4followback", "teamfollowback", "pleasefollowme", "ifollow", "followbackteam", "follow4follow",
           "followback", "followerforfollower", "followerforfollowers", "followersforfollower", "followersforfollowers", "followtrick",
           "followalways",
           "like", "likes", "likeforlike", "likeback", "likeforfollow", "likeforfollows", "likeforfollower", "likeforfollowers",
           "likesforfollow", "likesforfollower", "likesforfollowers", "likeforlikeback", "likeforlikes", "likesforlikes", "tagsforlikes",
           "tagforlike", "tagsforlike", "like4like", "likes4likes", "like4likes", "likes4like", "unlimlikes",
           "instagram", "instagood", "instadaily", "instabeauty", "instacool", "instamood", "instalike", "instago", "instagramer", "instalove",
           "instagramers", "instalife", "instapic", "instapics", "instavideo", "instastories", "instaglam", "instasaturday", "instanlike",
           "instaphotography", "instadailypic", "instafamous", "instamoment", "instahome", "unforgettableinstagrammer", "instaphoto",
           "instagrammer", "instagrammers", "instatag", "shout", "shoutout", "shoutout4shoutout", "shoutout4shoutouts", "shoutouter", 
           "shoutouters", "shoutoutforshoutout", "shoutoutforshoutouts", "shoutouts", "shoutouts4shoutout", "shoutouts4shoutouts", 
           "shoutoutsforshoutout", "shoutoutsforshoutouts",
           "instalive", "toptags" , "ilove", "lovelife", 
           "instablog", "instaboy", "instagirl", "insta", "instaphotos", "instavideos", "instacolors", "instacolor",
           "cool", "goodmorning", "iphoneonly", "lifestyle", "tumblr", "simply", "thanks", "thankyou", "youandme", "meandyou", "loveu",
           "loveyou", "beauty", "mylove", "ioete", "forever", "cute", "beautifulday", "best", "mood", "smile",
           "video", "photo", "happy", "happiness", "happyness", "photography", "beautiful", "life", "love", "amore", "amazing", "grazie"]


def remove_posts_followers_zero(df_influencers):
    """Drop the posts whose "followers" value is 0.

    Input:  DataFrame of posts with a "followers" column
    Output: new DataFrame holding only the rows where "followers" != 0
    """
    has_followers = df_influencers["followers"] != 0
    return df_influencers.loc[has_followers]

def words_finder(df, field="description"):
    """Extract the hashtags from the text in the description of each post.

    Input:  DataFrame of all posts; `field` is the name of the column holding the text
    Output: the same DataFrame (the column is added in place) with the
            "list_hashtags" column in addition: for every post, the list of its
            hashtags without the leading "#", passed through unidecode_word()
            and filter_words()
    """
    # Raw string: "\w" inside a plain string literal is an invalid escape sequence
    hashtag_pattern = re.compile(r"#(\w+)")

    lists_hashtags = []
    for text in df[field].to_list():
        # str() lets non-string cells go through the regex without raising
        hashtags = [unidecode_word(tag) for tag in hashtag_pattern.findall(str(text))]
        lists_hashtags.append(filter_words(hashtags))

    df["list_hashtags"] = lists_hashtags

    return df


def unidecode_word(text):
    """Run a word through unidecode() and then through deEmojify().

    Input:  string of the word
    Output: string of the unidecoded word
    """
    unidecoded = unidecode(text)
    return deEmojify(unidecoded)


def deEmojify(text):
    """Delete from a word every character in the range U+263A..U+1F645.

    Input:  string of the word
    Output: string of the word without the characters in that range
    """
    # NOTE(review): the range is a single contiguous block, so it also removes the
    # non-emoji characters lying between its two endpoints and keeps the emojis
    # encoded above U+1F645
    return re.sub("[\u263a-\U0001f645]", "", text, flags=re.UNICODE)


def filter_words(lst):
    """Remove the stopwords (frequent hashtags) from a list of hashtags.

    Input:  list of hashtags
    Output: new list with the hashtags that are not in `stopwords`, in the same order
    """
    kept = []
    for hashtag in lst:
        if hashtag in stopwords:
            continue
        kept.append(hashtag)
    return kept

In [None]:
# FIND EXPECTED REACTIONS FOR EACH POST PER INFLUENCER

def expected_reactions_per_infl(df):
    """Find the expected reactions of each post, one influencer at a time.

    Input:  DataFrame of all the posts of all influencers (columns "accountID"
            and "date", plus the columns read by find_expected)
    Output: new DataFrame with the posts grouped by influencer and sorted by date
            within each influencer, with the "expected" column in addition.
            An input without any influencer gives an empty DataFrame that still
            has the "expected" column.
    """
    dfs = []

    # sort=False keeps the influencers in their order of first appearance
    for infl, posts in df.groupby("accountID", sort=False):

        print("Processing:", infl)
        dfs.append(find_expected(posts.sort_values(by=["date"])))

    # pd.concat() raises on an empty list
    if not dfs:
        return df.iloc[:0].assign(expected=pd.Series(dtype=float))

    return pd.concat(dfs)


def find_expected(tmp_df, window=100, min_history=10):
    """Find the expected reactions of each post of one influencer.

    The expected value of a post is the mean of the reactions of its previous
    posts (at most `window` of them) that lie strictly between the first and the
    third quartile of that history.

    Input:  DataFrame of the posts of an influencer, already sorted by date, with
            the "reactions" column; `window` is the maximum number of previous
            posts considered, `min_history` the minimum number required
    Output: the same DataFrame (the column is added in place) with the "expected"
            column in addition; posts with fewer than `min_history` previous posts
            keep an expected value of 0
    """
    reactions = tmp_df["reactions"].to_numpy(dtype=float)
    # Float from the start: the values written below are means
    expected_values = np.zeros(len(reactions), dtype=float)

    for i in range(len(reactions)):

        # The previous `window` posts or, if there are fewer, all of them
        history = reactions[max(0, i - window):i]

        # Not enough previous posts: leave the expected value at 0
        if len(history) < max(min_history, 1):
            continue

        Q1, Q3 = np.quantile(history, [0.25, 0.75])

        # Subset of the history strictly inside (Q1, Q3)
        core = history[(history > Q1) & (history < Q3)]

        # Heavily tied histories leave nothing strictly inside the quartiles:
        # fall back to the closed interval [Q1, Q3] instead of averaging nothing
        if core.size == 0:
            core = history[(history >= Q1) & (history <= Q3)]

        expected_values[i] = core.mean() if core.size else np.nan

    # Single positional assignment, independent of the values of "postID"
    tmp_df["expected"] = expected_values

    return tmp_df


def remove_posts_expected_zero(df_influencers):
    """Drop the posts whose expected reactions are equal to 0.

    Input:  DataFrame of posts with the "expected" column
    Output: new DataFrame holding only the rows where "expected" != 0
    """
    has_expected = df_influencers["expected"] != 0
    return df_influencers.loc[has_expected]

In [None]:
# FIND OUTLIERS FOR EACH INFLUENCER - ANOMALY DETECTION PHASE

def Boxplot_rule_per_infl(df):
    """Find the anomalies (notable posts) of every influencer through the Boxplot Rule.

    Input:  DataFrame of all the posts of all influencers (columns "accountID"
            and "date", plus the columns read by find_outliers)
    Output: DataFrame of the notable posts of all influencers, grouped by
            influencer and sorted by date within each influencer.
            An input without any influencer gives an empty DataFrame with the
            same columns.
    """
    dfs = []

    # sort=False keeps the influencers in their order of first appearance
    for infl, posts in df.groupby("accountID", sort=False):

        print("Processing:", infl)
        dfs.append(find_outliers(posts.sort_values(by=["date"])))

    # pd.concat() raises on an empty list
    if not dfs:
        return df.iloc[:0].copy()

    return pd.concat(dfs)


def find_outliers(tmp_df, window=100, min_history=10, iqr_factor=2):
    """Find the anomalies (notable posts) of one influencer through the Boxplot Rule.

    A post is notable when its score is greater than Q3 + iqr_factor * IQR, where
    the quartiles are computed on the scores of its previous posts (at most
    `window` of them).

    Input:  DataFrame of the posts of an influencer, already sorted by date, with
            the "score" column; `window` is the maximum number of previous posts
            considered, `min_history` the minimum number required, `iqr_factor`
            the multiplier of the interquartile range
    Output: DataFrame of the notable posts of the same influencer
    """
    scores = tmp_df["score"].to_numpy(dtype=float)
    # Boolean array: also selects rows (not columns) when the input has no rows
    is_notable = np.zeros(len(scores), dtype=bool)

    for i, score in enumerate(scores):

        # The previous `window` posts or, if there are fewer, all of them
        history = scores[max(0, i - window):i]

        # Not enough previous posts: the post cannot be judged
        if len(history) < max(min_history, 1):
            continue

        # Boxplot Rule formula
        Q1, Q3 = np.quantile(history, [0.25, 0.75])
        upper_limit = Q3 + iqr_factor * (Q3 - Q1)

        is_notable[i] = score > upper_limit

    return tmp_df[is_notable]


In [None]:
# CREATE A GRAPH OF NOTABLE POSTS PER WEEK AND FIND COMMUNITIES IN EACH GRAPH - CLUSTERING PHASE

def create_graphs(df_outliers_over_hashtags):
    """Create a graph of notable posts for each week.

    In each weekly graph:
     - each node is a post (its postID) that has at least one hashtag
     - each edge links two posts that have at least one hashtag in common; its
       weight is distance() computed on the two lists of hashtags

    Input:  DataFrame of notable posts (obtained in the Anomaly Detection phase)
            with the columns "year", "week", "postID" and "list_hashtags"
    Output: Dictionary (key,value) -> key:   string "year_week"
                                      value: NetworkX Graph object
    """
    # NOTE(review): the edge weight is a distance (0 when the two sets of hashtags
    # are equal) - confirm that the community detection run on these graphs is
    # meant to read larger weights as stronger ties
    graphs_over_post = dict()

    for (year, week), df_group in df_outliers_over_hashtags.groupby(["year", "week"]):

        key = str(int(year)) + "_" + str(int(week))
        G = nx.Graph()
        hashtags_of_post = dict()           # postID  -> list of hashtags of the post
        posts_with_tag = defaultdict(list)  # hashtag -> postIDs using it, in row order

        for post_id, list_hashtags in zip(df_group["postID"], df_group["list_hashtags"]):

            # Posts without hashtags cannot be linked; a postID is registered only once
            if len(list_hashtags) == 0 or post_id in hashtags_of_post:
                continue

            hashtags_of_post[post_id] = list_hashtags
            G.add_node(post_id)

            # dict.fromkeys() removes repeated hashtags while keeping their order
            for tag in dict.fromkeys(list_hashtags):
                posts_with_tag[tag].append(post_id)

        for tagged_posts in posts_with_tag.values():
            for position, post_a in enumerate(tagged_posts):
                for post_b in tagged_posts[position + 1:]:
                    # Two posts sharing several hashtags are weighted only once
                    if not G.has_edge(post_a, post_b):
                        weight = distance(hashtags_of_post[post_a], hashtags_of_post[post_b])
                        G.add_edge(post_a, post_b, weight=weight)

        graphs_over_post[key] = G

    return graphs_over_post


def distance(lst1, lst2):
    """Weight function to compute the weight of an edge (Jaccard distance with partial matches).

    Input:  hashtags lists of the two posts linked by the edge for which to
            calculate the weight
    Output: weight of the edge: 0 when the two sets of hashtags are equal, otherwise
            1 - (shared hashtags + partial intersections) / (hashtags in the union)
    """
    s1 = set(lst1)
    s2 = set(lst2)

    # Also covers two empty lists, so the union used below is never empty
    if s1 == s2:
        return 0

    shared = s1 & s2
    only_in_s2 = s2 - shared

    # Partial intersections are a measure of how similar the words not present in both hashtag lists are,
    # and are calculated by considering the length of the common letter blocks between the words.
    # Each word of the first list contributes the average similarity with the words of the
    # second list it matches; the average is taken per word, so the total does not depend on
    # the order in which the sets are visited.
    # Only the words of the first list are visited, so swapping the arguments can change the result.
    partial_intersections = 0.0

    for word1 in s1 - shared:
        similarities = []
        for word2 in only_in_s2:
            blocks = SequenceMatcher(None, word1, word2).get_matching_blocks()
            # Common blocks of one or two letters are ignored
            match_sizes_sum = sum(m.size for m in blocks if m.size > 2)
            if match_sizes_sum > 0:
                similarities.append(match_sizes_sum / (len(word1) + len(word2)))
        if similarities:
            partial_intersections += sum(similarities) / len(similarities)

    # Jaccard Distance formula
    return 1 - (len(shared) + partial_intersections) / len(s1 | s2)
        

def draw_communities(graphs, df_outliers, random_state=None):
    """Find the Louvain communities of each weekly graph and draw them through NetworkX spring_layout.

    Input:  dictionary of weekly graphs (key: string "year_week"), DataFrame of
            notable posts (obtained in the Anomaly Detection phase, with the
            "n_anomalies" column); `random_state` (optional) is passed to the
            Louvain partition and to the layout to make them repeatable
    Output: dictionary of DataFrames of notable posts clustered in the corresponding
            communities (key,value) -> key:   string "year_week"
                                       value: DataFrame
            In each DataFrame "community", "degree" and "clustering_coeff" stay at -1
            for the posts outside the communities with at least 2 posts. The per-post
            "degree" is the plain number of edges, while "mean_degree" and
            "median_degree" are computed on the weighted degree.
    """
    communities_dict = dict()

    # Numeric sort, so that e.g. "2020_2" is processed before "2020_10"
    for key in sorted(graphs.keys(), key=lambda k: [int(part) for part in k.split("_")]):

        year = int(key.split("_")[0])
        week = int(key.split("_")[1])

        # Work on a copy of the rows of the week: df_outliers is left untouched
        week_rows = (df_outliers["year"] == year) & (df_outliers["week"] == week)
        communities_df = df_outliers[week_rows].copy()

        # The original code expected a single distinct value per week: read the first one
        tot_posts = int(communities_df["n_anomalies"].iloc[0]) if len(communities_df) else 0

        G = graphs[key]

        # Node statistics do not depend on the partition: computing them before the
        # Louvain step keeps them defined for the current graph even when that step fails
        degree = dict(G.degree(weight="weight"))
        plain_degree = dict(G.degree())
        clustering_coeff = nx.clustering(G)

        # Louvain algorithm for community detection
        try:
            if random_state is None:
                communities_louvain = community_louvain.best_partition(G)
            else:
                communities_louvain = community_louvain.best_partition(G, random_state=random_state)
        except ZeroDivisionError:
            # NOTE(review): presumably raised when the total edge weight is 0 - confirm
            communities_louvain = None

        mod = -1
        node_groups = []

        if communities_louvain is not None:
            # One list of nodes per community, ordered by community id
            members = defaultdict(list)
            for node, com in communities_louvain.items():
                members[com].append(node)
            node_groups = [members[com] for com in sorted(members)]

            if sum(degree.values()) != 0:
                mod = modularity(G, node_groups)

        # Communities containing at least 2 posts
        useful_com_2 = [list(com) for com in node_groups if len(com) > 1]

        # Selecting colors to draw the graphs and the communities:
        # the largest communities get the first colors, single posts stay white
        colors = plt.get_cmap("tab20", max(len(useful_com_2), 1))
        node_groups = sorted(node_groups, key=len, reverse=True)
        color_index = {node: i
                       for i, group in enumerate(node_groups) if len(group) > 1
                       for node in group}

        nodes_color_map = [colors(color_index[node]) if node in color_index else "white" for node in G]
        edge_color_map = [colors(color_index[node]) if node in color_index else "black" for node in G]

        # Drawing the graph and the communities
        plt.figure(figsize=(12,7))
        pos = nx.spring_layout(G, k=0.2, iterations=50, seed=random_state)
        nx.draw(G, pos, node_color=nodes_color_map, edgecolors=edge_color_map, node_size=100, with_labels=False)
        plt.show()

        # Building the final DataFrame
        communities_df["community"] = -1
        communities_df["degree"] = -1
        communities_df["clustering_coeff"] = -1.0  # float, like the coefficients written below
        communities_df["modularity"] = mod
        if len(node_groups) == 1:
            communities_df["n_communities"] = 0
        else:
            communities_df["n_communities"] = len(node_groups)
        communities_df["n_communities2"] = len(useful_com_2)

        degree_values = list(degree.values())
        clustering_values = list(clustering_coeff.values())
        # A graph without nodes has no statistics: NaN, without numpy warnings
        mean_degree = np.mean(degree_values) if degree_values else np.nan
        median_degree = np.median(degree_values) if degree_values else np.nan
        mean_clustering_coeff = np.mean(clustering_values) if clustering_values else np.nan
        median_clustering_coeff = np.median(clustering_values) if clustering_values else np.nan
        communities_df["mean_degree"] = mean_degree
        communities_df["median_degree"] = median_degree
        communities_df["mean_clustering_coeff"] = mean_clustering_coeff
        communities_df["median_clustering_coeff"] = median_clustering_coeff

        # Printing the statistics for the current graph
        print("Year: "+str(year))
        print("Week: "+str(week))
        print("Total number of posts of the week: "+str(tot_posts))
        print("Nodes: "+str(len(G.nodes)))
        print("Edges: "+str(len(G.edges)))
        print("Communities: "+str(len(node_groups)))
        print("Communities with at least 2 nodes (posts): "+str(len(useful_com_2)))
        print("Modularity: "+str(mod))
        print("Mean degree: "+str(mean_degree))
        print("Median degree: "+str(median_degree))
        print("Mean clustering coefficient: "+str(mean_clustering_coeff))
        print("Median clustering coefficient: "+str(median_clustering_coeff))

        # Community id of every post belonging to a community with at least 2 posts
        community_of = {post: i for i, community in enumerate(useful_com_2) for post in community}
        in_community = communities_df["postID"].isin(list(community_of)).to_numpy()

        if in_community.any():
            post_ids = communities_df.loc[in_community, "postID"]
            # to_numpy(): positional assignment, safe even with a non-unique index
            communities_df.loc[in_community, "community"] = post_ids.map(community_of).to_numpy()
            communities_df.loc[in_community, "degree"] = post_ids.map(plain_degree).to_numpy()
            communities_df.loc[in_community, "clustering_coeff"] = post_ids.map(clustering_coeff).to_numpy()

        communities_dict[key] = communities_df.sort_values(by=["community"])

    return communities_dict


def draw_graph(G):
    """Alternative method to draw a graph and its communities (a different node shape per community).

    Input:  NetworkX graph (it is copied, the original is not modified)
    Output: draws the graph, without its isolated nodes, and the Louvain communities
    """
    G = G.copy()
    print ("Nodes all:", len (G.nodes))

    G.remove_nodes_from(list(nx.isolates(G)))

    print ("Nodes with edges:", len (G.nodes))
    print ("Edges:", len (G.edges))

    partition = community_louvain.best_partition(G)
    # Marker shapes, reused cyclically when there are more communities than shapes
    shapes = 'so^>v<dph8'

    plt.figure(figsize=(6,3))
    plt.axis('off')

    pos = nx.nx_pydot.graphviz_layout(G)

    # pyplot.get_cmap() replaces matplotlib.cm.get_cmap(), removed from recent matplotlib releases
    cmap = plt.get_cmap('jet')
    nx.draw_networkx_edges(G, pos, alpha=0.5)

    # Group the nodes by community: one drawing call per community instead of one per node
    nodes_by_community = defaultdict(list)
    for node, com in partition.items():
        nodes_by_community[com].append(node)
    n_communities = len(nodes_by_community)

    # Color and shape the nodes according to their partition
    for com, nodes in nodes_by_community.items():
        nx.draw_networkx_nodes(G, pos, nodes, node_size=100,
                               node_color=[cmap(com / n_communities)] * len(nodes),
                               node_shape=shapes[com % len(shapes)])

    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0)
    

In [None]:
def wordcloud_per_week(df_week, prefix):
    """Plot (and save as PDF) one wordcloud per community of the current week.

    Input:  DataFrame of the notable posts of the week clustered in communities,
            a string in the form "year_week" used in the names of the saved files
    Output: plots the wordclouds
    """
    clustered = df_week[df_week["community"] != -1]
    communities = clustered["community"].unique()

    for i, com in enumerate(communities):

        df_com = df_week[df_week["community"] == com]

        # All the hashtags of the posts of the community, repetitions included
        words = [w for lst in df_com["list_hashtags"].to_list() for w in lst]

        print(f"Community: {i}. Posts: {len(df_com)}")
        print("Profiles:", ", ".join(df_com["name"].drop_duplicates()))

        # Number of occurrences of each distinct hashtag
        frequencies = {w: words.count(w) for w in set(words)}

        # Nothing to draw for a community without hashtags
        if len(frequencies) == 0:
            continue

        # Wordclouds of the current week
        plt.figure(figsize=(5,2.5))
        wordcloud = WordCloud(background_color=None, mode="RGBA", collocations=False, mask=None)
        wordcloud.generate_from_frequencies(frequencies=frequencies)

        plt.imshow(wordcloud, cmap=plt.cm.gray, interpolation="bilinear")
        plt.axis("off")
        plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0)
        plt.savefig(f"wordcloud_{prefix}_{i}.pdf", dpi=1000)
        plt.show()
            

## Dataset transformation

* **Input**: a Pandas DataFrame (df_influencers) with columns:
    * **postID**: id of the post;
    * **accountID**: id of the influencer account that published the post;
    * **name**: name of the influencer account that published the post;
    * **date**: exact publication date of the post;
    * **year**: year of publication of the post;
    * **week**: week of publication of the post;
    * **description**: text of the post;
    * **reactions**: number of "likes" and "comments" actually received by the post;
    * **followers**: number of followers of the influencer at the time of the publication of the post.
    
    
* **Output**: the same Pandas DataFrame with the following columns in addition:
    * **list_hashtags**: the list of hashtags extracted from the text of the post;
    * **expected**: number of "expected" reactions of the post;
    * **score**: performance "score" of the post (actual reactions / expected reactions).

In [None]:
# Remove posts of influencers with zero followers at the publication date of the post.
# df_influencers must already be loaded (see the Input description above).
# The explicit copy makes the result an independent frame, so the column added to it
# by words_finder() does not trigger SettingWithCopyWarning
df_influencers_purged=remove_posts_followers_zero(df_influencers).copy()

In [None]:
# Extract list of hashtags from each post
# (words_finder adds the "list_hashtags" column to the frame it receives and returns that frame)
df_influencers_purged=words_finder(df_influencers_purged)

### Compute expected reactions for each post

In [None]:
# Expected reactions of every post, computed influencer by influencer;
# the posts left with an expected value of 0 are then discarded
df_expected=remove_posts_expected_zero(expected_reactions_per_infl(df_influencers_purged))

### Compute performance score for each post

In [None]:
# Performance score = actual reactions / expected reactions.
# assign() returns a new frame instead of writing into the filtered one,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run
df_expected=df_expected.assign(score=df_expected["reactions"]/df_expected["expected"])

## ANOMALY DETECTION
Apply the Boxplot Rule method to extract **notable posts** for each influencer and save them in a csv file ("anomalies.csv").

* **Output**: the same previous Pandas DataFrame with the following column in addition:
    * **n_anomalies**: number of anomalies for each week.

In [None]:
# Notable posts of every influencer, selected through the Boxplot Rule
df_outliers=Boxplot_rule_per_infl(df_expected)
# NOTE(review): "list_hashtags" holds Python lists, so in the CSV it is presumably
# written as their text representation - confirm before reloading the file
df_outliers.to_csv("anomalies.csv", index=False, header=True, quoting=csv.QUOTE_NONNUMERIC)

In [None]:
# Number of distinct notable posts of each (year, week), attached to every post of that week
df_tojoin=df_outliers.groupby(["year","week"]).agg(n_anomalies=pd.NamedAgg(column="postID", aggfunc="nunique"))
# Dropping a previous "n_anomalies" column first keeps the cell re-runnable:
# join() raises when the two frames share a column name
df_outliers=df_outliers.drop(columns="n_anomalies", errors="ignore").join(df_tojoin, on=["year","week"], how="left")

## CLUSTERING (Graphs creation and Louvain Community Detection algorithm)

* **Output**: the same previous Pandas DataFrame with the following columns in addition:
    * **community**: identification number of the community to which the post belongs;
    * **degree**: degree of the node (post) in the graph;
    * **clustering_coeff**: clustering coefficient of the node (post) in the graph;
    * **modularity**: modularity of the communities in the graph;
    * **n_communities**: total number of communities in the graph;
    * **n_communities2**: number of communities containing at least 2 nodes (posts) in the graph;
    * **mean_degree**: mean of the degree of the nodes in the graph;
    * **median_degree**: median of the degree of the nodes in the graph;
    * **mean_clustering_coeff**: mean of the clustering coefficient of the nodes in the graph;
    * **median_clustering_coeff**: median of the clustering coefficient of the nodes in the graph.
    
Here, the term "graph" refers to the graph of notable posts of each week.
The output DataFrame is then saved in a csv file ("communities.csv").

In [None]:
# Ordering the dataset
df_outliers=df_outliers.sort_values(by=["year","week"])

# Get graphs
graphs_post=create_graphs(df_outliers)

# Get communities by Louvain algorithm
# (dictionary: string "year_week" -> DataFrame of the notable posts of that week)
df_communities=draw_communities(graphs_post, df_outliers)

# draw_communities returns a dictionary, which has no to_csv() method:
# stack the weekly DataFrames into a single one before saving
if df_communities:
    pd.concat(list(df_communities.values())).to_csv("communities.csv", index=False, header=True, quoting=csv.QUOTE_NONNUMERIC)

## WORDCLOUDS per community per week

In [None]:
# Choose a year and a week and plot the wordcloud for each community for the selected year and week
key="2020_12"
# Year and week are read from the key, so only the key has to be edited
selected_year, selected_week = (int(part) for part in key.split("_"))
df_week=df_communities[key]
df_week=df_week[(df_week["year"]==selected_year) & (df_week["week"]==selected_week)]

wordcloud_per_week(df_week, prefix=key)