This code clusters influencers based on their keywords and then computes a similarity score between each cluster and each brand. The idea is that the influencers recommended to a brand should come from the cluster with the highest similarity to that brand.

When we do not want to use the top-scoring cluster, we take the average of the next three highest similarity scores and select the cluster whose score is closest to that average as the next-best cluster. Averaging all of the scores instead would negate the point of using the similarity score in the first place.

We have also proposed a formula for calculating the competitive advantage.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the influencer keyword CSV.
data = pd.read_csv(
    r"/content/influencer_keywords_nltk.csv",
    encoding='latin-1',
    low_memory=False,
)

In [None]:
# Remove the column at position 2; it is not used by the steps below.
data = data.drop(columns=data.columns[2])
# Give every row of an influencer the same comma-separated string of all
# of that influencer's words ...
joined_words = data.groupby('Influencer Name')['Word'].transform(
    lambda words: ', '.join(words)
)
data['Word'] = joined_words
# ... so that dropping duplicates leaves one row per influencer.
data = data.drop_duplicates().reset_index(drop=True)
data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# stop_words must be the *string* 'english' to enable scikit-learn's built-in
# English stop-word list. A set such as {'english'} is interpreted as a custom
# stop-word collection containing only the word "english" (and is rejected by
# the parameter validation of newer scikit-learn releases).
vectorizer = TfidfVectorizer(stop_words='english')
# TF-IDF matrix with one row per row of `data` (one joined keyword string each).
X = vectorizer.fit_transform(data['Word'])

In [None]:
# Keyword arguments shared by the KMeans fits of the silhouette scan.
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

In [None]:
# Silhouette coefficient of a k-means fit for each candidate k (2 through 11).
silhouette_coefficients = []

for k in range(2, 12):
    # Fit with the shared settings, then score the resulting assignment.
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(X)
    score = silhouette_score(X, kmeans.labels_)
    silhouette_coefficients += [score]

In [None]:
# Plot the silhouette coefficient against the number of clusters tried.
k_values = range(2, 12)
plt.style.use("fivethirtyeight")
plt.plot(k_values, silhouette_coefficients)
plt.xticks(k_values)
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()

In [None]:
# Final model: 10 clusters with a fixed seed.
# NOTE(review): unlike the silhouette scan, this fit does not pass
# kmeans_kwargs - confirm that is intended.
kmeans = KMeans(n_clusters=10, random_state=42).fit(X)
labels = kmeans.labels_
# Pair each influencer with its cluster id and print them cluster by cluster.
name_cluster_pairs = list(zip(data['Influencer Name'], labels))
Influencer = pd.DataFrame(name_cluster_pairs, columns=['Influencer Name', 'cluster'])
print(Influencer.sort_values(by='cluster'))

       Influencer Name  cluster
27        thebodycoach        0
2         aliceliveing        0
22             ohpolly        0
7       courtneydblack        0
29        _jackfowler_        1
5          chessieking        1
6           chloe.khan        1
17         katiepiper_        1
23        oliviadbowen        2
19           lucymeck1        2
16        jesswright77        2
0         aaroncgshore        2
10           ini.helen        2
4       charlottedawsy        2
8   danosborneofficial        2
20      mac_griffiths_        3
13        jamessmithpt        3
28     thefitnesschef_        3
24         rogersnipes        3
14           jesshunt2        4
11       itsalwayshana        5
9       gabbydawnallen        5
1          adamcollard        5
21     mattdoesfitness        6
18         korisampson        6
12         jamesgshore        6
15     jessica_rose_uk        7
3           brown.elle        8
26            sylvijaa        8
25       slimmingworld        9


In [None]:
# `kmeans` has already been fitted on X, so reuse its labels instead of calling
# fit_predict(X), which would re-run the whole clustering only to obtain the
# assignment again.
new_column = kmeans.labels_
data["Cluster"] = new_column
# This is another name for `data`, not a copy: both refer to the same frame.
new_influnencer_cluster = data
new_influnencer_cluster

Unnamed: 0,Influencer Name,Word,Cluster
0,aaroncgshore,"little, talia, happy, oatway, big, beautiful, ...",2
1,adamcollard,"sculpt, sets, rest, db, ups, time, kg, workout...",5
2,aliceliveing,"body, workout, try, week, side, know, feel, ti...",0
3,brown.elle,"ad, code, hair, summer, fashionnova, fashionno...",8
4,charlottedawsy,"chuffin, love, best, noah, feel, much, beltin,...",2
5,chessieking,"every, body, weeks, first, feel, love, mat, ti...",1
6,chloe.khan,"ad, new, pic, best, dress, today, happy, aaveh...",1
7,courtneydblack,"calories, workout, every, time, need, body, fe...",0
8,danosborneofficial,"little, love, amazing, go, today, time, best, ...",2
9,gabbydawnallen,"time, body, love, workout, leg, shapeupwithgab...",5


In [None]:
# Write the influencer table (including its Cluster column) to CSV, omitting the index.
new_influnencer_cluster.to_csv(
    'ClusteredInfluencers_Keywords.csv',
    index=False,
)


In [None]:
# Load the competitor (brand) keyword CSV and preview its first rows.
brand_data = pd.read_csv(
    r"/content/competitor_keywords_nltk.csv",
    encoding='latin-1',
    low_memory=False,
)
brand_data.head()

In [None]:
# Remove the column at position 2; it is not used by the steps below.
brand_data = brand_data.drop(columns=brand_data.columns[2])
# Give every row of a competitor the same comma-separated string of all of
# that competitor's words, then collapse to one row per competitor.
joined_brand_words = brand_data.groupby('Competitor Name')['Word'].transform(
    lambda words: ', '.join(words)
)
brand_data['Word'] = joined_brand_words
brand_data = brand_data.drop_duplicates().reset_index(drop=True)

In [None]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.6 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 10.6 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 41.3 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 62.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 24.7 MB/s 
Building wheels for collected pa

In [None]:
import itertools
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow_hub as hub
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler

In [None]:
def get_median_similarity(model, ps1, ps2):
    """Return the median pairwise cosine similarity between two lists of texts.

    Every text in ``ps1`` is compared with every text in ``ps2`` and the
    median of all resulting similarity scores is returned.

    Args:
        model: Callable that maps a list of strings to an object whose
            ``.numpy()`` yields one embedding row per string.
        ps1: First list of texts.
        ps2: Second list of texts.

    Returns:
        The median of all pairwise cosine similarities, or NaN when either
        list is empty or the embedding/similarity computation fails.
    """
    # Nothing to compare: answer NaN explicitly rather than taking the
    # median of an empty collection.
    if len(ps1) == 0 or len(ps2) == 0:
        return np.nan

    try:
        sembs_1 = model(ps1).numpy()  # Sentence embeddings for the first list
        sembs_2 = model(ps2).numpy()  # Sentence embeddings for the second list
        # A single call produces the full (len(ps1), len(ps2)) similarity
        # matrix, replacing a Python-level loop over the rows of sembs_1.
        sm_scores = cosine_similarity(sembs_1, sembs_2)
    except Exception as e:
        # Best effort: report the problem and let the caller record a NaN
        # score for this pair instead of aborting.
        print(e)
        return np.nan

    return np.median(sm_scores)

In [None]:
def competitor_influencerCluster_similarity():
    """Score every competitor against every influencer cluster and save the results.

    Reads the module-level ``brand_data`` and ``new_influnencer_cluster``
    frames. For each (competitor, cluster) pair, the 'Word' texts of the
    competitor and of the cluster are compared with ``get_median_similarity``
    using the Universal Sentence Encoder. The scores gathered so far are
    rewritten to 'competitor_influencercluster_similarity.csv' after every
    pair, so the file always reflects the pairs already processed.

    Returns:
        None. The result is the CSV file.
    """
    df_competitors = brand_data
    df_influencersclusters = new_influnencer_cluster

    # Only the Universal Sentence Encoder is needed for scoring, so it is the
    # only model that gets loaded.
    use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

    # Unique competitor names and cluster ids, in order of first appearance
    cmptr_list = df_competitors['Competitor Name'].unique().tolist()
    inflcr_list = df_influencersclusters['Cluster'].unique().tolist()

    # The texts of a cluster do not depend on the competitor, so collect them
    # once instead of once per competitor.
    cluster_posts = {
        inflcr: df_influencersclusters.loc[
            df_influencersclusters['Cluster'] == inflcr, 'Word'
        ].tolist()
        for inflcr in inflcr_list
    }

    final_scores = []
    for cmptr in cmptr_list:
        # All texts belonging to the competitor given by 'cmptr'
        cmptr_posts = df_competitors.loc[df_competitors['Competitor Name'] == cmptr, 'Word'].tolist()
        for inflcr in inflcr_list:
            print('Finding similarity between the competitor', cmptr, ' and the influencer cluster', inflcr, '...')
            score = get_median_similarity(model=use_model, ps1=cmptr_posts, ps2=cluster_posts[inflcr])
            final_scores.append((cmptr, inflcr, score))

            # Save the results gathered so far after every pair.
            df_similarity = pd.DataFrame(final_scores,
                                         columns=['Competitor Name', 'Cluster', 'similarity_score'])
            df_similarity.to_csv('competitor_influencercluster_similarity.csv', index=False)
# Run the competitor-vs-cluster scoring; it writes its results to
# 'competitor_influencercluster_similarity.csv'.
competitor_influencerCluster_similarity()


Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Finding similarity between the competitor bulk  and the influencer cluster 2 ...
Finding similarity between the competitor bulk  and the influencer cluster 5 ...
Finding similarity between the competitor bulk  and the influencer cluster 0 ...
Finding similarity between the competitor bulk  and the influencer cluster 8 ...
Finding similarity between the competitor bulk  and the influencer cluster 1 ...
Finding similarity between the competitor bulk  and the influencer cluster 6 ...
Finding similarity between the competitor bulk  and the influencer cluster 3 ...
Finding similarity between the competitor bulk  and the influencer cluster 4 ...
Finding similarity between the competitor bulk  and the influencer cluster 7 ...
Finding similarity between the competitor bulk  and the influencer cluster 9 ...
Finding similarity between the competitor formnutrition  and the influencer cluster 2 ...
Finding similarity between the competitor formnutrition  and the influencer cluster 5 ...
Finding si

In [None]:
# Load the competitor/cluster similarity scores and display them.
clustered_similarty = pd.read_csv(
    r"/content/competitor_influencercluster_similarity.csv",
    encoding='latin-1',
    low_memory=False,
)
clustered_similarty

Unnamed: 0,Competitor Name,Cluster,similarity_score
0,bulk,2,0.037149
1,bulk,5,0.256332
2,bulk,0,0.213946
3,bulk,8,0.157420
4,bulk,1,0.083583
...,...,...,...
95,puresport,6,0.069221
96,puresport,3,0.152867
97,puresport,4,0.129300
98,puresport,7,-0.010580


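Proposed calculation for competitive advantage: within each cluster, a competitor's advantage is its own similarity score minus the average similarity score of the other competitors for that cluster, $\text{difference}_i = s_i - \frac{1}{n-1}\sum_{j \neq i} s_j$, where $n$ is the number of competitors scored against the cluster.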

In [None]:
# Group the data by cluster
grouped_df = clustered_similarty.groupby('Cluster')

# For every row, compute how far its similarity score lies above or below the
# average score of the *other* rows in the same cluster.
groups_with_difference = []
for name, group in grouped_df:
    # Leave-one-out average: the group's total minus the row's own score,
    # divided by the number of remaining rows. A group with a single row
    # yields 0/0, i.e. NaN, because there is nothing to compare against.
    average_value = (group['similarity_score'].sum() - group['similarity_score']) / (len(group) - 1)
    # Add a 'difference' column: own score minus the leave-one-out average
    group = group.assign(difference=group['similarity_score'].sub(average_value))
    groups_with_difference.append(group)

# DataFrame.append was removed in pandas 2.0, so the per-cluster pieces are
# combined with a single pd.concat instead.
if groups_with_difference:
    Competitive_advantage = pd.concat(groups_with_difference)
else:
    # No groups at all: keep the expected columns with zero rows.
    Competitive_advantage = clustered_similarty.iloc[0:0].assign(
        difference=pd.Series(dtype='float64')
    )

# Write the combined result once, after all groups have been processed.
Competitive_advantage.to_csv('Competitive Advantage.csv', index=False)

In [None]:
# Load the competitive-advantage table back from disk and display it.
Comp_ad = pd.read_csv(
    r"/content/Competitive Advantage.csv",
    encoding='latin-1',
    low_memory=False,
)
Comp_ad

Unnamed: 0,Competitor Name,Cluster,similarity_score,difference
0,bulk,0,0.213946,-0.083469
1,formnutrition,0,0.284316,-0.005280
2,indisupplements,0,0.318617,0.032833
3,medterra.international,0,0.202332,-0.096373
4,motionnutrition,0,0.310169,0.023445
...,...,...,...,...
95,neat_nutrition,9,0.329348,0.036295
96,neurohacker,9,0.313890,0.019119
97,thenue_co,9,0.275491,-0.023546
98,liveinnermost,9,0.308622,0.013266


An alternative way to recommend a cluster of influencers for a brand, based on that brand's four highest-scoring clusters

In [None]:
 # select all data that contain a specific competitor
filtered_data=clustered_similarty[clustered_similarty['Competitor Name'].str.contains('neurohacker')]
new_fd = filtered_data.sort_values(by=['similarity_score'], ascending=False)
col = new_fd['similarity_score']

# Select the second to fourth values in the column
vals = col[1:4]

# Calculate the mean of the values
mean = vals.mean()
#Calculate the closest value to the mean
closest = min(new_fd['similarity_score'], key=lambda x: abs(x-mean))
Average_top_cluster=new_fd.loc[new_fd['similarity_score'] == closest ] #print the row with the closest top average value
Average_top_cluster

Unnamed: 0,Competitor Name,Cluster,similarity_score
62,neurohacker,0,0.253629
