In [37]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE

In [44]:
load_dotenv()
DB_URL = os.getenv("DB_URL")
engine = create_engine(DB_URL)
query_main = """
    select c.ContactPersoonId, i.CampagneId, i.CampagneNaam, ca.Startdatum, a.Ondernemingsaard, a.Ondernemingstype, a.PrimaireActiviteit, f.Naam as Functie
    from Contactfiche c
    join Account a on a.AccountId = c.AccountId
    join Inschrijving i on i.ContactficheId = c.ContactPersoonId
    join Campagne ca on ca.CampagneId = i.CampagneId
    join ContactficheFunctie cf on cf.ContactpersoonId = c.ContactPersoonId
    join Functie f on f.FunctieId = cf.FunctieId
    where i.CampagneId is not null;
"""
query_pageviews = """
    with pageview_count as(
        select p.PageTitle, count(p.ContactId) as count
        from Pageviews p
        group by p.PageTitle
    )
    select PageTitle, ContactId
    from Pageviews
    where Pagetitle in (select top (2500) PageTitle
    from pageview_count
    order by count desc)
"""

df = pd.read_sql(query_main, engine)
df.set_index('ContactPersoonId', inplace=True)
df["rating"] = 5

df_pageviews = pd.read_sql(query_pageviews, engine)
df_pageviews.set_index('ContactId', inplace=True)
df_pageviews["rating"] = 2

display(df.head())
display(df_pageviews.head())

Unnamed: 0_level_0,CampagneId,CampagneNaam,Startdatum,Ondernemingsaard,Ondernemingstype,PrimaireActiviteit,Functie,rating
ContactPersoonId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
D7DDF02E-9F72-E111-B43A-00505680000A,5CC8251F-104B-E911-80FC-001DD8B72B62,OV-NW-Voka Politica Gent Zuid 2019,2019-03-20 10:30:00,Productie,Bedrijf,"Chemie, petrochemie",Medewerker,5
85A2204B-733D-EC11-8125-001DD8B72B61,A8949A21-6CEE-ED11-8849-6045BD8956C9,OV-NW&IN-EDIH-DIGITALIS-XpertFinder-Data-AI,2023-10-19 08:00:00,Diensten,Bedrijf,Consultancy,Medewerker Commercieel,5
CEB69229-0CFB-E511-96DE-005056B06EB4,B4E3E30A-E6CA-EC11-A7B5-000D3A20A90F,OV-NW/DO-Quick Refresh-Webinar-Energiedelen,2022-06-15 11:00:00,Productie & Diensten,,Bouw,Bedrijfsleider,5
FE6F2928-3E72-E111-B43A-00505680000A,3BA33E76-B8B4-EC11-983F-00224883C04D,OV-Management Assistant Day 2023,2023-04-27 09:00:00,Diensten,Bedrijf,Bouw,Management Assistent,5
0164420E-9596-EA11-8111-001DD8B72B62,74313A3B-E88D-EA11-810F-001DD8B72B61,OV-Webinar: E-commerce op de Chinese markt,2020-05-20 15:30:00,,Bedrijf,,Medewerker Commercieel,5


Unnamed: 0_level_0,PageTitle,rating
ContactId,Unnamed: 1_level_1,Unnamed: 2_level_1
71C7CDFA-379C-EB11-811E-001DD8B72B62,Weer hogere loonindexering op komst in 2024 | ...,2
2A04879B-1141-EC11-8125-001DD8B72B61,Nieuwjaarsinvitito 2023 | Voka,2
1E9BCBC0-9025-EC11-8124-001DD8B72B61,Netwerkevents | Voka,2
3F1EB182-8E3C-E811-80EF-001DD8B72B62,Marketing | Voka,2
5073196D-6611-E811-80EF-001DD8B72B61,Bedrijfsafval: dit verandert op 1 januari | Voka,2


In [39]:
df_pivot_main = pd.pivot_table(df, index='ContactPersoonId', columns=['Ondernemingsaard', 'Ondernemingstype', 'PrimaireActiviteit', 'Functie'], values='rating', fill_value = 0)
df_pivot_main.sort_index(inplace=True)
print(len(df_pivot_main.index))
df_pivot_main

13514


Ondernemingsaard,Diensten,Diensten,Diensten,Diensten,Diensten,Diensten,Diensten,Diensten,Diensten,Diensten,...,Productie & Diensten,Productie & Diensten,Productie & Diensten,Productie & Diensten,Productie & Diensten,Productie & Diensten,Productie & Diensten,Productie & Diensten,Productie & Diensten,Productie & Diensten
Ondernemingstype,Bedrijf,Bedrijf,Bedrijf,Bedrijf,Bedrijf,Bedrijf,Bedrijf,Bedrijf,Bedrijf,Bedrijf,...,Social Profit,Social Profit,Social Profit,Social Profit,Social Profit,Social Profit,Social Profit,Vrije beroepen,Vrije beroepen,Vrije beroepen
PrimaireActiviteit,Accountancy & boekhouding,Accountancy & boekhouding,Accountancy & boekhouding,Accountancy & boekhouding,Accountancy & boekhouding,Accountancy & boekhouding,Accountancy & boekhouding,Accountancy & boekhouding,Accountancy & boekhouding,Accountancy & boekhouding,...,Verenigingen en maatschappelijke organisaties,Zorg,Zorg,Zorg,Zorg,Zorg,Zorg,Accountancy & boekhouding,Accountancy & boekhouding,Farmacie
Functie,Bedrijfsleider,Bestuurder,Contact Lidmaatschap,Directie of kaderlid,Management Assistent,Medewerker,Medewerker Commercieel,Medewerker Communicatie,Medewerker Financieel,Medewerker Juridische dienst,...,Verantwoordelijke Commercieel,Bedrijfsleider,Contact HealthCommunity,Contact Lidmaatschap,Medewerker Financieel,Verantwoordelijke Commercieel,Verantwoordelijke Productie,Medewerker Duurzaamheid / Milieu & Energie,"Verantwoordelijke Duurzaamheid, Milieu & Energie",Verantwoordelijke Financieel
ContactPersoonId,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
00169619-E322-E911-80FB-001DD8B72B62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0017416A-2C6E-E111-B43A-00505680000A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0019C15A-6481-E611-80DE-001DD8B72B61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00223C8E-467F-E311-BBFD-005056B06EB4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00231824-53EA-ED11-8849-6045BD895420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FFF0C643-DAE4-ED11-A7C7-000D3A4AB78E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FFF68074-EB93-E911-80FF-001DD8B72B62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FFF68536-5DE0-E111-8A53-984BE17C2819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FFFAE2B6-11D5-EC11-A7B5-000D3ABD1F85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# df_pivot_pageviews = pd.pivot_table(df_pageviews, index='ContactId', columns=['PageTitle'], values='rating', fill_value = 0)
# df_pivot_pageviews.sort_index(inplace=True)
# print(len(df_pivot_pageviews.index))
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# Encode categorical variable
label_encoder = LabelEncoder()
df_pageviews['CategoryEncoded'] = label_encoder.fit_transform(df_pageviews['PageTitle'])
display(df_pageviews)

# # Apply PCA for dimensionality reduction
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(df_pageviews[['rating', 'CategoryEncoded']])

# # Create a new DataFrame with the reduced dimensions
pca_df = pd.DataFrame(data=pca_result, columns=['Dimension 1', 'Dimension 2'], index=df_pageviews.index)
grouped_data = pca_df.groupby(pca_df.index).mean()
grouped_data
# pca_df

# # # Concatenate the original data with the reduced dimensions
# result_df = pd.concat([df_pageviews, pca_df], axis=1)

Unnamed: 0_level_0,PageTitle,rating,CategoryEncoded
ContactId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
71C7CDFA-379C-EB11-811E-001DD8B72B62,Weer hogere loonindexering op komst in 2024 | ...,2,2235
2A04879B-1141-EC11-8125-001DD8B72B61,Nieuwjaarsinvitito 2023 | Voka,2,1320
1E9BCBC0-9025-EC11-8124-001DD8B72B61,Netwerkevents | Voka,2,1292
3F1EB182-8E3C-E811-80EF-001DD8B72B62,Marketing | Voka,2,1157
5073196D-6611-E811-80EF-001DD8B72B61,Bedrijfsafval: dit verandert op 1 januari | Voka,2,267
...,...,...,...
22EB0CF3-856A-E911-80FE-001DD8B72B61,Voka nationaal | Voka,2,2114
43ADE60D-B97F-EA11-810E-001DD8B72B61,Milieu & Water | Voka,2,1222
10C10E87-6E5A-EB11-8117-001DD8B72B61,"Zoekresultaten voor ""Groeiland"" | Voka",2,2350
2511155F-0AD5-EC11-A7B5-000D3ABD1F85,Opleidingsaanbod Voka Mechelen-Kempen | Voka,2,1473


Unnamed: 0_level_0,Dimension 1,Dimension 2
ContactId,Unnamed: 1_level_1,Unnamed: 2_level_1
00013C8A-6F1C-E211-9DAA-005056B06EB4,-89.887022,0.0
00017E48-212C-EC11-8127-001DD8B72B62,-983.220356,0.0
0002EDD7-2405-E511-ABE8-005056B06EB4,1149.779644,0.0
0006CEC0-931A-EB11-8117-001DD8B72B62,622.309056,0.0
000C5DD4-016A-E111-B43A-00505680000A,-204.620356,0.0
...,...,...
FF76B927-E283-E611-80E3-001DD8B72B62,654.279644,0.0
FF9A9C90-DC1A-E711-80E4-001DD8B72B62,-53.220356,0.0
FFD3EBB0-F16C-E111-B43A-00505680000A,544.779644,0.0
FFD66075-2663-EC11-8F8F-000D3A2E7886,-98.470356,0.0


In [None]:
df_pivot = pd.concat([df_pivot_main, grouped_data], axis=1, join='outer').fillna(0)
# df_p = pd.merge(df_pivot, df_pageviews, 'left' )
print(len(df_pivot.index))
df_pivot.head()
# df_pivot.loc[['0017416A-2C6E-E111-B43A-00505680000A']]
# df_pivot.loc[select_contact]

30208


Unnamed: 0,"(Diensten, Bedrijf, Accountancy & boekhouding, Bedrijfsleider)","(Diensten, Bedrijf, Accountancy & boekhouding, Bestuurder)","(Diensten, Bedrijf, Accountancy & boekhouding, Contact Lidmaatschap)","(Diensten, Bedrijf, Accountancy & boekhouding, Directie of kaderlid)","(Diensten, Bedrijf, Accountancy & boekhouding, Management Assistent)","(Diensten, Bedrijf, Accountancy & boekhouding, Medewerker)","(Diensten, Bedrijf, Accountancy & boekhouding, Medewerker Commercieel)","(Diensten, Bedrijf, Accountancy & boekhouding, Medewerker Communicatie)","(Diensten, Bedrijf, Accountancy & boekhouding, Medewerker Financieel)","(Diensten, Bedrijf, Accountancy & boekhouding, Medewerker Juridische dienst)",...,"(Productie & Diensten, Social Profit, Zorg, Contact HealthCommunity)","(Productie & Diensten, Social Profit, Zorg, Contact Lidmaatschap)","(Productie & Diensten, Social Profit, Zorg, Medewerker Financieel)","(Productie & Diensten, Social Profit, Zorg, Verantwoordelijke Commercieel)","(Productie & Diensten, Social Profit, Zorg, Verantwoordelijke Productie)","(Productie & Diensten, Vrije beroepen, Accountancy & boekhouding, Medewerker Duurzaamheid / Milieu & Energie)","(Productie & Diensten, Vrije beroepen, Accountancy & boekhouding, Verantwoordelijke Duurzaamheid, Milieu & Energie)","(Productie & Diensten, Vrije beroepen, Farmacie, Verantwoordelijke Financieel)",Dimension 1,Dimension 2
00169619-E322-E911-80FB-001DD8B72B62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0017416A-2C6E-E111-B43A-00505680000A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,334.765755,0.0
0019C15A-6481-E611-80DE-001DD8B72B61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00223C8E-467F-E311-BBFD-005056B06EB4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00231824-53EA-ED11-8849-6045BD895420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
select_contact = ['DA252429-E5A6-ED11-AAD1-6045BD8956C9', 'ECEECDC0-BE17-EC11-8123-001DD8B72B61']
ss = df_pivot.loc[select_contact]
similarity_matrix = cosine_similarity(df_pivot, ss)
similarity_matrix_df = pd.DataFrame(similarity_matrix, index=df_pivot.index, columns=ss.index)
similarity_matrix_df

# Stack the DataFrame to create a MultiIndex
stacked_similarity_df = similarity_matrix_df.stack()

# Create a DataFrame from the stacked series
high_similarity_df = pd.DataFrame(stacked_similarity_df[stacked_similarity_df > 0.75])

# Reset the index to get a DataFrame with the secondary index
# high_similarity_df.reset_index(inplace=True)
high_similarity_df.index.names = ['Similars', 'Selected']
high_similarity_df.reset_index(inplace=True)
high_similarity_df.set_index(['Selected', 'Similars'], inplace=True)
high_similarity_df.sort_index(level='Selected', inplace=True)
high_similarity_df

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Selected,Similars,Unnamed: 2_level_1
DA252429-E5A6-ED11-AAD1-6045BD8956C9,06D904B5-90CE-EB11-8120-001DD8B72B61,1.000000
DA252429-E5A6-ED11-AAD1-6045BD8956C9,37EC055D-4419-E711-80E4-001DD8B72B62,1.000000
DA252429-E5A6-ED11-AAD1-6045BD8956C9,62E7E675-B69C-ED11-AAD1-6045BD8952CE,1.000000
DA252429-E5A6-ED11-AAD1-6045BD8956C9,DA252429-E5A6-ED11-AAD1-6045BD8956C9,1.000000
DA252429-E5A6-ED11-AAD1-6045BD8956C9,F43F1643-495B-ED11-9562-6045BD895BFB,1.000000
...,...,...
ECEECDC0-BE17-EC11-8123-001DD8B72B61,FAD49C68-CE5D-E811-80F0-001DD8B72B62,0.816497
ECEECDC0-BE17-EC11-8123-001DD8B72B61,FB237511-4BEE-E411-90DE-005056B06EC4,1.000000
ECEECDC0-BE17-EC11-8123-001DD8B72B61,FE06E8C2-5E89-E611-80E3-001DD8B72B62,0.816497
ECEECDC0-BE17-EC11-8123-001DD8B72B61,FF5712A0-4A6B-E111-B43A-00505680000A,1.000000


In [48]:
print(select_contact)
done_campaigns = df.loc[select_contact, ["CampagneId"]]
done_campaigns

['DA252429-E5A6-ED11-AAD1-6045BD8956C9', 'ECEECDC0-BE17-EC11-8123-001DD8B72B61']


Unnamed: 0_level_0,CampagneId
ContactPersoonId,Unnamed: 1_level_1
DA252429-E5A6-ED11-AAD1-6045BD8956C9,8F59ADE1-8A92-ED11-AAD1-6045BD895CDC
ECEECDC0-BE17-EC11-8123-001DD8B72B61,7EDEA493-7845-EE11-BE6E-6045BD8956AE
ECEECDC0-BE17-EC11-8123-001DD8B72B61,4785222F-0963-EC11-8F8F-000D3A2E7886
ECEECDC0-BE17-EC11-8123-001DD8B72B61,4785222F-0963-EC11-8F8F-000D3A2E7886
ECEECDC0-BE17-EC11-8123-001DD8B72B61,7EDEA493-7845-EE11-BE6E-6045BD8956AE


In [49]:
results = []
for selected, s_df in high_similarity_df.groupby(level=0):
    t = df[df.index.isin(s_df.reset_index()['Similars'])].replace(0, np.nan).dropna(axis=1, how='all')
    t.sort_values('Startdatum', ascending=False, inplace=True)
    t = t[t['Startdatum'] > '2023-10-1']
    display(t[['CampagneId', 'CampagneNaam', 'Startdatum']])

    results.append((selected, t))

Unnamed: 0_level_0,CampagneId,CampagneNaam,Startdatum
ContactPersoonId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


Unnamed: 0_level_0,CampagneId,CampagneNaam,Startdatum
ContactPersoonId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ED0E3C10-91B0-EA11-8110-001DD8B72B61,0E4173C9-D14B-EE11-BE6F-6045BD8956C9,OV-NW-Rode Loper Selectie-voetbalmatch - KAA G...,2024-02-24 18:30:00
ED0E3C10-91B0-EA11-8110-001DD8B72B61,0E4173C9-D14B-EE11-BE6F-6045BD8956C9,OV-NW-Rode Loper Selectie-voetbalmatch - KAA G...,2024-02-24 18:30:00
ED0E3C10-91B0-EA11-8110-001DD8B72B61,BFC428BF-8150-EE11-BE6E-6045BD8952CE,OV-Haven-North Sea Port Logistics & Supply Cha...,2023-12-14 15:00:00
ED0E3C10-91B0-EA11-8110-001DD8B72B61,BFC428BF-8150-EE11-BE6E-6045BD8952CE,OV-Haven-North Sea Port Logistics & Supply Cha...,2023-12-14 15:00:00
E84E74D0-C662-E511-895A-005056B06EC4,69AF93B9-CC05-EE11-8F6E-6045BD895420,OV-JO Breakfastclub december 2023: What's New ...,2023-12-06 08:00:00
E84E74D0-C662-E511-895A-005056B06EC4,69AF93B9-CC05-EE11-8F6E-6045BD895420,OV-JO Breakfastclub december 2023: What's New ...,2023-12-06 08:00:00
D50338EE-C637-EC11-8125-001DD8B72B61,08563B4A-6871-ED11-9561-6045BD8952CE,OV-Plato Academy 2023 - Workshop 4,2023-12-04 17:00:00
D50338EE-C637-EC11-8125-001DD8B72B61,08563B4A-6871-ED11-9561-6045BD8952CE,OV-Plato Academy 2023 - Workshop 4,2023-12-04 17:00:00
EFA5849E-4E04-E511-ABE8-005056B06EB4,08563B4A-6871-ED11-9561-6045BD8952CE,OV-Plato Academy 2023 - Workshop 4,2023-12-04 17:00:00
EFA5849E-4E04-E511-ABE8-005056B06EB4,08563B4A-6871-ED11-9561-6045BD8952CE,OV-Plato Academy 2023 - Workshop 4,2023-12-04 17:00:00


In [50]:
for s, x in results:
    similar_campagnes_not_done = x[~x['CampagneId'].isin(done_campaigns.loc[s])][['CampagneId', 'CampagneNaam']]
    similar_campagnes_not_done.drop_duplicates(inplace=True)
    x = similar_campagnes_not_done.reset_index()[['CampagneId', 'CampagneNaam']].set_index('CampagneId')
    display(x)

Unnamed: 0_level_0,CampagneNaam
CampagneId,Unnamed: 1_level_1


Unnamed: 0_level_0,CampagneNaam
CampagneId,Unnamed: 1_level_1
0E4173C9-D14B-EE11-BE6F-6045BD8956C9,OV-NW-Rode Loper Selectie-voetbalmatch - KAA G...
BFC428BF-8150-EE11-BE6E-6045BD8952CE,OV-Haven-North Sea Port Logistics & Supply Cha...
69AF93B9-CC05-EE11-8F6E-6045BD895420,OV-JO Breakfastclub december 2023: What's New ...
08563B4A-6871-ED11-9561-6045BD8952CE,OV-Plato Academy 2023 - Workshop 4
D9834556-240C-EE11-8F6E-6045BD8956C9,OV-JO Community Event Starten NJ 2023
7EDEA493-7845-EE11-BE6E-6045BD8956AE,OV-NW-Vaart Club November 2023
B3865100-DEEF-ED11-8849-6045BD895233,OV-NW-Voka Netwekkers bij Well Played
93CED7C9-F2F3-ED11-8849-6045BD8956C9,OV-MATCH-Welkom@Voka- 7 november 2023
2AE7357B-CBB8-ED11-83FF-6045BD895D85,OV-NW-Voka Visit-Meetjesland-Marie Méro
894C5996-ACB9-ED11-83FF-6045BD8956C9,OV-Lab - Aan de slag als extern adviseur - 202...
