In [2]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.2.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.0 MB)
     |████████████████████████████████| 24.0 MB 1.3 MB/s            
Collecting smart-open>=1.8.1
  Downloading smart_open-6.2.0-py3-none-any.whl (58 kB)
     |████████████████████████████████| 58 kB 6.1 MB/s             
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.2.0 smart-open-6.2.0


In [25]:
import networkx as nx
import random
import pandas as pd
import numpy as np
from typing import List
from tqdm import tqdm
from dask import dataframe as df1
from gensim.models.word2vec import Word2Vec

In [9]:
class DeepWalk:
    """
    Random-walk based node embeddings.

    Walks are sampled from a graph and then passed to gensim's Word2Vec as
    sentences, with each visited node playing the role of a word.
    """

    def __init__(self, window_size: int, embedding_size: int, walk_length: int, walks_per_node: int):
        """
        :param window_size: window size for the Word2Vec model
        :param embedding_size: size of the final embedding
        :param walk_length: number of steps taken per walk (a complete walk holds
            walk_length + 1 nodes, because the start node is included)
        :param walks_per_node: number of walks started from every node
        """
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.walk_length = walk_length
        # Attribute name is singular ("walk_per_node"); kept as-is so existing
        # code reading this attribute keeps working.
        self.walk_per_node = walks_per_node

    def random_walk(self, g: nx.Graph, start: str, use_probabilities: bool = False) -> List[str]:
        """
        Generate a random walk starting on start
        :param g: Graph
        :param start: starting node for the random walk
        :param use_probabilities: if True take into account the weights assigned to each edge to select the next candidate
        :return: list of visited nodes, beginning with start; shorter than
            walk_length + 1 only when the walk reaches a node without neighbours
        """
        walk = [start]
        for _ in range(self.walk_length):
            current = walk[-1]
            neighs = list(g.neighbors(current))
            if not neighs:
                # Dead end (isolated node / sink): stop instead of letting
                # random.choice raise IndexError on an empty list.
                break
            if use_probabilities:
                next_node = self._weighted_choice(g, current, neighs)
            else:
                next_node = random.choice(neighs)
            walk.append(next_node)
        return walk

    @staticmethod
    def _weighted_choice(g, current, neighs):
        """Pick one of neighs with probability proportional to the edge "weight"."""
        # Edges without a "weight" attribute count as weight 1.0.
        weights = [g.get_edge_data(current, neigh).get("weight", 1.0) for neigh in neighs]
        total = sum(weights)
        if total <= 0:
            # Weights cannot be normalised; fall back to a uniform pick.
            return random.choice(neighs)
        probabilities = [w / total for w in weights]
        # Sample a position rather than the node itself: passing the node list
        # to np.random.choice would turn nodes into NumPy scalars and fails for
        # non-scalar nodes such as tuples.
        position = np.random.choice(len(neighs), p=probabilities)
        return neighs[int(position)]

    def get_walks(self, g: nx.Graph, use_probabilities: bool = False) -> List[List[str]]:
        """
        Generate all the random walks
        :param g: Graph
        :param use_probabilities: forwarded to random_walk for every walk
        :return: list with walks_per_node walks for every node of g
        """
        random_walks = []
        for _ in range(self.walk_per_node):
            # Visit the nodes in a fresh random order on every pass.
            random_nodes = list(g.nodes)
            random.shuffle(random_nodes)
            for node in tqdm(random_nodes):
                random_walks.append(self.random_walk(g=g, start=node, use_probabilities=use_probabilities))
        return random_walks

    def compute_embeddings(self, walks: List[List[str]], min_count: int = 1):
        """
        Compute the node embeddings for the generated walks
        :param walks: List of walks
        :param min_count: forwarded to Word2Vec; nodes occurring fewer times than
            this across all walks get no embedding. Defaults to 1 so that every
            visited node is kept (Word2Vec's own default is 5).
        :return: the trained model's KeyedVectors (model.wv)
        """
        model = Word2Vec(
            sentences=walks,
            window=self.window_size,
            vector_size=self.embedding_size,
            min_count=min_count,
        )
        return model.wv

In [31]:
# Read the friendship edges from FINAL_FEATURES_FRIENDS, keeping only the two id columns
df_final_features_friends = df1.read_csv(
    "data/FINAL_FEATURES_FRIENDS.tsv",
    sep="\t",
    usecols=["CLIENT_ID", "FRIEND_ID"],
)
df_final_features_friends.head(100)

Unnamed: 0,CLIENT_ID,FRIEND_ID
0,999819,99981973
1,999819,99981949
2,999819,99981979
3,999819,99981939
4,999819,99981975
...,...,...
95,99978,9997815
96,99978,9997835
97,99978,9997821
98,99978,9997854


In [32]:
# Convert the Dask frame to pandas.
# The isinstance guard makes the cell safe to re-run: once the name already
# holds a pandas DataFrame there is no .compute() to call.
if isinstance(df_final_features_friends, df1.DataFrame):
    df_final_features_friends = df_final_features_friends.compute()
type(df_final_features_friends)

In [33]:
# Build the graph: one edge per (CLIENT_ID, FRIEND_ID) row
G = nx.from_pandas_edgelist(
    df_final_features_friends,
    source="CLIENT_ID",
    target="FRIEND_ID",
)

In [48]:
# Read FINAL_TARGETS_DATES_TRAINTEST (tab-separated) as a Dask frame
df_final_targets_dates_traintest = df1.read_csv(
    "data/FINAL_TARGETS_DATES_TRAINTEST.tsv",
    sep="\t",
)

In [49]:
# Convert the Dask frame to pandas.
# The isinstance guard makes the cell safe to re-run: once the name already
# holds a pandas DataFrame there is no .compute() to call.
if isinstance(df_final_targets_dates_traintest, df1.DataFrame):
    df_final_targets_dates_traintest = df_final_targets_dates_traintest.compute()
type(df_final_targets_dates_traintest)

pandas.core.frame.DataFrame

In [50]:
# Remove the rows whose TARGET equals 'test'.
# A boolean mask is used instead of a label-indexed Python loop plus in-place
# drop: it does not require the index to be a unique 0..n-1 range, and running
# the cell again is a no-op rather than a KeyError on labels already removed.
is_test_row = df_final_targets_dates_traintest['TARGET'] == 'test'
df_final_targets_dates_traintest = df_final_targets_dates_traintest.loc[~is_test_row]

df_final_targets_dates_traintest.head(50)

Unnamed: 0,CLIENT_ID,RETRO_DT,TARGET
0,1011725,20210501,0
1,1018784,20210501,1
2,1021812,20210501,0
3,1024003,20210501,0
5,1026390,20210501,1
6,1027507,20210501,1
7,1028552,20210501,0
9,1036872,20210501,0
10,1041608,20210501,0
11,1045369,20210501,1


In [51]:
# Build the graph: one edge per (CLIENT_ID, TARGET) row
# NOTE(review): this connects every client to its TARGET value rather than to
# other clients, and the friendship graph G built earlier is not used in the
# cells visible here - confirm this is the intended input for DeepWalk.
graph_final_targets_dates_traintest = nx.from_pandas_edgelist(
    df_final_targets_dates_traintest,
    source="CLIENT_ID",
    target="TARGET",
)

In [52]:
# Seed both RNGs used by DeepWalk (random for shuffling/unweighted steps,
# numpy for weighted steps) so the sampled walks are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# NOTE(review): walk_length=2 gives walks of 3 nodes, shorter than the
# Word2Vec window of 5 - confirm these hyper-parameters are intended.
dw = DeepWalk(window_size=5,
              embedding_size=10,
              walk_length=2,
              walks_per_node=26)
walks = dw.get_walks(graph_final_targets_dates_traintest)

100%|██████████| 127434/127434 [02:10<00:00, 973.22it/s] 
100%|██████████| 127434/127434 [02:08<00:00, 994.01it/s] 
100%|██████████| 127434/127434 [02:10<00:00, 978.98it/s] 
100%|██████████| 127434/127434 [02:10<00:00, 977.16it/s] 
100%|██████████| 127434/127434 [02:09<00:00, 981.45it/s] 
100%|██████████| 127434/127434 [02:09<00:00, 985.28it/s] 
100%|██████████| 127434/127434 [02:06<00:00, 1004.58it/s]
100%|██████████| 127434/127434 [02:10<00:00, 980.09it/s] 
100%|██████████| 127434/127434 [02:09<00:00, 982.13it/s] 
100%|██████████| 127434/127434 [02:09<00:00, 987.43it/s] 
100%|██████████| 127434/127434 [02:10<00:00, 973.13it/s] 
100%|██████████| 127434/127434 [02:09<00:00, 984.62it/s] 
100%|██████████| 127434/127434 [02:09<00:00, 983.27it/s] 
100%|██████████| 127434/127434 [02:09<00:00, 980.59it/s] 
100%|██████████| 127434/127434 [02:08<00:00, 991.42it/s] 
100%|██████████| 127434/127434 [02:08<00:00, 992.32it/s] 
100%|██████████| 127434/127434 [02:08<00:00, 989.02it/s] 
100%|█████████

In [53]:
# Train Word2Vec on the sampled walks and keep the resulting node vectors
embeddings = dw.compute_embeddings(walks=walks)

In [54]:
# Display the object returned by compute_embeddings (only its repr is shown)
embeddings

<gensim.models.keyedvectors.KeyedVectors at 0x7f7d82eed910>