In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime at this location.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os

# Switch the working directory to the project folder on Drive so that the
# relative paths used by later cells resolve against it.
PROJECT_DIR = '/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/RecSysPractice'
os.chdir(PROJECT_DIR)

In [None]:
# !git clone https://github.com/shenweichen/GraphEmbedding.git

In [None]:
import pandas as pd
import gensim, tqdm, time

### Load the rating data: 

In [None]:
# Location of the ratings CSV, relative to the current working directory.
file_path = r"originalDataset/resources"
rawSampleDataPath = "{}/webroot/sampledata/ratings.csv".format(file_path)

# Read the ratings table and preview its first ten rows.
rawSampleData = pd.read_csv(filepath_or_buffer=rawSampleDataPath)
rawSampleData.head(n=10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
5,1,112,3.5,1094785740
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826


### Generate sequence: 

In [None]:
# Keep only ratings of 3.5 or higher, order them by user and then by
# timestamp, and collapse each user's movie ids into one space-separated
# string (one row per user).
liked = rawSampleData[rawSampleData["rating"] >= 3.5]
liked_sorted = liked.sort_values(by=["userId", "timestamp"], ascending=True)
df_group = (
    liked_sorted
    .groupby(["userId"])["movieId"]
    .apply(lambda movie_ids: " ".join(map(str, movie_ids)))
    .reset_index()
)
df_group.head()

Unnamed: 0,userId,movieId
0,1,924 919 337 151 112 50 541 593 29 293 47 296 3...
1,2,62 110 589 70 908 480 266 3 260 541 924
2,3,589 858 904 919 260 318 924 953 50 32 541 457 ...
3,4,10 356 454 480 589 377 586 350 368 370 594 520...
4,5,62 141 736 780 671 832 150 590 380 457 480 595...


### Generate edge list: 

In [None]:
import collections

# Adjacency map: movie id -> list of the movie ids that directly follow it in
# some user's sequence. Duplicates are kept, one entry per observed transition.
dic = collections.defaultdict(list)

# Break every watch sequence into consecutive pairs, e.g.
#   watch sequence: ['858', '50', '593', '457']
#   pairs:          [['858', '50'], ['50', '593'], ['593', '457']]
# and store them in a structure shaped like {"858": ["50"], "50": ["593"]}.
#
# The sequence column is read by name: a positional integer lookup such as
# `row[1]` on a label-indexed Series is deprecated in recent pandas releases.
for sequence in df_group["movieId"]:
    items = sequence.split()
    # zip() stops at the shorter input, so a sequence with fewer than two
    # movies contributes no edges.
    for previousItem, item in zip(items, items[1:]):
        dic[previousItem].append(item)

In [None]:
## Save the contents of the `dic` structure to a file.
## This file holds the edges: one "source target" pair per line, with
## repeated transitions collapsed so each directed edge appears once.
## Targets are sorted as well as sources: iterating a bare set of strings has
## no stable order between interpreter runs, so the file would otherwise
## differ from run to run.
with open("edgeList.txt", "w") as el:
    for movieId in sorted(dic.keys()):
        for movieId_ in sorted(set(dic[movieId])):
            el.write("{} {}\n".format(movieId, movieId_))

In [15]:
# os.chdir("GraphEmbedding")

In [None]:
# os.getcwd()

'/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/RecSysPractice'

### Use an existing open-source implementation to run DeepWalk: 

https://github.com/shenweichen/GraphEmbedding

In [None]:
import numpy as np

from GraphEmbedding.ge.classify import read_node_label, Classifier
from GraphEmbedding.ge import DeepWalk
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE

In [14]:
## https://github.com/shenweichen/GraphEmbedding/blob/7e26f8a0b648cfe0ad3a6f13fd1aced8c1218503/examples/deepwalk_wiki.py

# Load the edge list as a directed graph. `nodetype=None` requests no
# conversion of the node tokens read from the file.
G = nx.read_edgelist('edgeList.txt',
                     create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])

# Time the DeepWalk construction and the training step separately.
# NOTE(review): the constructor is timed as the "walking" phase; presumably
# it generates the random walks. Confirm against the GraphEmbedding source.
startTime = time.time()
model = DeepWalk(G, walk_length=10, num_walks=20000, workers=1)
endWalking = time.time()
# Use fixed-point formatting: a bare "{:.2}" means two *significant digits*
# in general format, which renders long durations as e.g. "1e+03".
print("Walking takes {:.2f} seconds...".format(endWalking - startTime))
model.train(window_size=5, iter=3, embed_size=10)
endTraining = time.time()
print("Training takes {:.2f} seconds...".format(endTraining - endWalking))
embeddings = model.get_embeddings()

# Export one line per node as "<node id>:<space-separated vector components>".
with open("DeepWalkEmb.csv", 'w') as f:
    for movie_id, vals in embeddings.items():
        vectors = " ".join(str(component) for component in vals)
        f.write("{}:{}\n".format(movie_id, vectors))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 16.8min finished


Walking takes 1e+03 seconds...
Learning embedding vectors...
Learning embedding vectors done!
Training takes 2.3e+03 seconds...
