<a href="https://colab.research.google.com/github/TiphaineV/gpeg/blob/heavy/notebook-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loading Files

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
import scipy.sparse as sparse

In [None]:
!pip install memory-profiler

Collecting memory-profiler
  Downloading https://files.pythonhosted.org/packages/8f/fd/d92b3295657f8837e0177e7b48b32d6651436f0293af42b76d134c3bb489/memory_profiler-0.58.0.tar.gz
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25l[?25hdone
  Created wheel for memory-profiler: filename=memory_profiler-0.58.0-cp36-none-any.whl size=30181 sha256=ac469df28e563b32e656f577ff0f8185b66f0c9f5dc1391d8fce814155a47629
  Stored in directory: /root/.cache/pip/wheels/02/e4/0b/aaab481fc5dd2a4ea59e78bc7231bb6aae7635ca7ee79f8ae5
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.58.0


In [None]:
# Register the memory_profiler IPython extension; the %memit line magic used in later cells comes from it.
%load_ext memory_profiler

In [None]:
# Install PyDrive and build the Google Drive client (`drive`) used later to download the dataset.
# This cell imports google.colab, so it only runs inside a Colab runtime.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default credentials
# (presumably the ones made available by authenticate_user() above — TODO confirm).
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Google Drive file id of the dataset; it is passed to drive.CreateFile() in the download cell
# and saved locally as userData.csv.
# NOTE(review): the name `id` shadows the Python builtin id() for the rest of the session;
# renaming it requires changing the download cell at the same time.
id = '1SujkUIqpPKg9LlJ8s77upbeTKkenmrLc'

In [None]:
# -- some cleaning
needed = True
if needed :
  %rm -R gpeg/
  %rm context.py
  %rm fastGraph.py
  %rm node.py
  %rm _recSystems.py
  %rm trivialClf.py
  %rm edge.py
  %rm main.py 
  %rm scorer.py

rm: cannot remove 'gpeg/': No such file or directory
rm: cannot remove 'context.py': No such file or directory
rm: cannot remove 'fastGraph.py': No such file or directory
rm: cannot remove 'node.py': No such file or directory
rm: cannot remove '_recSystems.py': No such file or directory
rm: cannot remove 'trivialClf.py': No such file or directory
rm: cannot remove 'edge.py': No such file or directory
rm: cannot remove 'main.py': No such file or directory
rm: cannot remove 'scorer.py': No such file or directory


In [None]:
# Clone the `heavy` branch of the gpeg repository into ./gpeg (relative to the current directory).
!git clone -b heavy https://github.com/TiphaineV/gpeg.git

Cloning into 'gpeg'...
remote: Enumerating objects: 275, done.[K
remote: Counting objects: 100% (275/275), done.[K
remote: Compressing objects: 100% (177/177), done.[K
remote: Total 441 (delta 185), reused 166 (delta 96), pack-reused 166[K
Receiving objects: 100% (441/441), 1021.58 KiB | 7.86 MiB/s, done.
Resolving deltas: 100% (273/273), done.


In [None]:
cd gpeg/fast_implementation/

/content/gpeg/fast_implementation


In [None]:
# Modules
import numpy as np
from _recSystems import _Clf
from fastGraph import Graph
from trivialClf import TrivialClf
from scorer import ClfScorer

In [None]:
# -- Loading userData (heavy, takes around a minute)
# Look up the Drive file by the id defined earlier and write its content to ./userData.csv.
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('userData.csv')

In [15]:
# Open the CSV as a chunked reader instead of loading it at once.
# chunksize is given as an int, as the read_csv API documents, rather than the float 1e6.
# The graph-construction cell counts its work in chunks of one million rows (nChunk * 1e6).
userDataStream = pd.read_csv('userData.csv', chunksize=1_000_000)  # please don't change chunksize

## Building Adjacency Matrix

In [16]:
%%time
# -- Graph construction, nChunk * 1e6 rows are processed from movieLens 20M.
# -- Takes around 2'30 for one chunk
# %%time reports CPU/wall time for the whole cell; %memit reports the peak memory of the statement.
# The recorded run with nChunk=20 took about 49 minutes of wall time.
# NOTE(review): userDataStream is a chunked reader and is presumably consumed by this call —
# re-create it before re-running this cell. TODO confirm
%memit graph = Graph(userDataStream, nChunk=20)

Graph init ...
Processing chunk 0.
Processing chunk 1.
Processing chunk 2.
Processing chunk 3.
Processing chunk 4.
Processing chunk 5.
Processing chunk 6.
Processing chunk 7.
Processing chunk 8.
Processing chunk 9.
Processing chunk 10.
Processing chunk 11.
Processing chunk 12.
Processing chunk 13.
Processing chunk 14.
Processing chunk 15.
Processing chunk 16.
Processing chunk 17.
Processing chunk 18.
Processing chunk 19.
peak memory: 4816.67 MiB, increment: 2432.86 MiB
CPU times: user 48min 56s, sys: 9.94 s, total: 49min 6s
Wall time: 49min 20s


In [28]:
# Write the sparse matrix held in graph.adjency to disk in .npz format.
# NOTE(review): the recorded execution count (28) shows this cell was run after the scoring
# cells, not at this position in the notebook.
sparse.save_npz('adjency-20M1.npz', graph.adjency)

## Graph split

In [18]:
%%time
# -- Parameters
alpha = 0.1 # test proportion in the split

# -- train_test_split
# Produces the two edge collections consumed below: trainEdges by clf.fit, testEdges by
# clf.predict and the scorer.
# NOTE(review): no seed is passed here; whether the split is reproducible depends on
# Graph.train_test_split. TODO confirm
%memit trainEdges, testEdges = graph.train_test_split(alpha= alpha)

peak memory: 4334.77 MiB, increment: 1038.17 MiB
CPU times: user 53.1 s, sys: 2.17 s, total: 55.2 s
Wall time: 55.5 s


## Classification

In [19]:
# -- Loading userData
# Reads the entire CSV into a single DataFrame (no chunking here);
# the recorded run shows a memory increment of roughly 3 GiB for this statement.
%memit userData = pd.read_csv('userData.csv')

peak memory: 6809.99 MiB, increment: 3085.10 MiB


In [20]:
# Preview the first rows. A bare last expression uses the notebook's rich table display
# instead of the plain-text dump produced by print().
userData.head(10)

   Unnamed: 0  userId  movieId  rating     timestamp_rating  tag timestamp_tag
0           0       1        2     3.5  2005-04-02 23:53:47  NaN           NaN
1           1       1       29     3.5  2005-04-02 23:31:16  NaN           NaN
2           2       1       32     3.5  2005-04-02 23:33:39  NaN           NaN
3           3       1       47     3.5  2005-04-02 23:32:07  NaN           NaN
4           4       1       50     3.5  2005-04-02 23:29:40  NaN           NaN
5           5       1      112     3.5  2004-09-10 03:09:00  NaN           NaN
6           6       1      151     4.0  2004-09-10 03:08:54  NaN           NaN
7           7       1      223     4.0  2005-04-02 23:46:13  NaN           NaN
8           8       1      253     4.0  2005-04-02 23:35:40  NaN           NaN
9           9       1      260     4.0  2005-04-02 23:33:46  NaN           NaN


In [21]:
# -- Fitting recommender system
# The classifier is built from the full ratings table and the graph's adjacency matrix,
# then fitted on the training edges only; %memit reports the peak memory of the fit.
clf = TrivialClf(userData, graph.adjency)
%memit clf.fit(trainEdges)

peak memory: 8511.39 MiB, increment: 1701.16 MiB


## Scoring predictions

In [22]:
# -- Prediction
# yPred: predictions for the held-out edges; yTrue: the matching ground-truth labels.
# NOTE(review): _get_labels is private by naming convention — a public accessor on the
# classifier would be preferable if one exists.
# Running this cell prints a "random prop ..." line; it comes from inside the classifier calls.
yPred = clf.predict(testEdges)
yTrue = clf._get_labels(testEdges)

random prop 0.884362


In [23]:
# Per-class precision / recall / F1 of the predictions against the ground-truth labels.
# print() is needed here: classification_report returns a multi-line string.
print(classification_report(yTrue, yPred))

              precision    recall  f1-score   support

           0       0.94      0.68      0.79   1878724
           1       0.06      0.32      0.10    121276

    accuracy                           0.66   2000000
   macro avg       0.50      0.50      0.44   2000000
weighted avg       0.89      0.66      0.75   2000000



In [24]:
# Draw 10 random row positions for a visual spot check of predictions against labels.
# A private, seeded RandomState makes the spot check reproducible without touching the
# global NumPy RNG state.
# Passing the length itself samples from 0..len(yPred)-1, exactly like range(len(yPred)),
# but without first converting a multi-million element range into an array.
SAMPLE_SEED = 0
sample = np.random.RandomState(SAMPLE_SEED).choice(len(yPred), size=10)
print(sample)

[  75121  807253 1504984 1818109 1446333 1342727 1233002  488609 1412741
  556195]


In [25]:
# .iloc selects by position, so the index labels printed are yPred's own labels and need not
# equal the values in `sample`.
# NOTE(review): in the recorded run these labels differ from the ones yTrue prints for the same
# positions, so predictions and labels are presumably matched by position, not by index label.
# TODO confirm
print('Predictions \n', yPred.iloc[sample].astype('uint8'))

Predictions 
 84581      1
914097     0
1701768    0
424345     0
1635326    1
1518211    1
1394552    0
552907     1
1597069    0
629047     0
dtype: uint8


In [26]:
# Ground-truth labels at the same sampled positions, selected positionally with .iloc.
print('Ground truth labels \n', yTrue.iloc[sample])

Ground truth labels 
 75121      0
807253     0
1504984    0
1818109    0
1446333    0
1342727    0
1233002    0
488609     0
1412741    0
556195     0
Name: rating, dtype: uint8


In [27]:
# -- Scoring
# Score the fitted classifier on the held-out edges with the project's ClfScorer.
# In the recorded run this call prints a classification report with the same figures as the
# one produced by the classification_report cell.
scorer = ClfScorer()
%memit score = scorer.score(clf, testEdges)

random prop 0.884362
              precision    recall  f1-score   support

     class 0       0.94      0.68      0.79   1878724
     class 1       0.06      0.32      0.10    121276

    accuracy                           0.66   2000000
   macro avg       0.50      0.50      0.44   2000000
weighted avg       0.89      0.66      0.75   2000000

peak memory: 8721.69 MiB, increment: 0.06 MiB
