<a href="https://colab.research.google.com/github/TiphaineV/gpeg/blob/heavy/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loading Files

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
import scipy.sparse as sparse

In [2]:
!pip install memory-profiler



In [3]:
# Register the memory_profiler IPython extension (used for %memit in later cells).
%load_ext memory_profiler

In [4]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [5]:
# Google Drive file id of the dataset; the download cell passes it to
# drive.CreateFile and saves the content as 'userData.csv'.
# NOTE(review): this name shadows Python's built-in id() for the rest of the session.
id = '1SujkUIqpPKg9LlJ8s77upbeTKkenmrLc'

In [6]:
# -- some cleaning
needed = True
if needed :
  %rm -R gpeg/
  %rm context.py
  %rm fastGraph.py
  %rm node.py
  %rm _recSystems.py
  %rm trivialClf.py
  %rm edge.py
  %rm main.py 
  %rm scorer.py

rm: cannot remove 'context.py': No such file or directory
rm: cannot remove 'fastGraph.py': No such file or directory
rm: cannot remove 'node.py': No such file or directory
rm: cannot remove '_recSystems.py': No such file or directory
rm: cannot remove 'trivialClf.py': No such file or directory
rm: cannot remove 'edge.py': No such file or directory
rm: cannot remove 'main.py': No such file or directory
rm: cannot remove 'scorer.py': No such file or directory


In [7]:
# Clone the 'heavy' branch of the gpeg repository into ./gpeg
!git clone -b heavy https://github.com/TiphaineV/gpeg.git

Cloning into 'gpeg'...
remote: Enumerating objects: 272, done.[K
remote: Counting objects: 100% (272/272), done.[K
remote: Compressing objects: 100% (174/174), done.[K
remote: Total 438 (delta 184), reused 167 (delta 96), pack-reused 166[K
Receiving objects: 100% (438/438), 1016.97 KiB | 13.93 MiB/s, done.
Resolving deltas: 100% (272/272), done.


In [8]:
cd gpeg/fast_implementation/

/content/gpeg/fast_implementation


In [9]:
# Modules
import numpy as np
from _recSystems import _Clf
from fastGraph import Graph
from trivialClf import TrivialClf
from scorer import ClfScorer

In [10]:
# -- Loading userData (heavy, takes around a minute)
# Fetch the Drive file referenced by `id` and save it locally as 'userData.csv'.
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('userData.csv')

In [11]:
# -- Chunked reader over userData.csv: yields DataFrame chunks instead of one big frame.
# chunksize is given as an int, the type pandas documents for this parameter
# (the former float literal 1e6 only worked through coercion).
# Please don't change chunksize: the graph construction cell counts rows as nChunk * 1e6.
userDataStream = pd.read_csv('userData.csv', chunksize=1_000_000)

## Building Adjacency Matrix

In [12]:
%%time
# -- Graph construction, nChunk * 1e6 rows are processed from movieLens 20M.
# -- Takes around 2'30 for one chunk
# %memit reports the peak memory of the statement; %%time reports CPU and wall time for the cell.
%memit graph = Graph(userDataStream, nChunk=1)

Graph init ...
Processing chunk 0.
peak memory: 2446.71 MiB, increment: 151.02 MiB
CPU times: user 2min 30s, sys: 551 ms, total: 2min 30s
Wall time: 2min 31s


## Graph split

In [13]:
%%time
# -- Parameters
alpha = 0.1 # test proportion in the split

# -- train_test_split
# Produces the train and test edge sets consumed by the fitting, prediction and scoring cells.
%memit trainEdges, testEdges = graph.train_test_split(alpha= alpha)

peak memory: 2345.50 MiB, increment: 29.86 MiB
CPU times: user 1.39 s, sys: 89 ms, total: 1.48 s
Wall time: 1.62 s


## Classification

In [14]:
# -- Loading userData
# Reads the whole CSV into memory at once (the recorded run shows an increment of
# about 3.1 GiB), unlike the chunked reader used for the graph construction.
%memit userData = pd.read_csv('userData.csv')

peak memory: 5474.33 MiB, increment: 3156.94 MiB


In [15]:
# -- Preview the first 10 rows. Ending the cell with the bare expression lets the
# notebook render the frame with its rich table display instead of plain text.
userData.head(10)

   Unnamed: 0  userId  movieId  rating     timestamp_rating  tag timestamp_tag
0           0       1        2     3.5  2005-04-02 23:53:47  NaN           NaN
1           1       1       29     3.5  2005-04-02 23:31:16  NaN           NaN
2           2       1       32     3.5  2005-04-02 23:33:39  NaN           NaN
3           3       1       47     3.5  2005-04-02 23:32:07  NaN           NaN
4           4       1       50     3.5  2005-04-02 23:29:40  NaN           NaN
5           5       1      112     3.5  2004-09-10 03:09:00  NaN           NaN
6           6       1      151     4.0  2004-09-10 03:08:54  NaN           NaN
7           7       1      223     4.0  2005-04-02 23:46:13  NaN           NaN
8           8       1      253     4.0  2005-04-02 23:35:40  NaN           NaN
9           9       1      260     4.0  2005-04-02 23:33:46  NaN           NaN


In [16]:
# -- Fitting recommender system
# The classifier is built from the full userData frame and `graph.adjency`
# (presumably the adjacency matrix built above), then fitted on the training edges only.
clf = TrivialClf(userData, graph.adjency)
%memit clf.fit(trainEdges)

peak memory: 5500.51 MiB, increment: 25.99 MiB


## Scoring predictions

In [17]:
# -- Prediction
# yPred: labels predicted by the classifier for the held-out test edges.
# yTrue: reference labels for the same edges, obtained through the classifier's
# underscore-prefixed helper `_get_labels`.
yPred = clf.predict(testEdges)
yTrue = clf._get_labels(testEdges)

random prop 0.88736


In [19]:
# -- Per-class precision / recall / F1 of the predictions against the true labels
print(classification_report(y_true=yTrue, y_pred=yPred))

              precision    recall  f1-score   support

           0       0.94      0.65      0.77     94175
           1       0.06      0.34      0.10      5825

    accuracy                           0.63    100000
   macro avg       0.50      0.49      0.43    100000
weighted avg       0.89      0.63      0.73    100000



In [20]:
# -- Draw up to 10 distinct positions to compare predictions with ground truth.
# A seeded generator makes the sample reproducible across runs; replace=False
# avoids inspecting the same position twice, and the size is capped so the
# draw also works when fewer than 10 predictions are available.
rng = np.random.default_rng(0)
sample = rng.choice(len(yPred), size=min(10, len(yPred)), replace=False)
print(sample)

[50670 65863 80671 91239 51540 78444 94907 27677 23615 20275]


In [21]:
# -- Predicted labels at the sampled positions (positional lookup via .iloc)
# NOTE(review): in the recorded output the index labels printed here differ from
# the sampled positions, while the ground-truth labels printed further down carry
# index labels equal to those positions — confirm that yPred and yTrue are aligned row by row.
print('Predictions \n', yPred.iloc[sample].astype('uint8'))

Predictions 
 57136    1
74354    0
90993    0
23225    0
58275    0
88477    0
54712    0
31022    1
26520    1
22708    0
dtype: uint8


In [22]:
# -- Ground-truth labels at the same sampled positions (positional lookup via .iloc)
print('Ground truth labels \n', yTrue.iloc[sample])

Ground truth labels 
 50670    1
65863    0
80671    0
91239    0
51540    0
78444    0
94907    0
27677    0
23615    0
20275    0
Name: rating, dtype: uint8


In [23]:
# -- Scoring
# ClfScorer.score is called with the fitted classifier and the test edges; in the
# recorded run it prints a classification report. %memit measures the call's peak memory.
scorer = ClfScorer()
%memit score = scorer.score(clf, testEdges)

random prop 0.88736
              precision    recall  f1-score   support

     class 0       0.94      0.65      0.77     94175
     class 1       0.06      0.34      0.10      5825

    accuracy                           0.63    100000
   macro avg       0.50      0.50      0.43    100000
weighted avg       0.89      0.63      0.73    100000

peak memory: 5476.30 MiB, increment: 0.19 MiB


In [27]:
# Trigger a browser download of 'adjency-5M.npz' from the Colab runtime.
# NOTE(review): no visible cell writes 'adjency-5M.npz' (execution counts jump
# from 23 to 27) — confirm where this file is produced before running this cell.
from google.colab import files
files.download('adjency-5M.npz')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>