In [None]:
%%capture
# Colab environment bootstrap. The %%capture cell magic on the first line of
# this cell suppresses the console output of the commands below.
# Install PySpark.
!pip install pyspark
# Install/upgrade (-U) PyDrive quietly (-q).
!pip install -U -q PyDrive
# Install a headless Java 8 JDK (PySpark needs a Java runtime).
!apt install openjdk-8-jdk-headless -qq
import os
# Point JAVA_HOME at the JDK location; presumably this is where the apt package
# above installs on the Colab image -- confirm if the base image changes.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then create a PyDrive client that uses the
# application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): the name `drive` is rebound by `from google.colab import drive`
# in the following cell, so this PyDrive client is no longer reachable under
# this name once that cell has run.
drive = GoogleDrive(gauth)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')
# Avoids scroll-in-the-scroll in the entire Notebook
from IPython.display import Javascript
def resize_colab_cell():
  """Display a Javascript snippet that calls Colab's setIframeHeight with maxHeight 400."""
  display(Javascript('google.colab.output.setIframeHeight(0, true, {maxHeight: 400})'))
# Register the hook on IPython's 'pre_run_cell' event so it fires for every
# subsequent cell; this is why each later cell output starts with an
# `<IPython.core.display.Javascript object>` line.
# NOTE(review): re-running this cell defines a new function object and
# registers it again, so the hook presumably accumulates -- confirm.
get_ipython().events.register('pre_run_cell', resize_colab_cell)
     

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
%matplotlib inline
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
# Spark configuration: serve the Spark UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")
# Create the context. getOrCreate() hands back the already-running
# SparkContext when this cell is executed again in the same kernel, instead of
# failing because only one SparkContext may be active at a time. Note that an
# existing context keeps the configuration it was originally started with.
sc = SparkContext.getOrCreate(conf=conf)
# Create the session; it reuses the active context.
spark = SparkSession.builder.getOrCreate()

<IPython.core.display.Javascript object>

In [None]:
# Download and unpack the ngrok client into the current directory.
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip
# Start ngrok in the background, tunnelling HTTP to local port 4050 (the value
# given to spark.ui.port when the Spark configuration was built).
get_ipython().system_raw('./ngrok http 4050 &')
# Ask ngrok's local API (port 4040) for its tunnels and print the public URL of
# the first one.
# NOTE(review): this runs immediately after the background launch; presumably
# it can fail if ngrok has not finished starting -- confirm.
!curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

<IPython.core.display.Javascript object>

--2023-03-28 14:37:02--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 54.161.241.46, 54.237.133.81, 18.205.222.128, ...
Connecting to bin.equinox.io (bin.equinox.io)|54.161.241.46|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13921656 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2023-03-28 14:37:02 (51.8 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [13921656/13921656]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   
https://d6b0-34-86-224-141.ngrok.io


In [None]:
def line_to_friend_ownership(line):
    """Parse one adjacency-list line into ``(user_id, friends)``.

    The line is split on whitespace. The first token is the user id; the
    second token, when present, is a comma-separated list of friend ids.

    Args:
        line: One text line, e.g. a user id followed by whitespace and
            ``1,2,3``, or a lone user id for a user without friends.

    Returns:
        A tuple ``(user_id, friends)`` where ``user_id`` is an int and
        ``friends`` is a (possibly empty) list of ints.
    """
    tokens = line.split()
    owner = int(tokens[0])
    # A single token means the user has no friend list at all.
    if len(tokens) == 1:
        return owner, []
    return owner, [int(friend) for friend in tokens[1].split(',')]

def friend_ownership_to_connection(f_o):
    """Expand one user's friend list into keyed pair records.

    Every emitted key is a pair of user ids with the smaller id first, so the
    same two users always produce the same key regardless of direction.

    Args:
        f_o: A ``(user_id, friends)`` tuple.

    Returns:
        A list of ``((id_a, id_b), value)`` records. First come the records
        for the user and each friend with ``value == 0`` (they are already
        friends), then one record per pair of the user's friends with
        ``value == 1`` (they share this user as a mutual friend).
    """
    user_id = f_o[0]
    friends = f_o[1]

    def ordered(a, b):
        # Written as a conditional rather than min()/max(): those builtins are
        # shadowed by the star import from pyspark.sql.functions.
        return (b, a) if a > b else (a, b)

    already_friends = [(ordered(user_id, friend_id), 0) for friend_id in friends]
    share_this_user = [
        (ordered(left, right), 1)
        for left, right in itertools.combinations(friends, 2)
    ]
    return already_friends + share_this_user

def mutual_friend_count_to_recommendation(f):
    """Turn one counted pair into a recommendation record for each member.

    Args:
        f: A ``((user_a, user_b), count)`` record, where ``count`` is the
            number of mutual friends of the two users.

    Returns:
        A two-element list: ``(user_a, (user_b, count))`` followed by
        ``(user_b, (user_a, count))``.
    """
    pair, shared = f[0], f[1]
    return [
        (pair[0], (pair[1], shared)),
        (pair[1], (pair[0], shared)),
    ]

def recommendation_to_sorted_truncated(recs, limit=10):
    """Rank a user's candidate recommendations and keep the best few.

    Candidates are ordered by descending mutual-friend count; ties are broken
    by ascending candidate id.

    Args:
        recs: Iterable of ``(candidate_id, mutual_friend_count)`` pairs. Any
            iterable is accepted and it is not modified.
        limit: Maximum number of candidate ids to return (default 10).

    Returns:
        A list of at most ``limit`` candidate ids, best first.
    """
    # sorted() builds a new list, so the caller's collection is left untouched
    # (list.sort() would reorder it in place) and non-list iterables work too.
    ranked = sorted(recs, key=lambda rec: (-rec[1], rec[0]))
    return [rec[0] for rec in ranked[:limit]]

<IPython.core.display.Javascript object>

In [None]:
# Read from text file
lines = sc.textFile("/content/soc-LiveJournal1Adj.txt")

# Map each line to the form: (user_id, [friend_id_0, friend_id_1, ...]) and
# drop users that have no friends. The friend field is a list, so emptiness is
# tested on its length; comparing it to '' is always unequal and filters
# nothing. Users without friends emit no edges below, so the recommendations
# are unaffected by this filter.
# For a quick test run, additionally chain:
#   .filter(lambda ownership: ownership[0] < 1000)
friend_ownership = lines.map(line_to_friend_ownership) \
    .filter(lambda ownership: len(ownership[1]) > 0)

# Map friend ownerships to instances of ((user_id, friend_id), VALUE).
# VALUE = 0 => pairs are already friends.
# VALUE = 1 => pairs have mutual friends.
friend_edges = friend_ownership.flatMap(friend_ownership_to_connection)
# NOTE(review): friend_edges is consumed only once in this cell, so caching it
# only pays off if another cell reuses it -- confirm.
friend_edges.cache()

# Filter pairs that are already friends: group all VALUEs of a pair and keep
# the pair only when none of them is 0. The surviving VALUEs are then
# flattened back to one record per VALUE so they can be summed per pair below.
# NOTE(review): the original author reported bugs when summing inside a map;
# possibly because builtin `sum` is shadowed by the star import from
# pyspark.sql.functions -- confirm.
mutual_friend = friend_edges.groupByKey() \
    .filter(lambda edge: 0 not in edge[1]) \
    .flatMap(lambda edge: [(edge[0], value) for value in edge[1]])

# Count mutual friends by adding up values
mutual_friend_counts = mutual_friend.reduceByKey(lambda left, right: left + right)

# Create the recommendation objects and group them by the user they are for.
recommendations = mutual_friend_counts \
    .flatMap(mutual_friend_count_to_recommendation) \
    .groupByKey()

# Sort each user's candidates and truncate them to the 10 most highly
# recommended, then order the result by user id.
recommendations10 = recommendations \
    .map(lambda rec: (rec[0], recommendation_to_sorted_truncated(list(rec[1])))) \
    .sortByKey()

# Include in your writeup the recommendations for the users with following user IDs: 924, 8941, 8942, 9019, 9020, 9021, 9022, 9990, 9992, 9993.
TARGET_USER_IDS = {924, 8941, 8942, 9019, 9020, 9021, 9022, 9990, 9992, 9993}
# The lambda parameter is named `rec` so it does not shadow the
# `recommendations` RDD defined above.
results = recommendations10.filter(lambda rec: rec[0] in TARGET_USER_IDS)


<IPython.core.display.Javascript object>

In [None]:
# Collect the filtered recommendations to the driver; as the last expression of
# the cell, the resulting list of (user_id, [recommended ids]) is displayed.
results.collect()

<IPython.core.display.Javascript object>

[(924, [439, 2409, 6995, 11860, 15416, 43748, 45881]),
 (8941, [8943, 8944, 8940]),
 (8942, [8939, 8940, 8943, 8944]),
 (9019, [9022, 317, 9023]),
 (9020, [9021, 9016, 9017, 9022, 317, 9023]),
 (9021, [9020, 9016, 9017, 9022, 317, 9023]),
 (9022, [9019, 9020, 9021, 317, 9016, 9017, 9023]),
 (9990, [13134, 13478, 13877, 34299, 34485, 34642, 37941]),
 (9992, [9987, 9989, 35667, 9991]),
 (9993, [9991, 13134, 13478, 13877, 34299, 34485, 34642, 37941])]