## **Social Network Analysis**

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # Install Java
!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz # Install Apache Spark
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark

import os
from pyspark.sql import SparkSession

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

import findspark
findspark.init()

# Initialize Spark session with GraphFrames package from the new repository
spark = SparkSession.builder \
    .appName("Venmo Transactions") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.0-s_2.12") \
    .config("spark.jars.repositories", "https://repos.spark-packages.org/") \
    .getOrCreate()

# Import GraphFrame after the session is created
from graphframes import GraphFrame



In [None]:
# Install the `graphframes` Python package with pip.
# NOTE(review): the setup cell above already runs `from graphframes import GraphFrame`,
# so on a fresh kernel this install happens after that import is attempted.
!pip install graphframes



In [None]:
# Install the `emoji` package, which the imports cell below loads.
!pip install emoji

Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/431.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/431.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: emoji
Successfully installed emoji-2.12.1


In [None]:
import pandas as pd
import re
import emoji
from collections import Counter
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lit, array, explode, collect_list, collect_set, array_distinct, flatten, array_remove, struct, count, sum as sql_sum, months_between, min, ceil, to_date, avg, stddev, size
from pyspark.sql.types import StringType, ArrayType, IntegerType, StructType, StructField, FloatType
import matplotlib.pyplot as plt
from pyspark.sql.window import Window

In [None]:
# Mount Google Drive at /content/drive so later cells can read the dataset
# stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the Venmo sample (a Parquet file, per its name) on the mounted Drive.
drive_path = '/content/drive/My Drive/VenmoSample.snappy.parquet'
# Load it as a Spark DataFrame; every later cell reads `venmo_data`.
venmo_data = spark.read.parquet(drive_path)

In [None]:
# Preview the first 20 rows to check the columns and their contents.
venmo_data.show(20)

+--------+-------+----------------+-------------------+--------------------+-----------+--------------------+
|   user1|  user2|transaction_type|           datetime|         description|is_business|            story_id|
+--------+-------+----------------+-------------------+--------------------+-----------+--------------------+
| 1218774|1528945|         payment|2015-11-27 10:48:19|                Uber|      false|5657c473cd03c9af2...|
| 5109483|4782303|         payment|2015-06-17 11:37:04|              Costco|      false|5580f9702b64f70ab...|
| 4322148|3392963|         payment|2015-06-19 07:05:31|        Sweaty balls|      false|55835ccb1a624b14a...|
|  469894|1333620|          charge|2016-06-03 23:34:13|                  🎥|      false|5751b185cd03c9af2...|
| 2960727|3442373|         payment|2016-05-29 23:23:42|                   ⚡|      false|574b178ecd03c9af2...|
| 3977544|2709470|         payment|2016-09-29 22:12:07|          Chipotlaid|      false|57ed2f4723e064eac...|
| 3766386|4

#### **Write a script to find a user’s friends and friends of friends (Friend definition: A user’s friend is someone who has transacted with the user, either sending money to the user or receiving money from the user). Describe your algorithm and calculate its computational complexity. Can you do it better?**


Algorithm:

- Filter Transactions: Extract all transactions involving the given user (either as the sender or receiver).

- Extract Friends: From the filtered transactions, identify distinct friends of the user. This is achieved by checking which user is not the given user in each transaction, as the other user in the transaction is considered a friend.

- Extract Friends of Friends: Join the identified friends with the original transaction data to find all transactions involving these friends.
From these transactions, identify the friends of friends by selecting users who are not the original friend in each transaction. Ensure that these friends of friends are distinct.

- Remove Duplicates: Remove any users who are both friends and friends of friends to ensure the lists are mutually exclusive.

In [None]:
def find_friends_and_friends_of_friends(user_id):
    """Return one user's friends and friends of friends.

    A friend is any user on the other side of a transaction with `user_id`,
    in either direction. Friends of friends are the friends of those friends,
    excluding both the user and the user's direct friends, so the two results
    never overlap and never contain `user_id`.

    Reads the notebook-level `venmo_data` DataFrame (columns `user1`, `user2`).

    Args:
        user_id: Identifier of the user to look up.

    Returns:
        tuple: `(friends, friends_of_friends)`, two DataFrames with the single
        columns `friend` and `friend_of_friend` respectively.
    """
    # Local import: `when` is not part of the notebook's import cell.
    from pyspark.sql import functions as F

    involves_user = (F.col("user1") == user_id) | (F.col("user2") == user_id)

    # The friend is whichever side of the transaction is not the user.
    # Column expressions are used instead of formatting `user_id` into SQL text.
    friends = (
        venmo_data
        .filter(involves_user)
        .select(
            F.when(F.col("user1") == user_id, F.col("user2"))
            .otherwise(F.col("user1"))
            .alias("friend")
        )
        # A self-payment would otherwise make the user their own friend.
        .filter(F.col("friend") != user_id)
        .distinct()
    )

    # Every transaction touching a friend contributes the friend's counterparty.
    touches_friend = (F.col("friend") == F.col("user1")) | (F.col("friend") == F.col("user2"))
    friends_of_friends = (
        friends
        .join(venmo_data, touches_friend)
        .select(
            F.when(F.col("user1") == F.col("friend"), F.col("user2"))
            .otherwise(F.col("user1"))
            .alias("friend_of_friend")
        )
        # Each friend's transactions include the ones with the user, so the
        # user must be dropped explicitly.
        .filter(F.col("friend_of_friend") != user_id)
        .distinct()
        # Keep the two lists mutually exclusive.
        .subtract(friends)
    )

    return friends, friends_of_friends

# Try the lookup on one sample user and print both result sets.
user_id = 1218774
user_friends, user_friends_of_friends = find_friends_and_friends_of_friends(user_id)
for heading, frame in (
    ("User's Friends:", user_friends),
    ("User's Friends of Friends:", user_friends_of_friends),
):
    print(heading)
    frame.show()

User's Friends:
+-------+
| friend|
+-------+
|6784812|
|2299797|
|1528945|
| 825037|
|2248062|
+-------+

User's Friends of Friends:
+----------------+
|friend_of_friend|
+----------------+
|         2366018|
|          974023|
|         1218774|
|          473667|
|         1282126|
|          502595|
|         3485121|
|         3703726|
|         4873067|
|         2420441|
|         4159821|
|         4616676|
|         8718297|
|         2468982|
|        10549093|
|         2097475|
|         8747467|
|        11265984|
|         4445092|
|          694525|
+----------------+
only showing top 20 rows



Computational Complexity:

- Filtering Transactions: This step involves scanning the entire dataset to find transactions involving the user. If there are T transactions, this step is O(T).

- Extracting Friends: Extracting distinct friends involves checking each transaction involving the user, leading to a complexity of O(F), where F is the number of transactions involving the user. This is an upper bound on the user's number of distinct friends, so F is used for both quantities below.

- Extracting Friends of Friends: This step involves joining the friends with the transaction dataset and can be approximated as O(F×T), where F is the number of friends and T is the number of transactions.

- Removing Duplicates: Removing duplicates involves a set difference operation, which is O(FF), where FF is the number of friends of friends.

We can improve this algorithm by reducing redundant operations and ensuring efficient joins. One possible optimization is to limit the dataset size in joins by pre-filtering relevant transactions.

Complexity of the improved algorithm:

- Filtering Transactions:
O(T) for initial user transactions and filtering transactions for friends.

- Extracting Friends: O(F).

- Extracting Friends of Friends: Reduced to O(T′), where T′ is the subset of transactions involving friends, typically much smaller than T.

- Removing Duplicates: O(FF).

Improved code:

In [None]:
def find_friends_and_friends_of_friends_optimized(user_id):
    """Return one user's friends and friends of friends without a join.

    The friend ids are collected to the driver and used to pre-filter the
    transactions, so the second step is a single filtered scan of `venmo_data`.
    Friends of friends exclude both the user and the user's direct friends.

    Reads the notebook-level `venmo_data` DataFrame (columns `user1`, `user2`).

    Args:
        user_id: Identifier of the user to look up.

    Returns:
        tuple: `(friends, friends_of_friends)`, two DataFrames with the single
        columns `friend` and `friend_of_friend` respectively. Both are empty
        when the user has no transactions with other users.
    """
    # Local import: `when` is not part of the notebook's import cell.
    from pyspark.sql import functions as F

    involves_user = (F.col("user1") == user_id) | (F.col("user2") == user_id)

    # The friend is whichever side of the transaction is not the user.
    friends = (
        venmo_data
        .filter(involves_user)
        .select(
            F.when(F.col("user1") == user_id, F.col("user2"))
            .otherwise(F.col("user1"))
            .alias("friend")
        )
        # A self-payment would otherwise make the user their own friend.
        .filter(F.col("friend") != user_id)
        .distinct()
    )

    friend_ids = [row.friend for row in friends.collect()]
    if not friend_ids:
        # No friends means no friends of friends. Returning early also avoids
        # the empty `in ()` list the SQL-string version produced, which is not
        # valid SQL.
        no_friends_of_friends = venmo_data.select(
            F.col("user1").alias("friend_of_friend")
        ).limit(0)
        return friends, no_friends_of_friends

    user1_is_friend = F.col("user1").isin(friend_ids)
    user2_is_friend = F.col("user2").isin(friend_ids)

    friends_of_friends = (
        venmo_data
        .filter(user1_is_friend | user2_is_friend)
        .select(
            F.when(user1_is_friend, F.col("user2"))
            .otherwise(F.col("user1"))
            .alias("friend_of_friend")
        )
        # Drop direct friends and the user using the ids already on the
        # driver; `subtract(friends)` would recompute `friends` and still
        # leave the user in the result.
        .filter(~F.col("friend_of_friend").isin(friend_ids + [user_id]))
        .distinct()
    )

    return friends, friends_of_friends

# Try the optimized lookup on the same sample user and print both result sets.
user_id = 1218774
user_friends, user_friends_of_friends = find_friends_and_friends_of_friends_optimized(user_id)
for heading, frame in (
    ("User's Friends:", user_friends),
    ("User's Friends of Friends:", user_friends_of_friends),
):
    print(heading)
    frame.show()

User's Friends:
+-------+
| friend|
+-------+
|6784812|
|2299797|
|1528945|
| 825037|
|2248062|
+-------+

User's Friends of Friends:
+----------------+
|friend_of_friend|
+----------------+
|         2366018|
|          974023|
|         1094209|
|         1218774|
|          473667|
|          963800|
|         3953567|
|         1282126|
|          502595|
|         3485121|
|         3703726|
|         1682016|
|         2760507|
|         4873067|
|         2420441|
|         1597784|
|         1098208|
|          921063|
|         4159821|
|         3414812|
+----------------+
only showing top 20 rows



#### **Now that you have the list of each user’s friends and friends of friends, you are in a position to calculate many social network variables. Use the dynamic analysis from before, and calculate the following social network metrics across a user’s lifetime in Venmo (from 0 up to 12 months).**

#### **i) Number of friends and number of friends of friends.**

In [None]:
from pyspark.sql.functions import month, year
from datetime import datetime
from dateutil.relativedelta import relativedelta
import calendar

def _friends_and_friends_of_friends_in(transactions, user_id):
    """Return `(friends, friends_of_friends)` of `user_id` within `transactions` only."""
    from pyspark.sql import functions as F

    involves_user = (F.col("user1") == user_id) | (F.col("user2") == user_id)
    friends = (
        transactions
        .filter(involves_user)
        .select(
            F.when(F.col("user1") == user_id, F.col("user2"))
            .otherwise(F.col("user1"))
            .alias("friend")
        )
        .filter(F.col("friend") != user_id)  # ignore self-payments
        .distinct()
    )

    touches_friend = (F.col("friend") == F.col("user1")) | (F.col("friend") == F.col("user2"))
    friends_of_friends = (
        friends
        .join(transactions, touches_friend)
        .select(
            F.when(F.col("user1") == F.col("friend"), F.col("user2"))
            .otherwise(F.col("user1"))
            .alias("friend_of_friend")
        )
        .filter(F.col("friend_of_friend") != user_id)  # the user is not their own friend of a friend
        .distinct()
        .subtract(friends)  # keep the two sets disjoint
    )
    return friends, friends_of_friends


def calculate_social_network_metrics(user_id, start_date=None, venmo_data=None):
    """Count a user's friends and friends of friends for 13 consecutive months.

    The months run backwards from the month of `start_date` (offset 0) to
    twelve months earlier (offset 12). Each month's counts are computed from
    the transactions dated in that calendar month only.

    Args:
        user_id: Identifier of the user to analyse.
        start_date: `datetime` inside the most recent month to report.
            Defaults to the current time, so pass an explicit date when the
            data does not reach the present.
        venmo_data: Transactions DataFrame with `user1`, `user2` and
            `datetime` columns. Required in practice despite the `None` default.

    Returns:
        tuple: `(num_friends_per_month, num_friends_of_friends_per_month)`,
        two lists of `(first_day_of_month 'YYYY-MM-DD', count)` tuples, most
        recent month first.
    """
    # Local import: the bare names `month`/`count` may be rebound by other
    # notebook cells, so the functions are reached through the module.
    from pyspark.sql import functions as F

    if start_date is None:
        start_date = datetime.now()

    num_friends_per_month = []
    num_friends_of_friends_per_month = []

    # Offsets 0..12 give thirteen calendar months, newest first.
    for months_back in range(13):
        current_date = start_date - relativedelta(months=months_back)
        start_date_str = current_date.replace(day=1).strftime('%Y-%m-%d')

        # Keep only this calendar month's transactions ...
        monthly_transactions = (
            venmo_data
            .filter(
                (F.year("datetime") == current_date.year)
                & (F.month("datetime") == current_date.month)
            )
            .select("user1", "user2")
        )

        # ... and derive the network from them, not from the full history;
        # using the full history made every month report identical counts.
        monthly_friends, monthly_friends_of_friends = _friends_and_friends_of_friends_in(
            monthly_transactions, user_id
        )

        num_friends = monthly_friends.count()
        # Without friends there cannot be friends of friends; skip the job.
        num_friends_of_friends = monthly_friends_of_friends.count() if num_friends else 0

        num_friends_per_month.append((start_date_str, num_friends))
        num_friends_of_friends_per_month.append((start_date_str, num_friends_of_friends))

    return num_friends_per_month, num_friends_of_friends_per_month

# Sample user_id
user_id = 1218774

# Calculate social network metrics for the sample user
num_friends_per_month, num_friends_of_friends_per_month = calculate_social_network_metrics(user_id, venmo_data=venmo_data)

# Print the results.
# The loop variables are deliberately not called `month`/`count`: those names
# are PySpark functions imported by this notebook, and rebinding them here
# breaks any later cell (or re-run) that calls them.
print("Number of Friends per Month:")
for month_label, friend_count in num_friends_per_month:
    print("Month {}: {}".format(month_label, friend_count))

print("\nNumber of Friends of Friends per Month:")
for month_label, friend_of_friend_count in num_friends_of_friends_per_month:
    print("Month {}: {}".format(month_label, friend_of_friend_count))

Number of Friends per Month:
Month 2024-06-01: 5
Month 2024-05-01: 5
Month 2024-04-01: 5
Month 2024-03-01: 5
Month 2024-02-01: 5
Month 2024-01-01: 5
Month 2023-12-01: 5
Month 2023-11-01: 5
Month 2023-10-01: 5
Month 2023-09-01: 5
Month 2023-08-01: 5
Month 2023-07-01: 5
Month 2023-06-01: 5

Number of Friends of Friends per Month:
Month 2024-06-01: 36
Month 2024-05-01: 36
Month 2024-04-01: 36
Month 2024-03-01: 36
Month 2024-02-01: 36
Month 2024-01-01: 36
Month 2023-12-01: 36
Month 2023-11-01: 36
Month 2023-10-01: 36
Month 2023-09-01: 36
Month 2023-08-01: 36
Month 2023-07-01: 36
Month 2023-06-01: 36


Extracting unique user ids and creating a dataframe with the number of friends and friends of friends.

In [None]:
# Every id that appears on either side of a transaction, exactly once.
user1_ids = venmo_data.select(col("user1").alias("user_id"))
user2_ids = venmo_data.select(col("user2").alias("user_id"))
user_ids = user1_ids.union(user2_ids).distinct()

# Function to get friends and friends of friends
def find_friends_and_friends_of_friends(user_id, venmo_data):
    """Return the friends and friends of friends of `user_id` in `venmo_data`.

    NOTE: this rebinds the name of the one-argument function defined earlier
    in the notebook; after this cell runs, calls must pass `venmo_data`.

    Args:
        user_id: Identifier of the user to look up.
        venmo_data: Transactions DataFrame with `user1` and `user2` columns.

    Returns:
        tuple: `(friends, friends_of_friends)`, two DataFrames with the single
        columns `direct_friend` and `friend_of_friend`. Friends of friends
        exclude the user and the user's direct friends.
    """
    pairs = venmo_data.select(col("user1"), col("user2"))
    both_parties = array(col("user1"), col("user2"))

    # Direct friends: everyone sharing a transaction with the user, minus the user.
    involves_user = (col("user1") == user_id) | (col("user2") == user_id)
    friends = (
        pairs
        .filter(involves_user)
        .select(explode(both_parties).alias("direct_friend"))
        .filter(col("direct_friend") != user_id)
        .distinct()
    )

    # Friends of friends: everyone sharing a transaction with a friend, minus
    # the friends themselves and the user.
    friends_list = [row.direct_friend for row in friends.collect()]
    touches_friend = col("user1").isin(friends_list) | col("user2").isin(friends_list)
    already_known = friends_list + [user_id]
    friends_of_friends = (
        pairs
        .filter(touches_friend)
        .select(explode(both_parties).alias("friend_of_friend"))
        .filter(~col("friend_of_friend").isin(already_known))
        .distinct()
    )

    return friends, friends_of_friends

# Calculate metrics for each user in one distributed computation.
# The previous driver-side loop started several Spark jobs per user, which
# does not scale to every id in `user_ids`. The intermediate Python list
# `results` is therefore no longer built; use `results_df.collect()` if a
# local list is needed.
from pyspark.sql import functions as F  # module alias: bare `count` is rebound by an earlier cell's loop

columns = ["user_id", "number_of_friends", "number_of_friends_of_friends"]

# Undirected friendships as ordered (user_id, friend) pairs, self-payments removed.
friendships = (
    venmo_data.select(F.col("user1").alias("user_id"), F.col("user2").alias("friend"))
    .union(venmo_data.select(F.col("user2").alias("user_id"), F.col("user1").alias("friend")))
    .filter(F.col("user_id") != F.col("friend"))
    .distinct()
)

friend_counts = friendships.groupBy("user_id").agg(
    F.count("friend").alias("number_of_friends")
)

# Second hop: for each friend, that friend's own friends.
second_hop = friendships.select(
    F.col("user_id").alias("friend"),
    F.col("friend").alias("friend_of_friend"),
)
two_hop_pairs = (
    friendships.join(second_hop, "friend")
    .select("user_id", "friend_of_friend")
    .filter(F.col("user_id") != F.col("friend_of_friend"))  # a user is not their own friend of a friend
    .distinct()
)

# Remove pairs that are already direct friends, then count per user.
direct_pairs = friendships.select("user_id", F.col("friend").alias("friend_of_friend"))
friend_of_friend_counts = (
    two_hop_pairs.join(direct_pairs, ["user_id", "friend_of_friend"], "left_anti")
    .groupBy("user_id")
    .agg(F.count("friend_of_friend").alias("number_of_friends_of_friends"))
)

# Create a DataFrame with one row per user; users without (second-hop) friends get 0.
results_df = (
    user_ids
    .join(friend_counts, "user_id", "left")
    .join(friend_of_friend_counts, "user_id", "left")
    .na.fill(0, subset=columns[1:])
    .select(*columns)
)

# Show the results DataFrame
results_df.show()

#### **ii) Clustering coefficient of a user's network**

In [None]:
import networkx as nx

def calculate_clustering_coefficient(user_id, venmo_data):
    """Return the local clustering coefficient of `user_id` in the transaction graph.

    Users are nodes, and an undirected edge joins two users who transacted
    with each other. A node's coefficient depends only on its neighbours and
    the edges among them, so only that ego network is collected to the driver
    instead of every edge in the dataset.

    Args:
        user_id: Identifier of the user to analyse.
        venmo_data: Transactions DataFrame with `user1` and `user2` columns.

    Returns:
        The value of `nx.clustering` for `user_id`.

    Raises:
        networkx.NetworkXError: If `user_id` appears in no transaction, as
            before (the node is then absent from the graph).
    """
    pairs = venmo_data.select("user1", "user2").distinct()

    # Pass 1: the user's own edges give the members of the ego network.
    involves_user = (col("user1") == user_id) | (col("user2") == user_id)
    ego_members = {user_id}
    for edge in pairs.filter(involves_user).collect():
        ego_members.update((edge["user1"], edge["user2"]))
    member_ids = list(ego_members)

    # Pass 2: every edge with both endpoints inside the ego network. This
    # includes the user's own edges and all edges between the user's friends.
    inside_ego = col("user1").isin(member_ids) & col("user2").isin(member_ids)
    ego_edges = pairs.filter(inside_ego).collect()

    # Build the small undirected graph and let NetworkX compute the coefficient.
    G = nx.Graph()
    G.add_edges_from((edge["user1"], edge["user2"]) for edge in ego_edges)

    return nx.clustering(G, user_id)

# Compute and report the clustering coefficient for one sample user.
user_id = 1218774
clustering_coefficient = calculate_clustering_coefficient(user_id, venmo_data)
clustering_summary = f"Clustering Coefficient for User {user_id}: {clustering_coefficient:.4f}"
print(clustering_summary)

Clustering Coefficient for User 1218774: 0.2000


#### **iii) Calculate the page rank of each user**

We ran into installation issues getting GraphFrames to run in this environment. The cell below implements the intended PageRank logic, but it has not been verified end to end.

In [None]:
# GraphFrame inputs: vertices need an `id` column, edges need `src` and `dst`.
user1_vertices = venmo_data.selectExpr("user1 as id")
user2_vertices = venmo_data.selectExpr("user2 as id")
vertices = user1_vertices.union(user2_vertices).distinct()
edges = venmo_data.selectExpr("user1 as src", "user2 as dst").distinct()

# Build the graph and run PageRank for 10 iterations with reset probability 0.15.
graph = GraphFrame(vertices, edges)
pagerank_results = graph.pageRank(resetProbability=0.15, maxIter=10)

# Show the vertices with the highest PageRank first.
ranked_vertices = pagerank_results.vertices.orderBy("pagerank", ascending=False)
ranked_vertices.show()

# Look up the PageRank of a single user
def calculate_user_pagerank(user_id, pagerank_results):
    """Return the PageRank of `user_id`, or None when the user is not a vertex.

    Args:
        user_id: Vertex id to look up.
        pagerank_results: Object whose `vertices` frame has `id` and
            `pagerank` columns (here, the value returned by `graph.pageRank`).

    Returns:
        The `pagerank` value of the first matching vertex, or None if no
        vertex matches.
    """
    vertex_frame = pagerank_results.vertices
    matches = vertex_frame.filter(vertex_frame.id == user_id).select("pagerank").collect()
    return matches[0]["pagerank"] if matches else None

# Report the PageRank of one sample user, or say that the user is unknown.
user_id = 1218774
user_pagerank = calculate_user_pagerank(user_id, pagerank_results)

if user_pagerank is not None:
    print(f"PageRank for User {user_id}: {user_pagerank:.4f}")
else:
    print(f"User {user_id} not found.")