## Flags per Video

For each video, find how many unique users flagged it. A unique user can be identified using the combination of their first name and last name. Do not consider rows in which there is no flag ID.
<br> <br>
Table: user_flags

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pandas as pd

import os

In [2]:
# Fall back to a local JDK 11 install only when JAVA_HOME is not already set,
# so a machine that has its own JAVA_HOME configured is not overridden by
# this Windows-specific path.
os.environ.setdefault('JAVA_HOME', "C:/Program Files/Java/jdk-11")

# Reuse an active session if one exists, otherwise start a new one.
spark = SparkSession.builder.appName('Flags per Video').getOrCreate()

In [3]:
# Read the user_flags table from CSV: the first row holds the column names
# and column types are inferred from the data.
df = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load('../Data/user_flags.csv')
)

df.show()

+--------------+-------------+-----------+-------+
|user_firstname|user_lastname|   video_id|flag_id|
+--------------+-------------+-----------+-------+
|       Richard|       Hasson|y6120QOlsfU| 0cazx3|
|          Mark|          May|Ct6BUPvE2sM| 1cn76u|
|          Gina|       Korman|dQw4w9WgXcQ| 1i43zk|
|          Mark|          May|Ct6BUPvE2sM| 1n0vef|
|          Mark|          May|jNQXAC9IVRw| 1sv6ib|
|          Gina|       Korman|dQw4w9WgXcQ| 20xekb|
|          Mark|          May|5qap5aO4i9A| 4cvwuv|
|        Daniel|         Bell|5qap5aO4i9A| 4sd6dv|
|       Richard|       Hasson|y6120QOlsfU| 6jjkvn|
|       Pauline|        Wilks|jNQXAC9IVRw| 7ks264|
|      Courtney|         NULL|dQw4w9WgXcQ|   NULL|
|         Helen|        Hearn|dQw4w9WgXcQ| 8946nx|
|          Mark|      Johnson|y6120QOlsfU| 8wwg0l|
|       Richard|       Hasson|dQw4w9WgXcQ| arydfd|
|          Gina|       Korman|       NULL|   NULL|
|          Mark|      Johnson|y6120QOlsfU| bl40qw|
|       Richard|       Hasson|d

In [28]:
# Number of unique users who flagged each video, most-flagged first.
#
# A user is identified by the (first name, last name) pair. The pair is
# counted directly rather than through a concatenated string: joining the two
# names without a separator would merge distinct users whose names happen to
# concatenate to the same text (e.g. "Anna" + "Lee" vs "Ann" + "Alee").
result = (
    df
    # Only rows that carry an actual flag on an actual video are considered.
    .filter(F.col('flag_id').isNotNull() & F.col('video_id').isNotNull())
    # A multi-column distinct count skips rows where any counted column is
    # NULL, so missing name parts are replaced with a placeholder first to
    # keep users with a partial name in the count.
    .fillna({'user_firstname': ' ', 'user_lastname': ' '})
    .groupBy('video_id')
    .agg(
        F.countDistinct('user_firstname', 'user_lastname')
        .alias('num_unique_users')
    )
    .orderBy(F.col('num_unique_users').desc())
)

result.show()

+-----------+----------------+
|   video_id|num_unique_users|
+-----------+----------------+
|y6120QOlsfU|               5|
|dQw4w9WgXcQ|               5|
|jNQXAC9IVRw|               3|
|5qap5aO4i9A|               2|
|Ct6BUPvE2sM|               2|
+-----------+----------------+



In [27]:
# Another solution using concat_ws.
#
# concat_ws joins the name parts with a space and skips NULL parts, so no
# fillna step is needed before building the full name.
result1 = (
    df
    # Ignore rows without a flag; also ignore rows without a video so that a
    # NULL video_id never shows up as its own group (same rule as the first
    # solution).
    .filter(F.col('flag_id').isNotNull() & F.col('video_id').isNotNull())
    .withColumn(
        'full_name',
        F.concat_ws(' ', F.col('user_firstname'), F.col('user_lastname'))
    )
    .groupBy('video_id')
    .agg(F.countDistinct('full_name'))
)

result1.show()

+-----------+-------------------------+
|   video_id|count(DISTINCT full_name)|
+-----------+-------------------------+
|y6120QOlsfU|                        5|
|5qap5aO4i9A|                        2|
|Ct6BUPvE2sM|                        2|
|dQw4w9WgXcQ|                        5|
|jNQXAC9IVRw|                        3|
+-----------+-------------------------+

