## groupBy function


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType, ArrayType

data = [
    (("James","","Smith"),["Java","Scala","C++"],"OH","M"),
    (("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
    (("Julia","","Williams"),["CSharp","VB"],"OH","F"),
    (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
    (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
    (("Mike","Mary","Williams"),["Python","VB"],"OH","M")
 ]

schema = StructType([
    StructField("Name",StructType([
        StructField("Firstname",StringType()),
        StructField("Middlename", StringType()),
        StructField("Lastname", StringType())
        ])),
    StructField("Languages", ArrayType(StringType())),
    StructField("Country", StringType()),
    StructField("Gender", StringType())
                ])
                
spark = SparkSession.builder.appName("Spark filter").getOrCreate()

df = spark.createDataFrame(data = data, schema = schema)
df.printSchema()
df.show(truncate = False)

root
 |-- Name: struct (nullable = true)
 |    |-- Firstname: string (nullable = true)
 |    |-- Middlename: string (nullable = true)
 |    |-- Lastname: string (nullable = true)
 |-- Languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Country: string (nullable = true)
 |-- Gender: string (nullable = true)

+----------------------+------------------+-------+------+
|Name                  |Languages         |Country|Gender|
+----------------------+------------------+-------+------+
|{James, , Smith}      |[Java, Scala, C++]|OH     |M     |
|{Anna, Rose, }        |[Spark, Java, C++]|NY     |F     |
|{Julia, , Williams}   |[CSharp, VB]      |OH     |F     |
|{Maria, Anne, Jones}  |[CSharp, VB]      |NY     |M     |
|{Jen, Mary, Brown}    |[CSharp, VB]      |NY     |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH     |M     |
+----------------------+------------------+-------+------+



In [2]:
df.groupBy('Country').count().show(truncate=False)

+-------+-----+
|Country|count|
+-------+-----+
|OH     |3    |
|NY     |3    |
+-------+-----+



In [3]:
df.groupBy('Country','Gender').count().show(truncate=False)

+-------+------+-----+
|Country|Gender|count|
+-------+------+-----+
|OH     |M     |2    |
|NY     |F     |1    |
|OH     |F     |1    |
|NY     |M     |2    |
+-------+------+-----+



## groupBy agg() function
- Pyspark groupBy agg() function is used to apply more then one aggregate at a time on grouped DataFrame
  

In [6]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[2]').appName("Spark_Exe").getOrCreate()
print(spark)
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]

columns = ["firstname","middlename","lastname","dob","gender","salary"] 
df2 = spark.createDataFrame(data = data,schema = columns)
print(df2.printSchema())
df2.show() # to get the default 20 rows of data

<pyspark.sql.session.SparkSession object at 0x000002405A4109D0>
root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

None
+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [7]:
help(df2.groupBy('salary').agg)

Help on method agg in module pyspark.sql.group:

agg(*exprs: Union[pyspark.sql.column.Column, Dict[str, str]]) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.group.GroupedData instance
    Compute aggregates and returns the result as a :class:`DataFrame`.
    
    The available aggregate functions can be:
    
    1. built-in aggregation functions, such as `avg`, `max`, `min`, `sum`, `count`
    
    2. group aggregate pandas UDFs, created with :func:`pyspark.sql.functions.pandas_udf`
    
       .. note:: There is no partial aggregation with group aggregate UDFs, i.e.,
           a full shuffle is required. Also, all the data of a group will be loaded into
           memory, so the user should be aware of the potential OOM risk if data is skewed
           and certain groups are too large to fit in memory.
    
       .. seealso:: :func:`pyspark.sql.functions.pandas_udf`
    
    If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
    is the co

In [9]:
df2.groupBy('gender').count().show()

+------+-----+
|gender|count|
+------+-----+
|     M|    3|
|     F|    2|
+------+-----+



In [17]:
from pyspark.sql.functions import count, max,min
df2.groupBy('gender').agg(count("*").alias('gender_count')).show()

+------+------------+
|gender|gender_count|
+------+------------+
|     M|           3|
|     F|           2|
+------+------------+



In [20]:
from pyspark.sql.functions import count, max,min
df2.groupBy('gender').agg(count("salary").alias('gender_count'),\
                         min('salary').alias('min_salary'),\
                         max('salary').alias('max_salary')).show()

+------+------------+----------+----------+
|gender|gender_count|min_salary|max_salary|
+------+------------+----------+----------+
|     M|           3|      3000|      4000|
|     F|           2|        -1|      4000|
+------+------------+----------+----------+



In [21]:
df2.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [24]:
from pyspark.sql.functions import count, max, min 
df2.groupBy('gender').agg(sort(count("*").alias('gender_count'))).show()

NameError: name 'sort' is not defined

In [27]:
class Solution:
    def CountPairs(self, N, k, arr):        
        i = 0 
        j = N-1
        m = 0 
        count = 0
        while m <= (N//2)+1:
            if arr[i] == arr[j] and i < j and (i+j+2)% k == 0 :
                i +=1
                j -=1
                m +=1
                count += 1
                temp = j
            else:
                if i == j:
                    i +=1
                    j = temp - 1
                else:
                    j -=1
        return count
cls = Solution()
res = cls.CountPairs(5, [1, 2, 3, 2, 1], 3)
print(res)

TypeError: 'int' object is not subscriptable

In [None]:
1 1 1 1 1 1