In [10]:
import pandas as pd
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum,avg,max, col 



In [11]:
# Create the Spark session for this notebook, or reuse one that is already
# running under the same builder settings.
spark = (
    SparkSession.builder
    .appName("spark-practice")
    .getOrCreate()
)

In [12]:
# Load the countries CSV: the first row supplies the column names and the
# column types are inferred from the data instead of defaulting to string.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("countries_dataset.csv")
)
# Preview the first five rows.
df.show(n=5)

+----+---+-------------+-------------+----------+------------+------------------+--------------+
|Rank| ID|      Country|    Continent|Population|     IMF_GDP|            UN_GDP|GDP_per_capita|
+----+---+-------------+-------------+----------+------------+------------------+--------------+
|   1|840|United States|North America| 339996.56| 2.669515E13|      1.8624475E13|      78515.94|
|   2|156|        China|         Asia|1425671.35|2.1865482E13|1.1218281029298E13|      15336.97|
|   3|392|        Japan|         Asia| 123294.51| 5.291351E12| 4.936211827875E12|      42916.35|
|   4|276|      Germany|       Europe|  83294.63| 4.564778E12| 3.477796274497E12|      54802.79|
|   5|356|        India|         Asia|1428627.66|  3.89367E12| 2.259642382872E12|       2725.46|
+----+---+-------------+-------------+----------+------------+------------------+--------------+
only showing top 5 rows



In [13]:
# Inspect the (column name, Spark type) pairs of the loaded DataFrame; the
# types come from the inferSchema=True read above.
df.dtypes

[('Rank', 'int'),
 ('ID', 'int'),
 ('Country', 'string'),
 ('Continent', 'string'),
 ('Population', 'double'),
 ('IMF_GDP', 'double'),
 ('UN_GDP', 'double'),
 ('GDP_per_capita', 'double')]

In [14]:
# List the column names of the loaded DataFrame.
df.columns

['Rank',
 'ID',
 'Country',
 'Continent',
 'Population',
 'IMF_GDP',
 'UN_GDP',
 'GDP_per_capita']

In [15]:
# Number of rows (countries) recorded for each continent.
(
    df.groupBy("Continent")
    .count()
    .show()
)

+-------------+-----+
|    Continent|count|
+-------------+-----+
|       Europe|   50|
|       Africa|   54|
|North America|   34|
|South America|   12|
|      Oceania|   17|
|         Asia|   45|
+-------------+-----+



In [16]:
# Average IMF GDP per continent. GroupedData.avg is the function that
# GroupedData.mean aliases, so the result column is still "avg(IMF_GDP)".
(
    df.groupBy("Continent")
    .avg("IMF_GDP")
    .show()
)

+-------------+--------------------+
|    Continent|        avg(IMF_GDP)|
+-------------+--------------------+
|       Europe|        5.2915196E11|
|       Africa|5.739488888888888...|
|North America|9.144699117647059E11|
|South America|        3.2317975E11|
|      Oceania|1.264089411764705...|
|         Asia|9.742229333333334E11|
+-------------+--------------------+



In [17]:
# Per-continent summary of IMF GDP: mean, total and largest single value.
# NOTE: avg/sum/max here are the pyspark.sql.functions column aggregates
# imported at the top of the notebook, not the Python builtins.
imf_gdp_summary = [
    avg("IMF_GDP").alias("avg_imf_gdp"),
    sum("IMF_GDP").alias("sum_imf_gdp"),
    max("IMF_GDP").alias("max_imf_gdp"),
]
# truncate=False prints the full-precision values instead of cutting them at 20 chars.
df.groupBy("Continent").agg(*imf_gdp_summary).show(truncate=False)

+-------------+---------------------+------------+------------+
|Continent    |avg_imf_gdp          |sum_imf_gdp |max_imf_gdp |
+-------------+---------------------+------------+------------+
|Europe       |5.2915196E11         |2.6457598E13|4.564778E12 |
|Africa       |5.7394888888888885E10|3.099324E12 |5.80121E11  |
|North America|9.144699117647059E11 |3.1091977E13|2.669515E13 |
|South America|3.2317975E11         |3.878157E12 |1.980483E12 |
|Oceania      |1.2640894117647058E11|2.148952E12 |1.828394E12 |
|Asia         |9.742229333333334E11 |4.3840032E13|2.1865482E13|
+-------------+---------------------+------------+------------+



In [25]:
# Per-continent IMF GDP summary (mean, total, max) for every continent except
# Europe.
#
# The exclusion is applied to the input rows *before* grouping, and written as
# a direct `!=` comparison instead of a negated equality. Because the predicate
# only involves the grouping key, the result is the same as filtering the
# aggregated frame, but the cell now reads in the order the work is meant to
# happen: drop Europe, then summarise.
#
# NOTE: avg/sum/max are the pyspark.sql.functions column aggregates imported
# at the top of the notebook, not the Python builtins.
(
    df.where(col("Continent") != "Europe")
    .groupBy("Continent")
    .agg(
        avg("IMF_GDP").alias("avg_imf_gdp"),
        sum("IMF_GDP").alias("sum_imf_gdp"),
        max("IMF_GDP").alias("max_imf_gdp"),
    )
    .show()
)



+-------------+--------------------+------------+------------+
|    Continent|         avg_imf_gdp| sum_imf_gdp| max_imf_gdp|
+-------------+--------------------+------------+------------+
|       Africa|5.739488888888888...| 3.099324E12|  5.80121E11|
|North America|9.144699117647059E11|3.1091977E13| 2.669515E13|
|South America|        3.2317975E11| 3.878157E12| 1.980483E12|
|      Oceania|1.264089411764705...| 2.148952E12| 1.828394E12|
|         Asia|9.742229333333334E11|4.3840032E13|2.1865482E13|
+-------------+--------------------+------------+------------+

