In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 53.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=d91d8103d7dfa412abce33cc92dd904059b69cdad798ea8e7f6ebc1f10c1fd24
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists
# (getOrCreate). It runs in local mode and serves the Spark UI on port 4050.
spark = SparkSession.builder \
    .master('local') \
    .appName('dataframe_withcolumn') \
    .config('spark.ui.port', '4050') \
    .getOrCreate()

In [4]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [None]:
# Load the countries CSV: the first line is the header, fields are separated
# by commas, and Spark infers each column's type from the data.
# NOTE(review): the path points into a mounted Google Drive, but no cell visible
# here mounts it — confirm the mount step exists before a Restart & Run All.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .load("/content/drive/MyDrive/countries_general_info_historical.24-10-2021.csv")
)

# Print the first rows as a quick check of the load.
df.show()

In [40]:
# Rename some columns: the three Iso3166P1* code columns get shorter names.
df1 = (
    df
    .withColumnRenamed("Iso3166P1Alpha2Code", "siglaPais2")
    .withColumnRenamed("Iso3166P1Alpha3Code", "siglaPais3")
    .withColumnRenamed("Iso3166P1NumericCode", "NumericCode")
)

df1.show()

+--------------------+--------------------+-----------+----------+----------+-----------+-------------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+
|                Name|          NativeName|CallingCode|siglaPais2|siglaPais3|NumericCode|               Isni|Population, 2010|Population, 2011|Population, 2012|Population, 2013|Population, 2014|Population, 2015|Population, 2016|Population, 2017|Population, 2018|Population, 2019|
+--------------------+--------------------+-----------+----------+----------+-----------+-------------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+
|              Canada|              Canada|          1|        CA|       CAN|        124|0000 0001 2238 5050|        34004889|        34339328|        34714222|

In [None]:
# Show the countries that had more than 10 million inhabitants in 2012.
(
    df1
    .select("Name", "Population, 2012")
    .where(F.col("Population, 2012") > 10_000_000)
    .show(50)
)

In [None]:
# Show every column, but only for the countries whose name starts with "B".
# startswith() anchors the match at the first character; contains() would also
# return names that merely include a "B" somewhere (e.g. "Antigua and Barbuda").
df1.filter(F.col("Name").startswith("B")).show()

In [None]:
# Rank the countries by number of inhabitants in 2019, largest first.
(
    df1
    .select(
        F.col("Name").alias("paisesComMaisHabitantes"),
        F.col("Population, 2019"),
    )
    .sort(F.col("Population, 2019").desc())
    .show()
)

In [None]:
# Add a new column describing the size of each country's 2019 population:
#   - more than 1 million inhabitants -> "População com mais de 1 milhão de habitantes"
#   - otherwise                       -> "População com menos de 1 milhão de habitantes"
# Rows whose population is null do not satisfy the comparison, so they take the
# otherwise() label as well.
df2 = df1.withColumn(
    "popMaiorQueUmMilhao",
    F.when(
        F.col("Population, 2019") > 1000000,
        "População com mais de 1 milhão de habitantes",
    ).otherwise("População com menos de 1 milhão de habitantes"),
)

# truncate=False: show() cuts strings at 20 characters by default, which would
# clip both labels.
(
    df2
    .select(F.col("Name"), F.col("Population, 2019"), F.col("popMaiorQueUmMilhao"))
    .show(50, truncate=False)
)

In [None]:
# Show the sum of the population for the year 2017.

# Sum of the population per country (one output row per distinct Name).
(
    df1
    .groupBy("Name")
    .agg(F.sum("Population, 2017").alias("somaPopulacaoPais2017"))
    .show()
)

# Overall sum of the population across all rows.
df1.agg(F.sum("Population, 2017").alias("somaPopulacaoGeral2017")).show()
