In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *

# Start (or reuse) a local Spark session for this notebook
spark = (
    SparkSession.builder
    .master("local")
    .appName("PySpark")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

sc = spark.sparkContext

# Read the CSV as plain text and split every line into its comma-separated fields
rdd = sc.textFile('Salary_Data.csv').map(lambda linha: linha.split(","))

# Build a DataFrame with .toDF(), naming the first two fields AnosExperiencia and Salario.
# The fields come straight from str.split, so both columns hold text at this point.
df = rdd.map(lambda campos: Row(AnosExperiencia=campos[0], Salario=campos[1])).toDF()
df.show()

+---------------+--------+
|AnosExperiencia| Salario|
+---------------+--------+
|            1.1|39343.00|
|            1.3|46205.00|
|            1.5|37731.00|
|            2.0|43525.00|
|            2.2|39891.00|
|            2.9|56642.00|
|            3.0|60150.00|
|            3.2|54445.00|
|            3.2|64445.00|
|            3.7|57189.00|
|            3.9|63218.00|
|            4.0|55794.00|
|            4.0|56957.00|
|            4.1|57081.00|
|            4.5|61111.00|
|            4.9|67938.00|
|            5.1|66029.00|
|            5.3|83088.00|
|            5.9|81363.00|
|            6.0|93940.00|
+---------------+--------+
only showing top 20 rows



In [15]:
# Changing the column data types

# Print the schema as it is before the conversion performed further down in this cell.
# NOTE: this cell rebinds `df` to the converted DataFrame, so running the cell a second
# time prints the already-converted (float) schema here.
print("Dados de entrada do DataFrame: ")
df.printSchema()

def converterColuna(dataframe, nomes, novoTipo):
    """Cast one or more columns of a DataFrame to a new data type.

    Parameters
    ----------
    dataframe
        Object that supports ``dataframe[name]`` (returning something with a
        ``cast`` method) and ``withColumn(name, column)``.
    nomes : str or iterable of str
        Name(s) of the columns to convert. A single column name may be given
        as a plain string.
    novoTipo
        Target type, passed unchanged to ``cast``.

    Returns
    -------
    The result of chaining ``withColumn`` once per listed column; when
    ``nomes`` is empty the original ``dataframe`` object is returned as is.
    """
    # A bare string is itself iterable: without this guard 'Salario' would be
    # walked character by character and each letter looked up as a column.
    if isinstance(nomes, str):
        nomes = [nomes]

    for nome in nomes:
        # Replace the column in place (same name) with its cast version.
        dataframe = dataframe.withColumn(nome, dataframe[nome].cast(novoTipo))
    return dataframe

# Columns whose type will be converted
colunas = ['AnosExperiencia', 'Salario']

# Cast both columns to float. This rebinds `df`, so every later cell (and any
# re-run of this cell) sees the converted DataFrame, not the original one.
df = converterColuna(df, colunas, FloatType())

#df.show()
# Print the schema after the conversion
print("Dados de saída do DataFrame: ")
df.printSchema()

Dados de entrada do DataFrame: 
root
 |-- AnosExperiencia: float (nullable = true)
 |-- Salario: float (nullable = true)

Dados de saída do DataFrame: 
root
 |-- AnosExperiencia: float (nullable = true)
 |-- Salario: float (nullable = true)



In [16]:
# Display the DataFrame (show's defaults spelled out: first 20 rows, long values truncated)
df.show(n=20, truncate=True)

+---------------+-------+
|AnosExperiencia|Salario|
+---------------+-------+
|            1.1|39343.0|
|            1.3|46205.0|
|            1.5|37731.0|
|            2.0|43525.0|
|            2.2|39891.0|
|            2.9|56642.0|
|            3.0|60150.0|
|            3.2|54445.0|
|            3.2|64445.0|
|            3.7|57189.0|
|            3.9|63218.0|
|            4.0|55794.0|
|            4.0|56957.0|
|            4.1|57081.0|
|            4.5|61111.0|
|            4.9|67938.0|
|            5.1|66029.0|
|            5.3|83088.0|
|            5.9|81363.0|
|            6.0|93940.0|
+---------------+-------+
only showing top 20 rows



In [17]:
# Query only the "Salario" column and print just the first 10 rows
df.select(df.Salario).show(n=10)

+-------+
|Salario|
+-------+
|39343.0|
|46205.0|
|37731.0|
|43525.0|
|39891.0|
|56642.0|
|60150.0|
|54445.0|
|64445.0|
|57189.0|
+-------+
only showing top 10 rows



In [18]:
# Count the rows for each distinct salary and list the groups from highest to lowest salary
df.groupBy('Salario').count().orderBy('Salario', ascending=False).show()

+--------+-----+
| Salario|count|
+--------+-----+
|122391.0|    1|
|121872.0|    1|
|116969.0|    1|
|113812.0|    1|
|112635.0|    1|
|109431.0|    1|
|105582.0|    1|
|101302.0|    1|
| 98273.0|    1|
| 93940.0|    1|
| 91738.0|    1|
| 83088.0|    1|
| 81363.0|    1|
| 67938.0|    1|
| 66029.0|    1|
| 64445.0|    1|
| 63218.0|    1|
| 61111.0|    1|
| 60150.0|    1|
| 57189.0|    1|
+--------+-----+
only showing top 20 rows



In [19]:
# Number of rows in the "Salario" column
df.select(df.Salario).count()

30

In [20]:
# Summary statistics (count, mean, stddev, min, max) for the whole DataFrame
resumo_estatistico = df.describe()
resumo_estatistico.show()

+-------+------------------+------------------+
|summary|   AnosExperiencia|           Salario|
+-------+------------------+------------------+
|  count|                30|                30|
|   mean|5.3133333643277485|           76003.0|
| stddev|2.8378881722287805|27414.429784582302|
|    min|               1.1|           37731.0|
|    max|              10.5|          122391.0|
+-------+------------------+------------------+



In [21]:
# Summary statistics restricted to the 'Salario' column
df.describe(['Salario']).show()

+-------+------------------+
|summary|           Salario|
+-------+------------------+
|  count|                30|
|   mean|           76003.0|
| stddev|27414.429784582302|
|    min|           37731.0|
|    max|          122391.0|
+-------+------------------+



In [22]:
# Fetch the DataFrame contents as a Python list of Row objects.
# collect() returns every row, so the whole dataset is printed below; that is
# acceptable here only because the DataFrame is tiny (30 rows).
df.collect()

[Row(AnosExperiencia=1.100000023841858, Salario=39343.0),
 Row(AnosExperiencia=1.2999999523162842, Salario=46205.0),
 Row(AnosExperiencia=1.5, Salario=37731.0),
 Row(AnosExperiencia=2.0, Salario=43525.0),
 Row(AnosExperiencia=2.200000047683716, Salario=39891.0),
 Row(AnosExperiencia=2.9000000953674316, Salario=56642.0),
 Row(AnosExperiencia=3.0, Salario=60150.0),
 Row(AnosExperiencia=3.200000047683716, Salario=54445.0),
 Row(AnosExperiencia=3.200000047683716, Salario=64445.0),
 Row(AnosExperiencia=3.700000047683716, Salario=57189.0),
 Row(AnosExperiencia=3.9000000953674316, Salario=63218.0),
 Row(AnosExperiencia=4.0, Salario=55794.0),
 Row(AnosExperiencia=4.0, Salario=56957.0),
 Row(AnosExperiencia=4.099999904632568, Salario=57081.0),
 Row(AnosExperiencia=4.5, Salario=61111.0),
 Row(AnosExperiencia=4.900000095367432, Salario=67938.0),
 Row(AnosExperiencia=5.099999904632568, Salario=66029.0),
 Row(AnosExperiencia=5.300000190734863, Salario=83088.0),
 Row(AnosExperiencia=5.90000009536743

In [23]:
# Conditional query: keep only the rows whose salary is above 5000
df.where(df.Salario > 5000).show()

+---------------+-------+
|AnosExperiencia|Salario|
+---------------+-------+
|            1.1|39343.0|
|            1.3|46205.0|
|            1.5|37731.0|
|            2.0|43525.0|
|            2.2|39891.0|
|            2.9|56642.0|
|            3.0|60150.0|
|            3.2|54445.0|
|            3.2|64445.0|
|            3.7|57189.0|
|            3.9|63218.0|
|            4.0|55794.0|
|            4.0|56957.0|
|            4.1|57081.0|
|            4.5|61111.0|
|            4.9|67938.0|
|            5.1|66029.0|
|            5.3|83088.0|
|            5.9|81363.0|
|            6.0|93940.0|
+---------------+-------+
only showing top 20 rows



In [24]:
# Salary column only, restricted to salaries above 5000
df.select('Salario').where(df.Salario > 5000).show()

+-------+
|Salario|
+-------+
|39343.0|
|46205.0|
|37731.0|
|43525.0|
|39891.0|
|56642.0|
|60150.0|
|54445.0|
|64445.0|
|57189.0|
|63218.0|
|55794.0|
|56957.0|
|57081.0|
|61111.0|
|67938.0|
|66029.0|
|83088.0|
|81363.0|
|93940.0|
+-------+
only showing top 20 rows



In [25]:
# Rows matching either condition: salary above 5000 OR more than 2 years of experience
df.where((df.Salario > 5000) | (df.AnosExperiencia > 2)).show()

+---------------+-------+
|AnosExperiencia|Salario|
+---------------+-------+
|            1.1|39343.0|
|            1.3|46205.0|
|            1.5|37731.0|
|            2.0|43525.0|
|            2.2|39891.0|
|            2.9|56642.0|
|            3.0|60150.0|
|            3.2|54445.0|
|            3.2|64445.0|
|            3.7|57189.0|
|            3.9|63218.0|
|            4.0|55794.0|
|            4.0|56957.0|
|            4.1|57081.0|
|            4.5|61111.0|
|            4.9|67938.0|
|            5.1|66029.0|
|            5.3|83088.0|
|            5.9|81363.0|
|            6.0|93940.0|
+---------------+-------+
only showing top 20 rows



In [27]:
#Definicao de Schema
from pyspark.sql.session import SparkSession

#Definindo os tipos de dados a serem trabalhados
from pyspark.sql.types import (ArrayType, BooleanType, FloatType, IntegerType, StringType, 
                               StructField, StructType, TimestampType)

import pyspark.sql.functions as F

# Obtain the Spark session used for the schema example.
# NOTE(review): a session is already created in the first cell of this notebook, so
# getOrCreate() presumably returns that existing session and the app name / master
# given here may not take effect — confirm if these settings matter.
# The shuffle-partition setting uses the key 'spark.sql.shuffle.partitions';
# the previous key 'spark.shuffle.partitions' is not a Spark configuration property.
spark = SparkSession.builder.appName('fisrtSession')\
    .config('spark.master', 'local[4]')\
    .config('spark.executor.memory', '1gb')\
    .config('spark.sql.shuffle.partitions', 1)\
    .getOrCreate()

# Explicit schema: column names and types applied to the CSV, in file column order.
# latitude/longitude are deliberately read as strings here.
schema = StructType([
                StructField('case_id', IntegerType()),
                StructField('province', StringType()),
                StructField('city', StringType()),
                StructField('group', BooleanType()),
                StructField('infection_case', StringType()),
                StructField('confirmed', IntegerType()),
                StructField('latitude', StringType()),
                StructField('longitude', StringType())
])

path = "covid_cases.csv"

# Load the CSV with the schema above instead of letting Spark infer the types.
# NOTE(review): no 'header' option is set, so the first line of the file is read as
# data. If covid_cases.csv starts with a column-name row, add .option('header', True)
# — confirm against the file.
df = spark.read.format('csv')\
    .schema(schema)\
    .load(path)

df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- province: string (nullable = true)
 |-- city: string (nullable = true)
 |-- group: boolean (nullable = true)
 |-- infection_case: string (nullable = true)
 |-- confirmed: integer (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)

