In [72]:
# Import findspark 
import findspark

# Point findspark at the Spark installation directory so that the
# `pyspark` imports in the following cells can be resolved.
# NOTE(review): the path is hardcoded and machine-specific.
findspark.init("/opt/spark/")

In [73]:
# Import SparkSession
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession: local master, named application,
# and an explicit executor-memory setting.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Linear Regression Model")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

# Handle on the underlying SparkContext, used below for the RDD API.
sc = spark.sparkContext

In [74]:
# Read the salary CSV as an RDD with one raw text line per element.
# The path is relative to the notebook's working directory.
rdd = sc.textFile('./Salary_Data.csv')

In [75]:
# Peek at the first three elements of the RDD
rdd.take(3)

['1.1,39343.00', '1.3,46205.00', '1.5,37731.00']

In [76]:
# Break every text line into its comma-separated fields.
# NOTE: `rdd` is rebound here, so this cell must run exactly once after the
# load cell; running it again would call `.split` on a list.
rdd = rdd.map(
    lambda record: record.split(",")
)

# Inspect the first three split rows
rdd.take(3)

[['1.1', '39343.00'], ['1.3', '46205.00'], ['1.5', '37731.00']]

In [77]:
# Inspect only the first element of the RDD
rdd.first()

['1.1', '39343.00']

In [78]:
# Import the necessary modules 
from pyspark.sql import Row

# Wrap each element's first two fields in a named Row
# (index 0 -> YearsExperience, index 1 -> Salary), then build a DataFrame.
df = (
    rdd
    .map(lambda fields: Row(YearsExperience=fields[0], Salary=fields[1]))
    .toDF()
)

In [79]:
# Display the DataFrame (show() with no arguments prints the top 20 rows)
df.show()

+--------+---------------+
|  Salary|YearsExperience|
+--------+---------------+
|39343.00|            1.1|
|46205.00|            1.3|
|37731.00|            1.5|
|43525.00|            2.0|
|39891.00|            2.2|
|56642.00|            2.9|
|60150.00|            3.0|
|54445.00|            3.2|
|64445.00|            3.2|
|57189.00|            3.7|
|63218.00|            3.9|
|55794.00|            4.0|
|56957.00|            4.0|
|57081.00|            4.1|
|61111.00|            4.5|
|67938.00|            4.9|
|66029.00|            5.1|
|83088.00|            5.3|
|81363.00|            5.9|
|93940.00|            6.0|
+--------+---------------+
only showing top 20 rows



In [80]:
# Show information about the data types of the DataFrame's columns
df.printSchema()

root
 |-- Salary: string (nullable = true)
 |-- YearsExperience: string (nullable = true)



In [81]:
# Import all from `sql.types`
from pyspark.sql.types import *

# Write a custom function to convert the data type of DataFrame columns
def convertColumn(df, names, newType):
    """Cast the given columns of a DataFrame to a new data type.

    Args:
        df: DataFrame whose columns should be converted.
        names: Iterable of column names to cast.
        newType: Target type, passed unchanged to each column's ``cast``.

    Returns:
        The DataFrame obtained by replacing every named column with its
        cast version, one ``withColumn`` call per name. When ``names`` is
        empty, ``df`` itself is returned.
    """
    converted = df
    for column_name in names:
        # Replace the column in place (same name) with its cast counterpart.
        recast = converted[column_name].cast(newType)
        converted = converted.withColumn(column_name, recast)
    return converted

# Names of the columns that should be converted
columns = ["YearsExperience", "Salary"]

# Convert the listed `df` columns to `FloatType()`
df = convertColumn(df, names=columns, newType=FloatType())

In [82]:
# Display the DataFrame again to check the result of the type conversion
df.show()

+-------+---------------+
| Salary|YearsExperience|
+-------+---------------+
|39343.0|            1.1|
|46205.0|            1.3|
|37731.0|            1.5|
|43525.0|            2.0|
|39891.0|            2.2|
|56642.0|            2.9|
|60150.0|            3.0|
|54445.0|            3.2|
|64445.0|            3.2|
|57189.0|            3.7|
|63218.0|            3.9|
|55794.0|            4.0|
|56957.0|            4.0|
|57081.0|            4.1|
|61111.0|            4.5|
|67938.0|            4.9|
|66029.0|            5.1|
|83088.0|            5.3|
|81363.0|            5.9|
|93940.0|            6.0|
+-------+---------------+
only showing top 20 rows



In [89]:
# Show a single column using the .select method
df.select('Salary').show()

+-------+
| Salary|
+-------+
|39343.0|
|46205.0|
|37731.0|
|43525.0|
|39891.0|
|56642.0|
|60150.0|
|54445.0|
|64445.0|
|57189.0|
|63218.0|
|55794.0|
|56957.0|
|57081.0|
|61111.0|
|67938.0|
|66029.0|
|83088.0|
|81363.0|
|93940.0|
+-------+
only showing top 20 rows



In [90]:
# groupBy groups the rows by a key column. Here Salary is the key: count the
# rows in each group, then sort the groups by Salary in descending order.
(
    df.groupBy("Salary")
    .count()
    .sort("Salary", ascending=False)
    .show()
)

+--------+-----+
|  Salary|count|
+--------+-----+
|122391.0|    1|
|121872.0|    1|
|116969.0|    1|
|113812.0|    1|
|112635.0|    1|
|109431.0|    1|
|105582.0|    1|
|101302.0|    1|
| 98273.0|    1|
| 93940.0|    1|
| 91738.0|    1|
| 83088.0|    1|
| 81363.0|    1|
| 67938.0|    1|
| 66029.0|    1|
| 64445.0|    1|
| 63218.0|    1|
| 61111.0|    1|
| 60150.0|    1|
| 57189.0|    1|
+--------+-----+
only showing top 20 rows



In [94]:
# .describe summarizes the DataFrame per column, returning the element
# count, mean, standard deviation, minimum and maximum values.
# NOTE(review): the recorded output of this cell is
# "IllegalArgumentException: 'Unsupported class file major version 55'".
# Class-file major version 55 is the Java 11 format, so presumably this
# Spark installation is running on a Java 11 JVM that it does not support.
# Likely fix: run Spark on Java 8 (e.g. via JAVA_HOME) or upgrade to
# Spark 3.x, then re-run the notebook. TODO confirm the installed versions.
df.describe().show()

IllegalArgumentException: 'Unsupported class file major version 55'