In [9]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row, types
from pyspark.sql.types import *

In [10]:
import os
import sys
from pyspark.sql import SparkSession

# Make PySpark launch its Python processes with the interpreter running this
# kernel, then obtain (or reuse) the SparkSession.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)
spark = SparkSession.builder.getOrCreate()

In [11]:
# Fetch the SparkContext (one is created only if none exists yet); it is used
# below for the sc.parallelize(...) RDD examples.
sc = SparkContext.getOrCreate()

In [12]:
# Obtain the session through the builder, the supported entry point, instead
# of calling the SparkSession constructor directly. getOrCreate() hands back
# the session that is already running on top of `sc`.
spark = SparkSession.builder.getOrCreate()

In [13]:
def func(x):
    """Return the pair (x, x + 400); 4 * 100 is evaluated before the addition."""
    return (x, x + 4 * 100)


# Build ten (key, value) tuples locally, then distribute them as an RDD.
iterator = range(10)
trem = [func(item) for item in iterator]
rdd = sc.parallelize(trem)


In [14]:
# Inspect the local list of (x, x + 400) tuples.
trem

[(0, 400),
 (1, 401),
 (2, 402),
 (3, 403),
 (4, 404),
 (5, 405),
 (6, 406),
 (7, 407),
 (8, 408),
 (9, 409)]

In [15]:
# Displaying an RDD shows only its repr, not its elements.
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [16]:
# Turn the RDD of 2-tuples into a DataFrame with columns "key" and "value".
kvDF = rdd.toDF(["key", "value"])

In [17]:
# No schema was supplied, so the column types printed here were inferred.
kvDF.printSchema()

root
 |-- key: long (nullable = true)
 |-- value: long (nullable = true)



In [18]:
# Preview the first 5 rows.
kvDF.show(5)

+---+-----+
|key|value|
+---+-----+
|  0|  400|
|  1|  401|
|  2|  402|
|  3|  403|
|  4|  404|
+---+-----+
only showing top 5 rows



In [19]:
# Sample people as positional (unnamed) Rows; column names are attached later
# through an explicit schema.
list_of_people = [
    Row(*fields)
    for fields in ((1, 'Vinicius', 30), (2, 'Mary Jane', 25))
]
peopleRDD = sc.parallelize(list_of_people)

In [20]:
# Explicit schema for the people rows: every field is nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("id", LongType()),
        ("name", StringType()),
        ("age", LongType()),
    )
])

In [21]:
# Build a DataFrame from the Row RDD, applying the explicit schema.
peopleDF = spark.createDataFrame(peopleRDD, schema)

In [22]:
# Display the rows of peopleDF.
peopleDF.show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1| Vinicius| 30|
|  2|Mary Jane| 25|
+---+---------+---+



In [23]:
# Generate the numbers 0..9 as a one-column DataFrame and name that column
# "carrilhao" before displaying it.
spark.range(10).toDF('carrilhao').show()

+---------+
|carrilhao|
+---------+
|        0|
|        1|
|        2|
|        3|
|        4|
|        5|
|        6|
|        7|
|        8|
|        9|
+---------+



In [24]:
# Load README.md (path relative to the working directory) as a text source;
# the result has a single string column named "value".
text_file = spark.read.text("README.md")

In [25]:
# Show 5 rows; the second positional argument is `truncate`, so False prints
# each line in full.
text_file.show(5, False)

+---------------------------------------------------------------------------+
|value                                                                      |
+---------------------------------------------------------------------------+
|# Repositório criado para armazenar trabalhos e criações referentes ao Ifes|
|                                                                           |
|                                                                           |
|                                                                           |
|## Sumário para os projetos                                                |
+---------------------------------------------------------------------------+
only showing top 5 rows



In [26]:
# Read the movies CSV, taking column names from the header line.
movies = spark.read.csv(
    "beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.csv",
    header=True,
)

In [27]:
# Display rows without truncating long string values.
movies.show(truncate=False)

+-----------------+---------------------------+----+
|actor            |title                      |year|
+-----------------+---------------------------+----+
|McClure, Marc (I)|Freaky Friday              |2003|
|McClure, Marc (I)|Coach Carter               |2005|
|McClure, Marc (I)|Superman II                |1980|
|McClure, Marc (I)|Apollo 13                  |1995|
|McClure, Marc (I)|Superman                   |1978|
|McClure, Marc (I)|Back to the Future         |1985|
|McClure, Marc (I)|Back to the Future Part III|1990|
|Cooper, Chris (I)|Me, Myself & Irene         |2000|
|Cooper, Chris (I)|October Sky                |1999|
|Cooper, Chris (I)|Capote                     |2005|
|Cooper, Chris (I)|The Bourne Supremacy       |2004|
|Cooper, Chris (I)|The Patriot                |2000|
|Cooper, Chris (I)|The Town                   |2010|
|Cooper, Chris (I)|Seabiscuit                 |2003|
|Cooper, Chris (I)|A Time to Kill             |1996|
|Cooper, Chris (I)|Where the Wild Things Are  

In [28]:
# Without schema inference every CSV column, including "year", is a string.
movies.printSchema()

root
 |-- actor: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: string (nullable = true)



In [29]:
# Same CSV as before, this time asking Spark to infer the column types.
movies2 = spark.read.csv(
    "beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.csv",
    header=True,
    inferSchema=True,
)

In [30]:
# With inferSchema enabled, "year" is now reported as an integer.
movies2.printSchema()

root
 |-- actor: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)



In [34]:
# Hand-written schema for the movies data: nullable fields, with the year
# stored as a long.
movie_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("actor_name", StringType()),
        ("movie_title", StringType()),
        ("produced_year", LongType()),
    )
])

In [35]:
# Display the schema object's repr.
movie_schema

StructType([StructField('actor_name', StringType(), True), StructField('movie_title', StringType(), True), StructField('produced_year', LongType(), True)])

In [36]:
# Read the CSV with the hand-written schema; the resulting columns carry the
# schema's names (actor_name, movie_title, produced_year).
movies3 = spark.read.csv(
    "beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.csv",
    schema=movie_schema,
    header=True,
)

In [37]:
# Preview the first 5 rows read with the explicit schema.
movies3.show(5)

+-----------------+-------------+-------------+
|       actor_name|  movie_title|produced_year|
+-----------------+-------------+-------------+
|McClure, Marc (I)|Freaky Friday|         2003|
|McClure, Marc (I)| Coach Carter|         2005|
|McClure, Marc (I)|  Superman II|         1980|
|McClure, Marc (I)|    Apollo 13|         1995|
|McClure, Marc (I)|     Superman|         1978|
+-----------------+-------------+-------------+
only showing top 5 rows



In [42]:
# Tab-separated variant of the movies file: same reader, with sep set to a tab.
movies4 = spark.read.csv(
    "beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.tsv",
    schema=movie_schema,
    sep="\t",
    header=True,
)

In [43]:
# Print the schema of the DataFrame read from the TSV file.
movies4.printSchema()

root
 |-- actor_name: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- produced_year: long (nullable = true)



In [44]:
# Read the JSON version of the movies data with the explicit schema.
# The "header" and "sep" options were removed: they are CSV reader settings
# and have no meaning for a JSON source.
movies5 = spark.read.schema(movie_schema).json("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.json")

In [45]:
# Print the schema of the DataFrame read from the JSON file.
movies5.printSchema()

root
 |-- actor_name: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- produced_year: long (nullable = true)



In [46]:
# Display rows without truncating long string values.
movies5.show(truncate=False)

+-----------------+---------------------------+-------------+
|actor_name       |movie_title                |produced_year|
+-----------------+---------------------------+-------------+
|McClure, Marc (I)|Coach Carter               |2005         |
|McClure, Marc (I)|Superman II                |1980         |
|McClure, Marc (I)|Apollo 13                  |1995         |
|McClure, Marc (I)|Superman                   |1978         |
|McClure, Marc (I)|Back to the Future         |1985         |
|McClure, Marc (I)|Back to the Future Part III|1990         |
|Cooper, Chris (I)|Me, Myself & Irene         |2000         |
|Cooper, Chris (I)|October Sky                |1999         |
|Cooper, Chris (I)|Capote                     |2005         |
|Cooper, Chris (I)|The Bourne Supremacy       |2004         |
|Cooper, Chris (I)|The Patriot                |2000         |
|Cooper, Chris (I)|The Town                   |2010         |
|Cooper, Chris (I)|Seabiscuit                 |2003         |
|Cooper,

In [47]:
# Variant of the movies schema that stores the year as an integer, not a long.
movie_schema2 = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("actor_name", StringType()),
        ("movie_title", StringType()),
        ("produced_year", IntegerType()),
    )
])

In [48]:
# Display the schema object's repr.
movie_schema2

StructType([StructField('actor_name', StringType(), True), StructField('movie_title', StringType(), True), StructField('produced_year', IntegerType(), True)])

In [49]:
# Deliberately mismatched schema: actor_name is declared as a boolean.
badMovieSchema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("actor_name", BooleanType()),
        ("movie_title", StringType()),
        ("produced_year", IntegerType()),
    )
])

In [50]:
# Display the schema object's repr.
badMovieSchema

StructType([StructField('actor_name', BooleanType(), True), StructField('movie_title', StringType(), True), StructField('produced_year', IntegerType(), True)])

In [51]:
# Read the JSON file with the mismatched schema to see how Spark handles
# values that do not fit the declared type (no parse mode is set here).
# The "header" and "sep" options were removed: they are CSV reader settings
# and have no meaning for a JSON source.
movies6 = spark.read.schema(badMovieSchema).json("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.json")

In [52]:
# Preview 5 rows. actor_name is null in every row shown — presumably because
# the name strings cannot be read as BooleanType; confirm against the parse mode.
movies6.show(5)

+----------+------------------+-------------+
|actor_name|       movie_title|produced_year|
+----------+------------------+-------------+
|      null|      Coach Carter|         2005|
|      null|       Superman II|         1980|
|      null|         Apollo 13|         1995|
|      null|          Superman|         1978|
|      null|Back to the Future|         1985|
+----------+------------------+-------------+
only showing top 5 rows



In [53]:
# movies7 = spark.read.option("header","true").option("sep", "\t").option("mode", "failFast").schema(badMovieSchema).json("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.json")


In [54]:
# No format is named here: load() relies on the session's default data source
# (parquet unless spark.sql.sources.default has been overridden).
movies8 = spark.read.load("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.parquet")

In [55]:
# Preview the first 5 rows loaded from the parquet file.
movies8.show(5)

+-----------------+------------------+-------------+
|       actor_name|       movie_title|produced_year|
+-----------------+------------------+-------------+
|McClure, Marc (I)|      Coach Carter|         2005|
|McClure, Marc (I)|       Superman II|         1980|
|McClure, Marc (I)|         Apollo 13|         1995|
|McClure, Marc (I)|          Superman|         1978|
|McClure, Marc (I)|Back to the Future|         1985|
+-----------------+------------------+-------------+
only showing top 5 rows



In [56]:
# Read the same parquet file through the format-specific reader. No schema is
# passed, so the printed column types come from the file.
movies10 = spark.read.parquet("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.parquet")
movies10.printSchema()

root
 |-- actor_name: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- produced_year: long (nullable = true)



In [57]:
# Preview the first 5 rows.
movies10.show(5)

+-----------------+------------------+-------------+
|       actor_name|       movie_title|produced_year|
+-----------------+------------------+-------------+
|McClure, Marc (I)|      Coach Carter|         2005|
|McClure, Marc (I)|       Superman II|         1980|
|McClure, Marc (I)|         Apollo 13|         1995|
|McClure, Marc (I)|          Superman|         1978|
|McClure, Marc (I)|Back to the Future|         1985|
+-----------------+------------------+-------------+
only showing top 5 rows



In [58]:
# Ask for the MySQL connector jar to be added through the "spark.jars" setting.
# NOTE(review): a session was already created in earlier cells, so
# getOrCreate() presumably returns that running session and the jar may never
# be loaded — confirm, and if so move this config to where the session is
# first built.
spark = SparkSession.builder.config("spark.jars", "mysql-connector-java-8.0.13.jar").getOrCreate()

In [59]:
# Display the session object.
spark

In [60]:
from pyspark.sql.functions import *

In [68]:
# Small two-row example. Note that this rebinds `rdd` and `kvDF`, replacing
# the ten-row versions created earlier in the notebook.
my_list = [(1,2),(2,3)]
rdd = sc.parallelize(my_list)
kvDF = rdd.toDF(["key","value"])

In [73]:
# Show up to 5 rows (this DataFrame only has two).
kvDF.show(5)

+---+-----+
|key|value|
+---+-----+
|  1|    2|
|  2|    3|
+---+-----+



In [82]:
# Project the key next to a boolean expression computed from it; the derived
# column is named after the expression, "(key > 1)".
kvDF.select("key", kvDF.key > 1).show()

+---+---------+
|key|(key > 1)|
+---+---------+
|  1|    false|
|  2|     true|
+---+---------+



In [83]:
# Rebind `movies` (previously the CSV-based DataFrame) to the parquet data,
# whose columns are actor_name, movie_title and produced_year.
movies = spark.read.parquet("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.parquet")

In [85]:
# Print the schema of the parquet-backed movies DataFrame.
movies.printSchema()

root
 |-- actor_name: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- produced_year: long (nullable = true)



In [90]:
# Project two columns by name, title first.
movies.select("movie_title", "actor_name").show()

+--------------------+-----------------+
|         movie_title|       actor_name|
+--------------------+-----------------+
|        Coach Carter|McClure, Marc (I)|
|         Superman II|McClure, Marc (I)|
|           Apollo 13|McClure, Marc (I)|
|            Superman|McClure, Marc (I)|
|  Back to the Future|McClure, Marc (I)|
|Back to the Futur...|McClure, Marc (I)|
|  Me, Myself & Irene|Cooper, Chris (I)|
|         October Sky|Cooper, Chris (I)|
|              Capote|Cooper, Chris (I)|
|The Bourne Supremacy|Cooper, Chris (I)|
|         The Patriot|Cooper, Chris (I)|
|            The Town|Cooper, Chris (I)|
|          Seabiscuit|Cooper, Chris (I)|
|      A Time to Kill|Cooper, Chris (I)|
|Where the Wild Th...|Cooper, Chris (I)|
|         The Muppets|Cooper, Chris (I)|
|     American Beauty|Cooper, Chris (I)|
|             Syriana|Cooper, Chris (I)|
| The Horse Whisperer|Cooper, Chris (I)|
|             Jarhead|Cooper, Chris (I)|
+--------------------+-----------------+
only showing top

In [108]:
# Append a "Decade" column: the production year rounded down to a multiple of
# ten (year minus year modulo 10).
(
    movies
    .select("movie_title", "actor_name", "produced_year")
    .withColumn(
        "Decade",
        column("produced_year") - (column("produced_year") % 10),
    )
    .show()
)

+--------------------+-----------------+-------------+------+
|         movie_title|       actor_name|produced_year|Decade|
+--------------------+-----------------+-------------+------+
|        Coach Carter|McClure, Marc (I)|         2005|  2000|
|         Superman II|McClure, Marc (I)|         1980|  1980|
|           Apollo 13|McClure, Marc (I)|         1995|  1990|
|            Superman|McClure, Marc (I)|         1978|  1970|
|  Back to the Future|McClure, Marc (I)|         1985|  1980|
|Back to the Futur...|McClure, Marc (I)|         1990|  1990|
|  Me, Myself & Irene|Cooper, Chris (I)|         2000|  2000|
|         October Sky|Cooper, Chris (I)|         1999|  1990|
|              Capote|Cooper, Chris (I)|         2005|  2000|
|The Bourne Supremacy|Cooper, Chris (I)|         2004|  2000|
|         The Patriot|Cooper, Chris (I)|         2000|  2000|
|            The Town|Cooper, Chris (I)|         2010|  2010|
|          Seabiscuit|Cooper, Chris (I)|         2003|  2000|
|      A

In [117]:
# Count distinct titles and distinct actors using SQL expressions.
movies.selectExpr(
    "count(distinct(movie_title)) as movies",
    "count(distinct(actor_name)) as actors",
).show()

+------+------+
|movies|actors|
+------+------+
|  1409|  6527|
+------+------+



In [120]:
# Keep only movies produced after 2000, using a Column predicate.
movies.where(movies["produced_year"] > 2000).show()

+-------------------+--------------------+-------------+
|         actor_name|         movie_title|produced_year|
+-------------------+--------------------+-------------+
|  McClure, Marc (I)|        Coach Carter|         2005|
|  Cooper, Chris (I)|              Capote|         2005|
|  Cooper, Chris (I)|The Bourne Supremacy|         2004|
|  Cooper, Chris (I)|            The Town|         2010|
|  Cooper, Chris (I)|          Seabiscuit|         2003|
|  Cooper, Chris (I)|Where the Wild Th...|         2009|
|  Cooper, Chris (I)|         The Muppets|         2011|
|  Cooper, Chris (I)|             Syriana|         2005|
|  Cooper, Chris (I)|             Jarhead|         2005|
|  Cooper, Chris (I)| The Bourne Identity|         2002|
|  Cassavetes, Frank|          Battleship|         2012|
|  Cassavetes, Frank|              John Q|         2002|
|  Cassavetes, Frank|  My Sister's Keeper|         2009|
|  Cassavetes, Frank| Kicking & Screaming|         2005|
|Knight, Shirley (I)|Divine Sec