In [65]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row, types
from pyspark.sql.types import *

In [2]:
import os
import sys
from pyspark.sql import SparkSession

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
spark = SparkSession.builder.getOrCreate()

In [3]:
sc = SparkContext.getOrCreate()

In [4]:
spark = SparkSession(sc)

In [5]:
func = lambda x: (x, x+4*100)
iterator = range(10)
trem = list(map(func, iterator))
rdd = sc.parallelize(trem)


In [6]:
trem

[(0, 400),
 (1, 401),
 (2, 402),
 (3, 403),
 (4, 404),
 (5, 405),
 (6, 406),
 (7, 407),
 (8, 408),
 (9, 409)]

In [7]:
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [8]:
kvDF = rdd.toDF(["key", "value"])

In [9]:
kvDF.printSchema()

root
 |-- key: long (nullable = true)
 |-- value: long (nullable = true)



In [10]:
kvDF.show(5)

+---+-----+
|key|value|
+---+-----+
|  0|  400|
|  1|  401|
|  2|  402|
|  3|  403|
|  4|  404|
+---+-----+
only showing top 5 rows



In [11]:
list_of_people = [Row(1, 'Vinicius', 30),
                 Row(2, 'Mary Jane', 25)]
peopleRDD = sc.parallelize(list_of_people)

In [12]:
schema = StructType([StructField("id", LongType(), True),
                    StructField("name", StringType(), True),
                    StructField("age", LongType(), True)])

In [13]:
peopleDF = spark.createDataFrame(peopleRDD, schema)

In [14]:
peopleDF.show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1| Vinicius| 30|
|  2|Mary Jane| 25|
+---+---------+---+



In [15]:
spark.range(10).toDF('carrilhao').show()

+---------+
|carrilhao|
+---------+
|        0|
|        1|
|        2|
|        3|
|        4|
|        5|
|        6|
|        7|
|        8|
|        9|
+---------+



In [18]:
text_file = spark.read.text("README.md")

In [24]:
text_file.show(5, False)

+---------------------------------------------------------------------------+
|value                                                                      |
+---------------------------------------------------------------------------+
|# Repositório criado para armazenar trabalhos e criações referentes ao Ifes|
|                                                                           |
|                                                                           |
|                                                                           |
|## Sumário para os projetos                                                |
+---------------------------------------------------------------------------+
only showing top 5 rows



In [29]:
movies = spark.read.option("header","true").csv("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.csv")

In [32]:
movies.show(truncate=False)

+-----------------+---------------------------+----+
|actor            |title                      |year|
+-----------------+---------------------------+----+
|McClure, Marc (I)|Freaky Friday              |2003|
|McClure, Marc (I)|Coach Carter               |2005|
|McClure, Marc (I)|Superman II                |1980|
|McClure, Marc (I)|Apollo 13                  |1995|
|McClure, Marc (I)|Superman                   |1978|
|McClure, Marc (I)|Back to the Future         |1985|
|McClure, Marc (I)|Back to the Future Part III|1990|
|Cooper, Chris (I)|Me, Myself & Irene         |2000|
|Cooper, Chris (I)|October Sky                |1999|
|Cooper, Chris (I)|Capote                     |2005|
|Cooper, Chris (I)|The Bourne Supremacy       |2004|
|Cooper, Chris (I)|The Patriot                |2000|
|Cooper, Chris (I)|The Town                   |2010|
|Cooper, Chris (I)|Seabiscuit                 |2003|
|Cooper, Chris (I)|A Time to Kill             |1996|
|Cooper, Chris (I)|Where the Wild Things Are  

In [33]:
movies.printSchema()

root
 |-- actor: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: string (nullable = true)



In [34]:
movies2 = spark.read.option("header","true").option("inferSchema","true").csv("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.csv")

In [36]:
movies2.printSchema()

root
 |-- actor: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)



In [42]:
movie_schema = StructType([StructField("actor_name", StringType(), True),
                             StructField("movie_title", StringType(), True),
                             StructField("produced_year", LongType(), True)])

In [43]:
movie_schema

StructType([StructField('actor_name', StringType(), True), StructField('movie_title', StringType(), True), StructField('produced_year', LongType(), True)])

In [44]:
movies3 = spark.read.option("header","true").schema(movieSchema).csv("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.csv")

In [46]:
movies3.show(5)

+-----------------+-------------+-------------+
|       actor_name|  movie_title|produced_year|
+-----------------+-------------+-------------+
|McClure, Marc (I)|Freaky Friday|         2003|
|McClure, Marc (I)| Coach Carter|         2005|
|McClure, Marc (I)|  Superman II|         1980|
|McClure, Marc (I)|    Apollo 13|         1995|
|McClure, Marc (I)|     Superman|         1978|
+-----------------+-------------+-------------+
only showing top 5 rows



In [47]:
movies4 = spark.read.option("header","true").option("sep", "\t").schema(movieSchema).csv("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.tsv")

In [51]:
movies4.printSchema()

root
 |-- actor_name: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- produced_year: long (nullable = true)



In [53]:
movies5 = spark.read.option("header","true").option("sep", "\t").schema(movieSchema).json("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.json")

In [54]:
movies5.printSchema()

root
 |-- actor_name: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- produced_year: long (nullable = true)



In [59]:
movies5.show(truncate=False)

+-----------------+---------------------------+-------------+
|actor_name       |movie_title                |produced_year|
+-----------------+---------------------------+-------------+
|McClure, Marc (I)|Coach Carter               |2005         |
|McClure, Marc (I)|Superman II                |1980         |
|McClure, Marc (I)|Apollo 13                  |1995         |
|McClure, Marc (I)|Superman                   |1978         |
|McClure, Marc (I)|Back to the Future         |1985         |
|McClure, Marc (I)|Back to the Future Part III|1990         |
|Cooper, Chris (I)|Me, Myself & Irene         |2000         |
|Cooper, Chris (I)|October Sky                |1999         |
|Cooper, Chris (I)|Capote                     |2005         |
|Cooper, Chris (I)|The Bourne Supremacy       |2004         |
|Cooper, Chris (I)|The Patriot                |2000         |
|Cooper, Chris (I)|The Town                   |2010         |
|Cooper, Chris (I)|Seabiscuit                 |2003         |
|Cooper,

In [61]:
movie_schema2 = StructType([StructField("actor_name", StringType(), True),
                           StructField("movie_title", StringType(), True),
                           StructField("produced_year",  IntegerType(), True)])

In [62]:
movie_schema2

StructType([StructField('actor_name', StringType(), True), StructField('movie_title', StringType(), True), StructField('produced_year', IntegerType(), True)])

In [66]:
badMovieSchema = StructType([
                        StructField("actor_name", BooleanType(), True),
                        StructField("movie_title", StringType(), True),
                        StructField("produced_year", IntegerType(), True)])

In [67]:
badMovieSchema

StructType([StructField('actor_name', BooleanType(), True), StructField('movie_title', StringType(), True), StructField('produced_year', IntegerType(), True)])

In [69]:
movies6 = spark.read.option("header","true").option("sep", "\t").schema(badMovieSchema).json("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.json")

In [73]:
movies6.show(5)

+----------+------------------+-------------+
|actor_name|       movie_title|produced_year|
+----------+------------------+-------------+
|      null|      Coach Carter|         2005|
|      null|       Superman II|         1980|
|      null|         Apollo 13|         1995|
|      null|          Superman|         1978|
|      null|Back to the Future|         1985|
+----------+------------------+-------------+
only showing top 5 rows



In [79]:
# movies7 = spark.read.option("header","true").option("sep", "\t").option("mode", "failFast").schema(badMovieSchema).json("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.json")


In [80]:
movies8 = spark.read.load("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.parquet")

In [83]:
movies8.show(5)

+-----------------+------------------+-------------+
|       actor_name|       movie_title|produced_year|
+-----------------+------------------+-------------+
|McClure, Marc (I)|      Coach Carter|         2005|
|McClure, Marc (I)|       Superman II|         1980|
|McClure, Marc (I)|         Apollo 13|         1995|
|McClure, Marc (I)|          Superman|         1978|
|McClure, Marc (I)|Back to the Future|         1985|
+-----------------+------------------+-------------+
only showing top 5 rows



In [84]:
movies10 = spark.read.parquet("beginning-apache-spark-3-main/beginning-apache-spark-3-main/chapter3/data/movies/movies.parquet")
movies10.printSchema()

root
 |-- actor_name: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- produced_year: long (nullable = true)



In [85]:
movies10.show(5)

+-----------------+------------------+-------------+
|       actor_name|       movie_title|produced_year|
+-----------------+------------------+-------------+
|McClure, Marc (I)|      Coach Carter|         2005|
|McClure, Marc (I)|       Superman II|         1980|
|McClure, Marc (I)|         Apollo 13|         1995|
|McClure, Marc (I)|          Superman|         1978|
|McClure, Marc (I)|Back to the Future|         1985|
+-----------------+------------------+-------------+
only showing top 5 rows



In [93]:
spark = SparkSession.builder.config("spark.jars", "mysql-connector-java-8.0.13.jar").getOrCreate()

In [94]:
spark

In [95]:
spark.read.jdbc("jdbc:mysql://localhost:3306/emp", "employee", properties={"user": "root", "password": "root", "driver":"com.mysql.cj.jdbc.Driver"})

Py4JJavaError: An error occurred while calling o803.jdbc.
: java.lang.ClassNotFoundException: com.mysql.cj.jdbc.Driver
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:588)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:521)
	at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:46)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1(JDBCOptions.scala:101)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1$adapted(JDBCOptions.scala:101)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:101)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:39)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:34)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:350)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:228)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:210)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:210)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:171)
	at org.apache.spark.sql.DataFrameReader.jdbc(DataFrameReader.scala:248)
	at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:104)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1589)
