In [1]:
spark

In [6]:
df = spark.createDataFrame([('Ahmed',25),('Mohamed',32),('Ahmed',15),('Mohamed',11)],schema=['Name','age'])

In [3]:
df

DataFrame[Name: string, age: bigint]

In [5]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: long (nullable = true)



In [7]:
df2 = df.groupBy('Name').avg('age')

In [8]:
df2

DataFrame[Name: string, avg(age): double]

In [9]:
df2.printSchema()

root
 |-- Name: string (nullable = true)
 |-- avg(age): double (nullable = true)



In [10]:
df

DataFrame[Name: string, age: bigint]

In [11]:
df2.show()

                                                                                

+-------+--------+
|   Name|avg(age)|
+-------+--------+
|  Ahmed|    20.0|
|Mohamed|    21.5|
+-------+--------+



In [12]:
df.cache()

DataFrame[Name: string, age: bigint]

In [14]:
from pyspark.sql.functions import avg

In [15]:
df3 = df.groupBy('Name').agg(avg('age').alias('AvgAge'))

In [16]:
df3.printSchema()

root
 |-- Name: string (nullable = true)
 |-- AvgAge: double (nullable = true)



In [17]:
df3.write.parquet('MansParq')

                                                                                

In [18]:
df3.rdd.getNumPartitions()

1

In [20]:
df3.write.csv('MansParqCSV')

In [21]:
df3.write.json('MansParqJson')

In [22]:
dfFlights = spark.read.csv('csv',inferSchema=True)

In [23]:
dfFlights.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [24]:
dfFlights = spark.read.csv('csv')

In [25]:
dfFlights.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [26]:
dfFlights = spark.read.csv('csv',inferSchema=True,header=True)

In [27]:
dfFlights.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [28]:
dfFlights = spark.read.csv('csv',header=True)

In [29]:
dfFlights.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: string (nullable = true)



In [30]:
dfFlights = spark.read.csv('csv',inferSchema=True,header=True)

In [31]:
dfFlights.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [32]:
dfCSV = spark.read.csv('MansParqCSV',inferSchema=True)

In [33]:
dfCSV.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: double (nullable = true)



In [36]:
dfparq = spark.read.parquet('MansParq')

In [37]:
dfparq.printSchema()

root
 |-- Name: string (nullable = true)
 |-- AvgAge: double (nullable = true)



In [38]:
df = spark.read.format('csv')\
.option('inferschema','true')\
.option('header','true')\
.load('csv')

In [39]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [41]:
df.schema

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', IntegerType(), True)])

In [42]:
from pyspark.sql.types import StructType,StructField,StringType, IntegerType

In [43]:
myschema = StructType([StructField('DestinationCountry',StringType(),True),
                      StructField('OriginCountry',StringType(),True),
                      StructField('FlightCounts',IntegerType(),True)])

In [142]:
myschema = 'DestinationCountry STRING, OriginCountry STRING, FlightCounts INTEGER'

In [143]:
df = spark.read.format('csv')\
.option('header','true')\
.schema(myschema)\
.load('csv')

In [144]:
df.printSchema()

root
 |-- DestinationCountry: string (nullable = true)
 |-- OriginCountry: string (nullable = true)
 |-- FlightCounts: integer (nullable = true)



In [48]:
dfFlightsSample = spark.read.csv('csv',inferSchema=True,header=True,samplingRatio=0.01)

In [50]:
s = dfFlightsSample.schema

In [51]:
s

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', IntegerType(), True)])

In [52]:
df.show()

+--------------------+----------------+------------+
|  DestinationCountry|   OriginCountry|FlightCounts|
+--------------------+----------------+------------+
|       United States|         Romania|           1|
|       United States|         Ireland|         264|
|       United States|           India|          69|
|               Egypt|   United States|          24|
|   Equatorial Guinea|   United States|           1|
|       United States|       Singapore|          25|
|       United States|         Grenada|          54|
|          Costa Rica|   United States|         477|
|             Senegal|   United States|          29|
|       United States|Marshall Islands|          44|
|              Guyana|   United States|          17|
|       United States|    Sint Maarten|          53|
|               Malta|   United States|           1|
|             Bolivia|   United States|          46|
|            Anguilla|   United States|          21|
|Turks and Caicos ...|   United States|       

24/05/29 11:00:11 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
 Schema: DestinationCountry, OriginCountry, FlightCounts
Expected: DestinationCountry but found: DEST_COUNTRY_NAME
CSV file: file:///home/hatem/PySpark/Intake%2044/L2_RDD_DataFrames/csv/2010-summary.csv


In [57]:
df = spark.read.format('csv')\
.schema(myschema)\
.load('csv')

In [61]:
df.show()

+--------------------+-------------------+------------+
|  DestinationCountry|      OriginCountry|FlightCounts|
+--------------------+-------------------+------------+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|        NULL|
|       United States|            Romania|           1|
|       United States|            Ireland|         264|
|       United States|              India|          69|
|               Egypt|      United States|          24|
|   Equatorial Guinea|      United States|           1|
|       United States|          Singapore|          25|
|       United States|            Grenada|          54|
|          Costa Rica|      United States|         477|
|             Senegal|      United States|          29|
|       United States|   Marshall Islands|          44|
|              Guyana|      United States|          17|
|       United States|       Sint Maarten|          53|
|               Malta|      United States|           1|
|             Bolivia|      United States|      

In [55]:
df.printSchema()

root
 |-- DestinationCountry: string (nullable = true)
 |-- OriginCountry: string (nullable = true)
 |-- FlightCounts: integer (nullable = true)



In [66]:
df = spark.read.format('csv')\
.schema(myschema)\
.option('mode','PERMISSIVE')\
.load('csv')

In [67]:
df.show()

+--------------------+-------------------+------------+
|  DestinationCountry|      OriginCountry|FlightCounts|
+--------------------+-------------------+------------+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|        NULL|
|       United States|            Romania|           1|
|       United States|            Ireland|         264|
|       United States|              India|          69|
|               Egypt|      United States|          24|
|   Equatorial Guinea|      United States|           1|
|       United States|          Singapore|          25|
|       United States|            Grenada|          54|
|          Costa Rica|      United States|         477|
|             Senegal|      United States|          29|
|       United States|   Marshall Islands|          44|
|              Guyana|      United States|          17|
|       United States|       Sint Maarten|          53|
|               Malta|      United States|           1|
|             Bolivia|      United States|      

In [69]:
df = spark.read.format('csv')\
.schema(myschema)\
.option('mode','dropmalformed')\
.load('csv')

In [71]:
df.show(truncate=False)

+--------------------------------+----------------+------------+
|DestinationCountry              |OriginCountry   |FlightCounts|
+--------------------------------+----------------+------------+
|United States                   |Romania         |1           |
|United States                   |Ireland         |264         |
|United States                   |India           |69          |
|Egypt                           |United States   |24          |
|Equatorial Guinea               |United States   |1           |
|United States                   |Singapore       |25          |
|United States                   |Grenada         |54          |
|Costa Rica                      |United States   |477         |
|Senegal                         |United States   |29          |
|United States                   |Marshall Islands|44          |
|Guyana                          |United States   |17          |
|United States                   |Sint Maarten    |53          |
|Malta                   

In [76]:
df = spark.read.format('csv')\
.schema(myschema)\
.option('mode','failfast')\
.load('csv')

In [77]:
df.show(truncate=False)

24/05/29 11:19:07 ERROR Executor: Exception in task 0.0 in stage 44.0 (TID 76)
org.apache.spark.SparkException: Encountered error while reading file file:///home/hatem/PySpark/Intake%2044/L2_RDD_DataFrames/csv/2010-summary.csv. Details:
	at org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:863)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:293)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.

Py4JJavaError: An error occurred while calling o355.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 44.0 failed 1 times, most recent failure: Lost task 0.0 in stage 44.0 (TID 76) (10.0.2.15 executor driver): org.apache.spark.SparkException: Encountered error while reading file file:///home/hatem/PySpark/Intake%2044/L2_RDD_DataFrames/csv/2010-summary.csv. Details:
	at org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:863)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:293)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: [MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION] Malformed records are detected in record parsing: [DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,null].
Parse Mode: FAILFAST. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'. 
	at org.apache.spark.sql.errors.QueryExecutionErrors$.malformedRecordsDetectedInRecordParsingError(QueryExecutionErrors.scala:1610)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:79)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$2(UnivocityParser.scala:456)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:283)
	... 22 more
Caused by: org.apache.spark.sql.catalyst.util.BadRecordException: java.lang.NumberFormatException: For input string: "count"
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:365)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$parse$2(UnivocityParser.scala:307)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$1(UnivocityParser.scala:452)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:60)
	... 28 more
Caused by: java.lang.NumberFormatException: For input string: "count"
	at java.base/java.lang.NumberFormatException.forInputString(NumberFormatException.java:67)
	at java.base/java.lang.Integer.parseInt(Integer.java:661)
	at java.base/java.lang.Integer.parseInt(Integer.java:777)
	at scala.collection.immutable.StringLike.toInt(StringLike.scala:310)
	at scala.collection.immutable.StringLike.toInt$(StringLike.scala:310)
	at scala.collection.immutable.StringOps.toInt(StringOps.scala:33)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6$adapted(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.nullSafeDatum(UnivocityParser.scala:291)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$5(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:346)
	... 31 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4344)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4334)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4332)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3549)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: Encountered error while reading file file:///home/hatem/PySpark/Intake%2044/L2_RDD_DataFrames/csv/2010-summary.csv. Details:
	at org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:863)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:293)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: org.apache.spark.SparkException: [MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION] Malformed records are detected in record parsing: [DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,null].
Parse Mode: FAILFAST. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'. 
	at org.apache.spark.sql.errors.QueryExecutionErrors$.malformedRecordsDetectedInRecordParsingError(QueryExecutionErrors.scala:1610)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:79)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$2(UnivocityParser.scala:456)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:283)
	... 22 more
Caused by: org.apache.spark.sql.catalyst.util.BadRecordException: java.lang.NumberFormatException: For input string: "count"
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:365)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$parse$2(UnivocityParser.scala:307)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$1(UnivocityParser.scala:452)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:60)
	... 28 more
Caused by: java.lang.NumberFormatException: For input string: "count"
	at java.base/java.lang.NumberFormatException.forInputString(NumberFormatException.java:67)
	at java.base/java.lang.Integer.parseInt(Integer.java:661)
	at java.base/java.lang.Integer.parseInt(Integer.java:777)
	at scala.collection.immutable.StringLike.toInt(StringLike.scala:310)
	at scala.collection.immutable.StringLike.toInt$(StringLike.scala:310)
	at scala.collection.immutable.StringOps.toInt(StringOps.scala:33)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6$adapted(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.nullSafeDatum(UnivocityParser.scala:291)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$5(UnivocityParser.scala:189)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:346)
	... 31 more


In [78]:
df = spark.read.format('csv')\
.schema(myschema)\
.option('mode','dropmalformed')\
.load('csv')

In [79]:
df.show()

+--------------------+----------------+------------+
|  DestinationCountry|   OriginCountry|FlightCounts|
+--------------------+----------------+------------+
|       United States|         Romania|           1|
|       United States|         Ireland|         264|
|       United States|           India|          69|
|               Egypt|   United States|          24|
|   Equatorial Guinea|   United States|           1|
|       United States|       Singapore|          25|
|       United States|         Grenada|          54|
|          Costa Rica|   United States|         477|
|             Senegal|   United States|          29|
|       United States|Marshall Islands|          44|
|              Guyana|   United States|          17|
|       United States|    Sint Maarten|          53|
|               Malta|   United States|           1|
|             Bolivia|   United States|          46|
|            Anguilla|   United States|          21|
|Turks and Caicos ...|   United States|       

In [80]:
df2 = df.select('OriginCountry','DestinationCountry','FlightCounts')

In [81]:
df2.printSchema()

root
 |-- OriginCountry: string (nullable = true)
 |-- DestinationCountry: string (nullable = true)
 |-- FlightCounts: integer (nullable = true)



In [82]:
df2.show()

+----------------+--------------------+------------+
|   OriginCountry|  DestinationCountry|FlightCounts|
+----------------+--------------------+------------+
|         Romania|       United States|           1|
|         Ireland|       United States|         264|
|           India|       United States|          69|
|   United States|               Egypt|          24|
|   United States|   Equatorial Guinea|           1|
|       Singapore|       United States|          25|
|         Grenada|       United States|          54|
|   United States|          Costa Rica|         477|
|   United States|             Senegal|          29|
|Marshall Islands|       United States|          44|
|   United States|              Guyana|          17|
|    Sint Maarten|       United States|          53|
|   United States|               Malta|           1|
|   United States|             Bolivia|          46|
|   United States|            Anguilla|          21|
|   United States|Turks and Caicos ...|       

In [83]:
dfCounts = df.select('FlightCounts')

In [84]:
dfCounts.printSchema()

root
 |-- FlightCounts: integer (nullable = true)



In [85]:
dfCounts.show()

+------------+
|FlightCounts|
+------------+
|           1|
|         264|
|          69|
|          24|
|           1|
|          25|
|          54|
|         477|
|          29|
|          44|
|          17|
|          53|
|           1|
|          46|
|          21|
|         136|
|           2|
|           1|
|         390|
|         156|
+------------+
only showing top 20 rows



In [86]:
dfDoubleCounts = df.select('FlightCounts * 2')

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `FlightCounts * 2` cannot be resolved. Did you mean one of the following? [`FlightCounts`, `OriginCountry`, `DestinationCountry`].;
'Project ['FlightCounts * 2]
+- Relation [DestinationCountry#851,OriginCountry#852,FlightCounts#853] csv


In [87]:
from pyspark.sql.functions import *
import pyspark.sql.functions as F

In [88]:
expr

<function pyspark.sql.functions.expr(str: str) -> pyspark.sql.column.Column>

In [89]:
sum

<function pyspark.sql.functions.sum(col: 'ColumnOrName') -> pyspark.sql.column.Column>

In [None]:
F.sum()

In [90]:
dfDoubleCounts = df.select(expr('FlightCounts * 2'))

In [91]:
dfDoubleCounts.show()

+------------------+
|(FlightCounts * 2)|
+------------------+
|                 2|
|               528|
|               138|
|                48|
|                 2|
|                50|
|               108|
|               954|
|                58|
|                88|
|                34|
|               106|
|                 2|
|                92|
|                42|
|               272|
|                 4|
|                 2|
|               780|
|               312|
+------------------+
only showing top 20 rows



In [92]:
dfDoubleCounts = df.select(expr('FlightCounts * 2').alias('DblCounts'))

In [93]:
dfDoubleCounts.show()

+---------+
|DblCounts|
+---------+
|        2|
|      528|
|      138|
|       48|
|        2|
|       50|
|      108|
|      954|
|       58|
|       88|
|       34|
|      106|
|        2|
|       92|
|       42|
|      272|
|        4|
|        2|
|      780|
|      312|
+---------+
only showing top 20 rows



In [94]:
dfDoubleCounts = df.select(expr('FlightCounts * 2 as dblcnt'))

In [95]:
dfDoubleCounts.show()

+------+
|dblcnt|
+------+
|     2|
|   528|
|   138|
|    48|
|     2|
|    50|
|   108|
|   954|
|    58|
|    88|
|    34|
|   106|
|     2|
|    92|
|    42|
|   272|
|     4|
|     2|
|   780|
|   312|
+------+
only showing top 20 rows



In [98]:
dfDoubleCounts = df.select(df['FlightCounts']*2)
dfDoubleCounts.show()

+------------------+
|(FlightCounts * 2)|
+------------------+
|                 2|
|               528|
|               138|
|                48|
|                 2|
|                50|
|               108|
|               954|
|                58|
|                88|
|                34|
|               106|
|                 2|
|                92|
|                42|
|               272|
|                 4|
|                 2|
|               780|
|               312|
+------------------+
only showing top 20 rows



In [99]:
dfDoubleCounts = df.select((df['FlightCounts']*2).alias('dblcnt'))
dfDoubleCounts.show()

+------+
|dblcnt|
+------+
|     2|
|   528|
|   138|
|    48|
|     2|
|    50|
|   108|
|   954|
|    58|
|    88|
|    34|
|   106|
|     2|
|    92|
|    42|
|   272|
|     4|
|     2|
|   780|
|   312|
+------+
only showing top 20 rows



In [100]:
df['FlightCounts']

Column<'FlightCounts'>

In [101]:
df.FlightCounts

Column<'FlightCounts'>

In [103]:
col('FlightCounts')

Column<'FlightCounts'>

In [105]:
dfDoubleCounts = df.select((col('FlightCounts')*2).alias('dblcnt'))
dfDoubleCounts.show()

+------+
|dblcnt|
+------+
|     2|
|   528|
|   138|
|    48|
|     2|
|    50|
|   108|
|   954|
|    58|
|    88|
|    34|
|   106|
|     2|
|    92|
|    42|
|   272|
|     4|
|     2|
|   780|
|   312|
+------+
only showing top 20 rows



In [106]:
dfsquareCounts = df.select((col('FlightCounts')**2).alias('sqcnt'))
dfsquareCounts.show()

+--------+
|   sqcnt|
+--------+
|     1.0|
| 69696.0|
|  4761.0|
|   576.0|
|     1.0|
|   625.0|
|  2916.0|
|227529.0|
|   841.0|
|  1936.0|
|   289.0|
|  2809.0|
|     1.0|
|  2116.0|
|   441.0|
| 18496.0|
|     4.0|
|     1.0|
|152100.0|
| 24336.0|
+--------+
only showing top 20 rows



In [108]:
dfsquareCounts = df.select(expr('FlightCounts ^ 2 as sqcnt'))
dfsquareCounts.show()

+-----+
|sqcnt|
+-----+
|    3|
|  266|
|   71|
|   26|
|    3|
|   27|
|   52|
|  479|
|   31|
|   46|
|   19|
|   55|
|    3|
|   44|
|   23|
|  138|
|    0|
|    3|
|  388|
|  158|
+-----+
only showing top 20 rows



In [109]:
df.printSchema()

root
 |-- DestinationCountry: string (nullable = true)
 |-- OriginCountry: string (nullable = true)
 |-- FlightCounts: integer (nullable = true)



In [110]:
df_manyflights = df.select('OriginCountry','FlightCounts').where('FlightCounts > 100')

In [111]:
df_manyflights

DataFrame[OriginCountry: string, FlightCounts: int]

In [113]:
df.cache()

DataFrame[DestinationCountry: string, OriginCountry: string, FlightCounts: int]

In [114]:
df_manyflights.show()

+-------------+------------+
|OriginCountry|FlightCounts|
+-------------+------------+
|      Ireland|         264|
|United States|         477|
|United States|         136|
|United States|         390|
|       Russia|         156|
|  Netherlands|         570|
|United States|         118|
|United States|         391|
|United States|         903|
|United States|         519|
|United States|         315|
|United States|         252|
|United States|         187|
|      Ecuador|         345|
|United States|        6200|
|United States|         272|
|     Portugal|         104|
|   Costa Rica|         501|
|    Guatemala|         333|
|United States|         785|
+-------------+------------+
only showing top 20 rows



In [115]:
df_manyflights = df.select('OriginCountry','FlightCounts').where(col('FlightCounts') > 100)

In [117]:
df_manyflights.show()

+-------------+------------+
|OriginCountry|FlightCounts|
+-------------+------------+
|      Ireland|         264|
|United States|         477|
|United States|         136|
|United States|         390|
|       Russia|         156|
|  Netherlands|         570|
|United States|         118|
|United States|         391|
|United States|         903|
|United States|         519|
|United States|         315|
|United States|         252|
|United States|         187|
|      Ecuador|         345|
|United States|        6200|
|United States|         272|
|     Portugal|         104|
|   Costa Rica|         501|
|    Guatemala|         333|
|United States|         785|
+-------------+------------+
only showing top 20 rows



In [118]:
df_manyflights = df.filter(col('FlightCounts') > 100)

In [119]:
df_manyflights.show()

+--------------------+-------------+------------+
|  DestinationCountry|OriginCountry|FlightCounts|
+--------------------+-------------+------------+
|       United States|      Ireland|         264|
|          Costa Rica|United States|         477|
|Turks and Caicos ...|United States|         136|
|               Italy|United States|         390|
|       United States|       Russia|         156|
|       United States|  Netherlands|         570|
|             Iceland|United States|         118|
|            Honduras|United States|         391|
|         The Bahamas|United States|         903|
|         El Salvador|United States|         519|
|         Switzerland|United States|         315|
|           Hong Kong|United States|         252|
| Trinidad and Tobago|United States|         187|
|       United States|      Ecuador|         345|
|              Mexico|United States|        6200|
|             Ecuador|United States|         272|
|       United States|     Portugal|         104|


In [120]:
df_new = df.select('OriginCountry','DestinationCountry',
                  'FlightCounts',(col('FlightCounts')*2).alias('Dblcount'))

In [121]:
df_new.show()

+----------------+--------------------+------------+--------+
|   OriginCountry|  DestinationCountry|FlightCounts|Dblcount|
+----------------+--------------------+------------+--------+
|         Romania|       United States|           1|       2|
|         Ireland|       United States|         264|     528|
|           India|       United States|          69|     138|
|   United States|               Egypt|          24|      48|
|   United States|   Equatorial Guinea|           1|       2|
|       Singapore|       United States|          25|      50|
|         Grenada|       United States|          54|     108|
|   United States|          Costa Rica|         477|     954|
|   United States|             Senegal|          29|      58|
|Marshall Islands|       United States|          44|      88|
|   United States|              Guyana|          17|      34|
|    Sint Maarten|       United States|          53|     106|
|   United States|               Malta|           1|       2|
|   Unit

In [122]:
df_new = df.withColumn('Dblcount',col('FlightCounts')*2)

In [123]:
df_new.printSchema()

root
 |-- DestinationCountry: string (nullable = true)
 |-- OriginCountry: string (nullable = true)
 |-- FlightCounts: integer (nullable = true)
 |-- Dblcount: integer (nullable = true)



In [124]:
df_new.show()

+--------------------+----------------+------------+--------+
|  DestinationCountry|   OriginCountry|FlightCounts|Dblcount|
+--------------------+----------------+------------+--------+
|       United States|         Romania|           1|       2|
|       United States|         Ireland|         264|     528|
|       United States|           India|          69|     138|
|               Egypt|   United States|          24|      48|
|   Equatorial Guinea|   United States|           1|       2|
|       United States|       Singapore|          25|      50|
|       United States|         Grenada|          54|     108|
|          Costa Rica|   United States|         477|     954|
|             Senegal|   United States|          29|      58|
|       United States|Marshall Islands|          44|      88|
|              Guyana|   United States|          17|      34|
|       United States|    Sint Maarten|          53|     106|
|               Malta|   United States|           1|       2|
|       

In [128]:
df_new_2 = df_new.withColumnRenamed('Dblcount','DoublCount')

In [129]:
df_new.printSchema()

root
 |-- DestinationCountry: string (nullable = true)
 |-- OriginCountry: string (nullable = true)
 |-- FlightCounts: integer (nullable = true)
 |-- Dblcount: integer (nullable = true)



In [130]:
df_new_2.printSchema()

root
 |-- DestinationCountry: string (nullable = true)
 |-- OriginCountry: string (nullable = true)
 |-- FlightCounts: integer (nullable = true)
 |-- DoublCount: integer (nullable = true)



In [131]:
dfFlightGroups =  df.groupBy('DestinationCountry','OriginCountry').avg('FlightCounts')

In [132]:
dfFlightGroups.show()

+------------------+-------------+------------------+
|DestinationCountry|OriginCountry| avg(FlightCounts)|
+------------------+-------------+------------------+
|           Croatia|United States|1.6666666666666667|
|            Kosovo|United States|               1.0|
|           Romania|United States|             10.75|
|           Ireland|United States|             265.5|
|     United States|        Egypt|              14.5|
|             India|United States|63.833333333333336|
|             Niger|United States|               1.5|
|         Singapore|United States|              20.4|
|           Grenada|United States|              53.5|
|     United States|   Costa Rica| 549.8333333333334|
|     United States|      Senegal|35.666666666666664|
|  Marshall Islands|United States|59.333333333333336|
|      Sint Maarten|United States|238.33333333333334|
|     United States|        Malta|              2.25|
|     United States|       Guyana|49.666666666666664|
|     United States|   Monte

In [133]:
dfFlightGroups =  df.groupBy('DestinationCountry','OriginCountry')\
.agg(avg('FlightCounts'),F.max('FlightCounts'),median('FlightCounts'))

In [134]:
dfFlightGroups.printSchema()

root
 |-- DestinationCountry: string (nullable = true)
 |-- OriginCountry: string (nullable = true)
 |-- avg(FlightCounts): double (nullable = true)
 |-- max(FlightCounts): integer (nullable = true)
 |-- median(FlightCounts): double (nullable = true)



In [135]:
dfFlightGroups =  df.groupBy('DestinationCountry','OriginCountry')\
.agg(avg('FlightCounts').alias('AvgCount'),
     F.max('FlightCounts').alias('MaxCount'),
     median('FlightCounts').alias('MedianCounts'))

In [136]:
dfFlightGroups.printSchema()

root
 |-- DestinationCountry: string (nullable = true)
 |-- OriginCountry: string (nullable = true)
 |-- AvgCount: double (nullable = true)
 |-- MaxCount: integer (nullable = true)
 |-- MedianCounts: double (nullable = true)



In [137]:
dfFlightGroups.show(truncate=False)

+---------------------------------+-------------+------------------+--------+------------+
|DestinationCountry               |OriginCountry|AvgCount          |MaxCount|MedianCounts|
+---------------------------------+-------------+------------------+--------+------------+
|Afghanistan                      |United States|8.0               |11      |8.0         |
|Algeria                          |United States|5.0               |9       |4.0         |
|Angola                           |United States|13.166666666666666|15      |13.0        |
|Anguilla                         |United States|26.333333333333332|41      |21.5        |
|Antigua and Barbuda              |United States|129.66666666666666|146     |124.5       |
|Argentina                        |United States|187.66666666666666|208     |183.5       |
|Aruba                            |United States|350.6666666666667 |359     |350.0       |
|Australia                        |United States|294.0             |329     |291.5       |

In [145]:
dfNull = spark.read.csv('NullData.csv',inferSchema=True,header=True)

In [146]:
dfNull.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| NULL|
|emp2| NULL| NULL|
|emp3| NULL|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [147]:
df_null = dfNull.withColumn('DoubleSale',expr('Sales * 2 as dblsale'))

In [148]:
df_null.show()

+----+-----+-----+----------+
|  Id| Name|Sales|DoubleSale|
+----+-----+-----+----------+
|emp1| John| NULL|      NULL|
|emp2| NULL| NULL|      NULL|
|emp3| NULL|345.0|     690.0|
|emp4|Cindy|456.0|     912.0|
+----+-----+-----+----------+



In [149]:
df_notnull = df_null.na.drop()

In [150]:
df_notnull.show()

+----+-----+-----+----------+
|  Id| Name|Sales|DoubleSale|
+----+-----+-----+----------+
|emp4|Cindy|456.0|     912.0|
+----+-----+-----+----------+



In [151]:
df_notnull = df_null.na.drop(how='all')

In [153]:
df_notnull.show()

+----+-----+-----+----------+
|  Id| Name|Sales|DoubleSale|
+----+-----+-----+----------+
|emp1| John| NULL|      NULL|
|emp2| NULL| NULL|      NULL|
|emp3| NULL|345.0|     690.0|
|emp4|Cindy|456.0|     912.0|
+----+-----+-----+----------+



In [154]:
df_notnull = df_null.na.drop(subset=['Name','Sales','DoubleSale'],how='all')
df_notnull.show()

+----+-----+-----+----------+
|  Id| Name|Sales|DoubleSale|
+----+-----+-----+----------+
|emp1| John| NULL|      NULL|
|emp3| NULL|345.0|     690.0|
|emp4|Cindy|456.0|     912.0|
+----+-----+-----+----------+



In [155]:
df_null.show()

+----+-----+-----+----------+
|  Id| Name|Sales|DoubleSale|
+----+-----+-----+----------+
|emp1| John| NULL|      NULL|
|emp2| NULL| NULL|      NULL|
|emp3| NULL|345.0|     690.0|
|emp4|Cindy|456.0|     912.0|
+----+-----+-----+----------+



In [156]:
df_notnull = df_null.na.drop(thresh=2)
df_notnull.show()

+----+-----+-----+----------+
|  Id| Name|Sales|DoubleSale|
+----+-----+-----+----------+
|emp1| John| NULL|      NULL|
|emp3| NULL|345.0|     690.0|
|emp4|Cindy|456.0|     912.0|
+----+-----+-----+----------+



In [157]:
df_notnull = df_null.na.drop(thresh=3)
df_notnull.show()

+----+-----+-----+----------+
|  Id| Name|Sales|DoubleSale|
+----+-----+-----+----------+
|emp3| NULL|345.0|     690.0|
|emp4|Cindy|456.0|     912.0|
+----+-----+-----+----------+



In [158]:
df_null.show()

+----+-----+-----+----------+
|  Id| Name|Sales|DoubleSale|
+----+-----+-----+----------+
|emp1| John| NULL|      NULL|
|emp2| NULL| NULL|      NULL|
|emp3| NULL|345.0|     690.0|
|emp4|Cindy|456.0|     912.0|
+----+-----+-----+----------+



In [159]:
df_notnull = df_null.na.drop(subset=['Name','Sales','DoubleSale'],thresh=2)
df_notnull.show()

+----+-----+-----+----------+
|  Id| Name|Sales|DoubleSale|
+----+-----+-----+----------+
|emp3| NULL|345.0|     690.0|
|emp4|Cindy|456.0|     912.0|
+----+-----+-----+----------+



In [161]:
df_fillnull = df_null.na.fill(15)
df_fillnull.show()

+----+-----+-----+----------+
|  Id| Name|Sales|DoubleSale|
+----+-----+-----+----------+
|emp1| John| 15.0|      15.0|
|emp2| NULL| 15.0|      15.0|
|emp3| NULL|345.0|     690.0|
|emp4|Cindy|456.0|     912.0|
+----+-----+-----+----------+



In [162]:
df_fillnull = df_null.na.fill('No Name')
df_fillnull.show()

+----+-------+-----+----------+
|  Id|   Name|Sales|DoubleSale|
+----+-------+-----+----------+
|emp1|   John| NULL|      NULL|
|emp2|No Name| NULL|      NULL|
|emp3|No Name|345.0|     690.0|
|emp4|  Cindy|456.0|     912.0|
+----+-------+-----+----------+



In [163]:
15*15

225

In [166]:
df_fillnull = df_null.na.fill(15,subset=['Sales']).na.fill(225,subset=['DoubleSale'])
df_fillnull.show()

+----+-----+-----+----------+
|  Id| Name|Sales|DoubleSale|
+----+-----+-----+----------+
|emp1| John| 15.0|     225.0|
|emp2| NULL| 15.0|     225.0|
|emp3| NULL|345.0|     690.0|
|emp4|Cindy|456.0|     912.0|
+----+-----+-----+----------+



In [168]:
df_fillnull = df_null.na.fill({'Name':'No Name','Sales':15,'DoubleSale':225})
df_fillnull.show()

+----+-------+-----+----------+
|  Id|   Name|Sales|DoubleSale|
+----+-------+-----+----------+
|emp1|   John| 15.0|     225.0|
|emp2|No Name| 15.0|     225.0|
|emp3|No Name|345.0|     690.0|
|emp4|  Cindy|456.0|     912.0|
+----+-------+-----+----------+



In [169]:
df_fillnull

DataFrame[Id: string, Name: string, Sales: double, DoubleSale: double]

In [170]:
df.printSchema()

root
 |-- DestinationCountry: string (nullable = true)
 |-- OriginCountry: string (nullable = true)
 |-- FlightCounts: integer (nullable = true)



In [171]:
df.createOrReplaceTempView('myview')

In [173]:
df_ManyFlights = spark.sql('select * from myview where FlightCounts>100')

In [174]:
df_ManyFlights.show()

+--------------------+-------------+------------+
|  DestinationCountry|OriginCountry|FlightCounts|
+--------------------+-------------+------------+
|       United States|      Ireland|         264|
|          Costa Rica|United States|         477|
|Turks and Caicos ...|United States|         136|
|               Italy|United States|         390|
|       United States|       Russia|         156|
|       United States|  Netherlands|         570|
|             Iceland|United States|         118|
|            Honduras|United States|         391|
|         The Bahamas|United States|         903|
|         El Salvador|United States|         519|
|         Switzerland|United States|         315|
|           Hong Kong|United States|         252|
| Trinidad and Tobago|United States|         187|
|       United States|      Ecuador|         345|
|              Mexico|United States|        6200|
|             Ecuador|United States|         272|
|       United States|     Portugal|         104|


24/05/29 12:42:20 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
 Schema: DestinationCountry, OriginCountry, FlightCounts
Expected: DestinationCountry but found: DEST_COUNTRY_NAME
CSV file: file:///home/hatem/PySpark/Intake%2044/L2_RDD_DataFrames/csv/2010-summary.csv


In [176]:
df_ManyFlights.write.format('parquet').\
mode('overwrite').\
option('path','manyflights').\
save()

24/05/29 13:00:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
 Schema: DestinationCountry, OriginCountry, FlightCounts
Expected: DestinationCountry but found: DEST_COUNTRY_NAME
CSV file: file:///home/hatem/PySpark/Intake%2044/L2_RDD_DataFrames/csv/2011-summary.csv
24/05/29 13:00:53 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 2, schema size: 3
CSV file: file:///home/hatem/PySpark/Intake%2044/L2_RDD_DataFrames/csv/2009-summary.csv
24/05/29 13:00:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
 Schema: DestinationCountry, OriginCountry, FlightCounts
Expected: DestinationCountry but found: DEST_COUNTRY_NAME
CSV file: file:///home/hatem/PySpark/Intake%2044/L2_RDD_DataFrames/csv/2012-summary.csv
24/05/29 13:00:53 WARN CSVHeaderChecker: CSV header does not conform to th

In [177]:
df_ManyFlights.write.format('jdbc').\
option('url','jdbc:postgresql://localhost:5432/database_example').\
option('dbtable','Dina').\
option('user','postgres').\
option('password','DM$1234').\
save()

24/05/29 13:02:15 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 2, schema size: 3
CSV file: file:///home/hatem/PySpark/Intake%2044/L2_RDD_DataFrames/csv/2009-summary.csv
24/05/29 13:02:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
 Schema: DestinationCountry, OriginCountry, FlightCounts
Expected: DestinationCountry but found: DEST_COUNTRY_NAME
CSV file: file:///home/hatem/PySpark/Intake%2044/L2_RDD_DataFrames/csv/2012-summary.csv
24/05/29 13:02:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
 Schema: DestinationCountry, OriginCountry, FlightCounts
Expected: DestinationCountry but found: DEST_COUNTRY_NAME
CSV file: file:///home/hatem/PySpark/Intake%2044/L2_RDD_DataFrames/csv/2011-summary.csv
24/05/29 13:02:15 WARN CSVHeaderChecker: CSV header does not conform to th

In [179]:
df_post = spark.read.format('jdbc').\
option('url','jdbc:postgresql://localhost:5432/database_example').\
option('dbtable','departuredelays').\
option('user','postgres').\
option('password','DM$1234').\
load()

In [181]:
df_post.printSchema()

root
 |-- date: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [180]:
df_post.show()

[Stage 88:>                                                         (0 + 1) / 1]

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1010630|  -10|     928|   RSW|        EWR|
|1021029|   87|     974|   RSW|        ORD|
|1021346|    0|     928|   RSW|        EWR|
|1021044|   18|     928|   RSW|        EWR|
|1021730|   29|     748|   RSW|        IAH|
|1020535|  605|     974|   RSW|        ORD|
|1021820|   71|     974|   RSW|        ORD|
|1021743|    0|     928|   RSW|        EWR|
|1022017|    0|     928|   RSW|        EWR|
|1020600|   -2|     748|   RSW|        IAH|
|1021214|   29|     891|   RSW|        CLE|
|1020630|   -5|     928|   RSW|        EWR|
|1031029|   13|     974|   RSW|        ORD|
|1031346|  279|     928|   RSW|        EWR|
|1031740|   29|     748|   RSW|        IAH|
|1030535|    0|     974|   RSW|        ORD|
|1031808|   -3|     974|   RSW|        ORD|
|1031516|   -2|    1396|   RSW|        DEN|
|1032017|   14|     928|   RSW|        EWR|
|1031214|   17|     891|   RSW| 

                                                                                