In [47]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg,
    col,
    round as rnd
)

spark = SparkSession.builder.appName("sql_import_csv").getOrCreate()
csv_file_path = "file:///home/jovyan/work/sample/age.csv"

# header option: either csv has header or not(default: header = false)
# inferSchema: either all columns are str or not
data = spark.read.option("header", "true")\
                 .option("inferSchema", "true")\
                 .csv(csv_file_path)
# data = spark.read.option("header", "true")\
#             .csv(csv_file_path)

# show schema
data.printSchema()

# show column name with data
data.select("name", "age").show()

# filter the data for age of 20 above
data.filter(data.age > 20).show()

# group by age and aggregates for count
data.groupBy("age").count().show()

# custom arithmetic
data.select(data.name, data.age, data.age - 10).show()

# column alias
data.select(data.name, col("age").alias("age1")).show()

# average
data.select(data.name, data.age, data.country).groupBy("country").avg("age").show()

# average & sort
data.select(data.name, data.age, data.country).groupBy("country").avg("age").sort("avg(age)").show()

# average & round
data.select(data.name, data.age, data.country)\
        .groupBy("country")\
        .agg(rnd(avg("age"), 2).alias("avg_age")).show()

+--------------------+-------+
|             country|avg_age|
+--------------------+-------+
|                Chad|  36.25|
|            Paraguay|  47.78|
|            Anguilla|   72.0|
|               Macao|   72.0|
|Heard Island and ...|   30.0|
|             Senegal|   53.0|
|              Sweden|  45.33|
|             Tokelau|  34.17|
|French Southern T...|  50.67|
|            Kiribati|  48.67|
|   Republic of Korea|  58.17|
|              Guyana|   39.0|
|             Eritrea|  39.75|
|              Jersey|   58.8|
|         Philippines|  48.33|
|            Djibouti|   38.6|
|               Tonga|   49.0|
|      Norfolk Island|  35.33|
|            Malaysia|  60.67|
|           Singapore|   40.0|
+--------------------+-------+
only showing top 20 rows



In [1]:
csv_file_path = "file:///home/jovyan/work/sample/age.csv"

123


In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import asc, desc 

spark = SparkSession.builder.appName("csv_test").getOrCreate()
csv_file_path = "file:///home/jovyan/work/sample/age.csv"
df = spark.read.option("header","true").option("inferSchema","true").csv(csv_file_path)

df.printSchema()


root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- country: string (nullable = true)



In [20]:
df.createOrReplaceTempView("age")

spark.sql (
    "select count(*) from age where age = 14" 
).show()

+--------+
|count(1)|
+--------+
|      16|
+--------+



In [23]:
from pyspark.sql import (
    functions,
    Row,
    SparkSession
)

spark = SparkSession.builder.appName("df_wordcount").getOrCreate()
csv_file_path = "file:///home/jovyan/work/sample/lorem_ipsum.txt"
line = spark.sparkContext.textFile(csv_file_path)

In [32]:
df = line.flatMap(lambda word : word.split(" ")).map(lambda a : (a,1)).reduceByKey(lambda a,b : (a+b))

for key, cnt in sorted(lambda a : 

[('Lorem', 12),
 ('dolor', 45),
 ('adipiscing', 57),
 ('sed', 194),
 ('eiusmod', 1),
 ('incididunt', 1),
 ('ut', 140),
 ('et', 117),
 ('dolore', 1),
 ('magna', 44),
 ('aliqua.', 1),
 ('Et', 18),
 ('pellentesque', 93),
 ('commodo', 45),
 ('egestas', 93),
 ('proin', 33),
 ('consequat', 44),
 ('interdum', 32),
 ('id', 127),
 ('vel', 79),
 ('elementum', 85),
 ('etiam.', 7),
 ('At', 14),
 ('Convallis', 7),
 ('velit', 53),
 ('laoreet', 30),
 ('Nam', 5),
 ('justo', 20),
 ('Metus', 4),
 ('eleifend', 23),
 ('mi', 65),
 ('nulla', 91),
 ('posuere', 33),
 ('cursus', 71),
 ('dictum.', 6),
 ('nec', 39),
 ('sagittis', 50),
 ('arcu.', 19),
 ('Amet', 29),
 ('sem', 34),
 ('cras.', 8),
 ('Nibh', 14),
 ('eget.', 12),
 ('pretium', 53),
 ('potenti', 7),
 ('Mauris', 19),
 ('neque', 70),
 ('ac', 97),
 ('at', 120),
 ('integer', 28),
 ('', 99),
 ('nibh.', 6),
 ('donec', 37),
 ('Eu', 18),
 ('turpis', 62),
 ('aenean', 35),
 ('cras', 52),
 ('mattis', 53),
 ('Congue', 8),
 ('in.', 30),
 ('non', 102),
 ('quis', 89),