In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Obtain the Spark session used by every cell below (getOrCreate returns the
# already-active session when there is one).
spark = (
    SparkSession.builder
    .appName("spark_ml_practise")
    .getOrCreate()
)

In [0]:
# Explicit schema for the purchases file, built one column at a time.
# Every column is declared nullable; Time is kept as a plain string.
schema = (
    StructType()
    .add('Date', DateType(), True)
    .add('Time', StringType(), True)
    .add('City', StringType(), True)
    .add('Item', StringType(), True)
    .add('Total', DoubleType(), True)
    .add('Payment', StringType(), True)
)

In [0]:
# Load the tab-separated purchases file with the header option enabled,
# applying the explicit schema instead of inferring column types.
df = (
    spark.read
    .options(header=True, sep='\t')
    .schema(schema)
    .csv("/FileStore/tables/purchases.csv")
)

In [0]:
# Preview the first 10 rows of the loaded purchases.
df.show(n=10)

+----------+-----+--------------+--------------------+------+----------+
|      Date| Time|          City|                Item| Total|   Payment|
+----------+-----+--------------+--------------------+------+----------+
|2012-01-01|09:00|      San Jose|      Men's Clothing|214.05|      Amex|
|2012-01-01|09:00|    Fort Worth|    Women's Clothing|153.57|      Visa|
|2012-01-01|09:00|     San Diego|               Music| 66.08|      Cash|
|2012-01-01|09:00|    Pittsburgh|        Pet Supplies|493.51|  Discover|
|2012-01-01|09:00|         Omaha| Children's Clothing|235.63|MasterCard|
|2012-01-01|09:00|      Stockton|      Men's Clothing|247.18|MasterCard|
|2012-01-01|09:00|        Austin|             Cameras| 379.6|      Visa|
|2012-01-01|09:00|      New York|Consumer Electronics| 296.8|      Cash|
|2012-01-01|09:00|Corpus Christi|                Toys| 25.38|  Discover|
|2012-01-01|09:00|    Fort Worth|                Toys|213.88|      Visa|
+----------+-----+--------------+------------------

In [0]:
# Quick profile of the data: row count, column types, then summary statistics.
n_rows = df.count()
print(n_rows)

df.printSchema()

summary_stats = df.describe()
summary_stats.show()

root
 |-- Date: date (nullable = true)
 |-- Time: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Total: double (nullable = true)
 |-- Payment: string (nullable = true)

+-------+-------+-------------+----------------+------------------+-------+
|summary|   Time|         City|            Item|             Total|Payment|
+-------+-------+-------------+----------------+------------------+-------+
|  count|4138476|      4138476|         4138476|           4138476|4138476|
|   mean|   null|         null|            null| 249.9610854962055|   null|
| stddev|   null|         null|            null|144.31741115579197|   null|
|    min|  09:00|  Albuquerque|            Baby|               0.0|   Amex|
|    max|  17:59|Winston–Salem|Women's Clothing|            499.99|   Visa|
+-------+-------+-------------+----------------+------------------+-------+



In [0]:
# Project the data down to just the city and the purchase total.
df_CT = df.select(df["City"], df["Total"])
df_CT.show(n=10)


+--------------+------+
|          City| Total|
+--------------+------+
|      San Jose|214.05|
|    Fort Worth|153.57|
|     San Diego| 66.08|
|    Pittsburgh|493.51|
|         Omaha|235.63|
|      Stockton|247.18|
|        Austin| 379.6|
|      New York| 296.8|
|Corpus Christi| 25.38|
|    Fort Worth|213.88|
+--------------+------+
only showing top 10 rows



In [0]:
# Keep only purchases whose Total exceeds 200: first on the two-column
# projection, then on the full frame.
df_CT.where(df_CT.Total > 200).show(n=10)
df.where(df.Total > 200).show(n=10)

+-------------+------+
|         City| Total|
+-------------+------+
|     San Jose|214.05|
|   Pittsburgh|493.51|
|        Omaha|235.63|
|     Stockton|247.18|
|       Austin| 379.6|
|     New York| 296.8|
|   Fort Worth|213.88|
|       Austin|469.63|
|   Greensboro|290.82|
|San Francisco|260.65|
+-------------+------+
only showing top 10 rows

+----------+-----+-------------+--------------------+------+----------+
|      Date| Time|         City|                Item| Total|   Payment|
+----------+-----+-------------+--------------------+------+----------+
|2012-01-01|09:00|     San Jose|      Men's Clothing|214.05|      Amex|
|2012-01-01|09:00|   Pittsburgh|        Pet Supplies|493.51|  Discover|
|2012-01-01|09:00|        Omaha| Children's Clothing|235.63|MasterCard|
|2012-01-01|09:00|     Stockton|      Men's Clothing|247.18|MasterCard|
|2012-01-01|09:00|       Austin|             Cameras| 379.6|      Visa|
|2012-01-01|09:00|     New York|Consumer Electronics| 296.8|      Cash|
|201

In [0]:
# Show the first 10 purchases when sorted by city name, ascending.
df.sort(df['City'].asc()).show(n=10)

+----------+-----+-----------+-----------------+------+----------+
|      Date| Time|       City|             Item| Total|   Payment|
+----------+-----+-----------+-----------------+------+----------+
|2012-11-22|15:29|Albuquerque|           Garden|214.49|      Amex|
|2012-01-01|09:15|Albuquerque|            Music| 305.0|MasterCard|
|2012-04-04|10:08|Albuquerque|           Garden| 271.3|MasterCard|
|2012-01-01|09:15|Albuquerque|Health and Beauty|192.48|      Amex|
|2012-05-20|14:52|Albuquerque|   Men's Clothing|114.61|      Amex|
|2012-01-01|09:08|Albuquerque|     Pet Supplies|484.24|  Discover|
|2012-04-04|10:34|Albuquerque|           Garden|327.88|      Amex|
|2012-01-01|09:09|Albuquerque|           Crafts|149.94|      Cash|
|2012-07-06|10:26|Albuquerque|            Books|309.12|      Visa|
|2012-01-01|09:03|Albuquerque|     Pet Supplies| 440.7|      Cash|
+----------+-----+-----------+-----------------+------+----------+
only showing top 10 rows



In [0]:
# Number of purchase rows recorded for each city (show() prints the first 20 groups).
(
    df.groupBy('City')
    .count()
    .show()
)

+---------------+-----+
|           City|count|
+---------------+-----+
|North Las Vegas|40013|
|        Phoenix|40333|
|          Omaha|40209|
|      Anchorage|39806|
|        Anaheim|40086|
|     Greensboro|40232|
|         Dallas|40368|
|        Oakland|39728|
|         Laredo|40342|
|     Scottsdale|40173|
|    San Antonio|40197|
|    Bakersfield|40326|
|        Raleigh|40261|
|    Chula Vista|40080|
|   Philadelphia|40748|
|     Louisville|40099|
|    Los Angeles|40254|
|       Chandler|39826|
|     Sacramento|40561|
|   Indianapolis|40321|
+---------------+-----+
only showing top 20 rows



In [0]:
# Append a generated id column named "index" to every row.
# NOTE: the ids start at 0, 1, 2, ... but are not a dense row number -- later
# query output in this notebook shows values such as 17179870049 -- so treat
# the column as a unique, increasing id rather than a row position.
df_index = df.select("*", monotonically_increasing_id().alias("index"))

In [0]:
# Show 10 rows whose generated id is above 50.
df_index.where(col("index") > 50).show(n=10)

+----------+-----+----------+-----------------+------+----------+-----+
|      Date| Time|      City|             Item| Total|   Payment|index|
+----------+-----+----------+-----------------+------+----------+-----+
|2012-01-01|09:02|  New York|           Garden| 18.27|  Discover|   51|
|2012-01-01|09:02|Greensboro|        Computers|140.94|  Discover|   52|
|2012-01-01|09:02|   Spokane|             Toys|157.48|MasterCard|   53|
|2012-01-01|09:02|     Boise|      Video Games|350.55|  Discover|   54|
|2012-01-01|09:02|    Fresno|           Crafts|196.83|      Visa|   55|
|2012-01-01|09:02|    Durham|             Toys|425.79|      Visa|   56|
|2012-01-01|09:02| Riverside|              CDs|472.71|      Cash|   57|
|2012-01-01|09:02| Lexington| Women's Clothing|359.29|      Visa|   58|
|2012-01-01|09:02|    Durham|Health and Beauty|131.97|      Visa|   59|
|2012-01-01|09:02|   Garland|           Garden|134.33|  Discover|   60|
+----------+-----+----------+-----------------+------+----------

In [0]:
# Register the indexed frame as the temp view "purchase" so it can be
# queried with Spark SQL (an existing view of that name is replaced).
df_index.createOrReplaceTempView("purchase")

# Read the view back with a plain SELECT and preview the first 10 rows.
df_sql = spark.sql("SELECT * FROM purchase")
df_sql.show(n=10)

+----------+-----+--------------+--------------------+------+----------+-----+
|      Date| Time|          City|                Item| Total|   Payment|index|
+----------+-----+--------------+--------------------+------+----------+-----+
|2012-01-01|09:00|      San Jose|      Men's Clothing|214.05|      Amex|    0|
|2012-01-01|09:00|    Fort Worth|    Women's Clothing|153.57|      Visa|    1|
|2012-01-01|09:00|     San Diego|               Music| 66.08|      Cash|    2|
|2012-01-01|09:00|    Pittsburgh|        Pet Supplies|493.51|  Discover|    3|
|2012-01-01|09:00|         Omaha| Children's Clothing|235.63|MasterCard|    4|
|2012-01-01|09:00|      Stockton|      Men's Clothing|247.18|MasterCard|    5|
|2012-01-01|09:00|        Austin|             Cameras| 379.6|      Visa|    6|
|2012-01-01|09:00|      New York|Consumer Electronics| 296.8|      Cash|    7|
|2012-01-01|09:00|Corpus Christi|                Toys| 25.38|  Discover|    8|
|2012-01-01|09:00|    Fort Worth|                Toy

In [0]:
# SQL version of the city sort: every column of the view, ordered by City.
(
    spark
    .sql("SELECT * FROM purchase ORDER BY City")
    .show()
)

+----------+-----+-----------+-------------------+------+----------+-----------+
|      Date| Time|       City|               Item| Total|   Payment|      index|
+----------+-----+-----------+-------------------+------+----------+-----------+
|2012-11-22|16:39|Albuquerque|       Pet Supplies| 53.76|      Cash|60129544255|
|2012-04-04|10:32|Albuquerque|              Books| 19.32|MasterCard|17179870049|
|2012-01-01|10:16|Albuquerque|               DVDs|299.37|      Visa|       1608|
|2012-04-04|10:06|Albuquerque|               DVDs|  8.89|      Cash|17179869487|
|2012-07-06|11:11|Albuquerque|               DVDs|215.62|      Visa|34359740209|
|2012-04-04|10:34|Albuquerque|             Garden|327.88|      Amex|17179870075|
|2012-01-01|11:53|Albuquerque|          Computers|  0.11|      Cash|       3605|
|2012-04-04|09:52|Albuquerque|Children's Clothing|155.25|      Amex|17179869200|
|2012-08-21|15:53|Albuquerque|        Video Games| 96.08|  Discover|42949674487|
|2012-04-04|10:36|Albuquerqu

In [0]:
# SQL version of the Total filter: purchases over 200, ordered by City.
# The column is written as `Total`, matching the schema's spelling and the
# other cells, so the query also resolves when spark.sql.caseSensitive is
# enabled (the previous `TOTAL` relied on case-insensitive resolution).
spark.sql("SELECT * FROM purchase WHERE Total > 200 ORDER BY City").show(10)

+----------+-----+-----------+-----------------+------+----------+-----------+
|      Date| Time|       City|             Item| Total|   Payment|      index|
+----------+-----+-----------+-----------------+------+----------+-----------+
|2012-11-22|15:49|Albuquerque|Health and Beauty|415.89|MasterCard|60129543191|
|2012-07-06|10:48|Albuquerque|          Cameras|330.83|      Amex|34359739739|
|2012-02-16|14:40|Albuquerque|   Sporting Goods|255.31|      Amex| 8589935492|
|2012-07-06|11:22|Albuquerque|              CDs|496.78|      Cash|34359740457|
|2012-01-01|09:35|Albuquerque|           Crafts|218.31|      Amex|        741|
|2012-07-06|10:57|Albuquerque|             Baby|271.91|      Visa|34359739926|
|2012-02-16|15:14|Albuquerque|            Books|320.15|MasterCard| 8589936173|
|2012-07-06|09:57|Albuquerque|             Toys|236.87|      Cash|34359738650|
|2012-08-21|15:54|Albuquerque|            Books|283.46|      Amex|42949674502|
|2012-07-06|10:57|Albuquerque|   Sporting Goods|403.