In [1]:
! pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 43 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 46.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=730cd854ee328fc714d3093fda34bc364c92108b82daf1926da816e6fca7d67b
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [2]:
from pyspark.sql import SparkSession

In [None]:
# Open Colab's upload widget to bring people.json into the runtime.
from google.colab import files
uploaded = files.upload()

Saving people.json to people.json


In [5]:
# Create the Spark session (or reuse an existing one) that later cells read through.
spark = (
    SparkSession.builder
    .appName("Basic")
    .getOrCreate()
)

In [None]:
# Load the uploaded JSON file into a DataFrame; no schema is passed, so Spark infers one.
df = spark.read.json("people.json")

In [None]:
# Print the DataFrame's rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [None]:
# Show each column's name, data type and nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [None]:
# Column names as a plain Python list.
df.columns

['age', 'name']

In [None]:
# Summary statistics per column (count, mean, stddev, min, max).
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [None]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [None]:
# Explicit field list: age is declared as an integer rather than left to inference.
datashema = [
    StructField('age', IntegerType(), True),   # third argument: nullable
    StructField('name', StringType(), True),
]

In [None]:
# Wrap the field list in a StructType so it can be passed as a reader schema.
final_struct = StructType(datashema)

In [None]:
# Re-read the file, this time applying the explicit schema instead of inferring one.
df = spark.read.schema(final_struct).json('people.json')

In [None]:
# Check the column types after loading with the explicit schema.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [None]:
# Indexing by column name yields a Column object, not a DataFrame.
type(df['age'])

pyspark.sql.column.Column

In [None]:
# select() yields a DataFrame, even for a single column.
type(df.select('age'))

pyspark.sql.dataframe.DataFrame

In [None]:
# head(2) returns the first two rows as a list; [0] picks the first one.
df.head(2)[0]

Row(age=None, name='Michael')

In [None]:
# Each element returned by head() is a Row.
type(df.head(2)[0])

pyspark.sql.types.Row

In [None]:
# Selecting several columns returns a new DataFrame (nothing is computed until an action runs).
df.select('age', 'name')

DataFrame[age: int, name: string]

In [None]:
# Select two columns and print the result.
df.select('age', 'name').show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [None]:
# Create columns
df.withColumn('newage',df['age']*2).show() # the new column is only shown here; the result is not assigned back to df

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    60|
|  19| Justin|    38|
+----+-------+------+



In [None]:
# Assign the result back to df so the new column is kept.
df = df.withColumn('newage',df['age']*2)

In [None]:
# df now includes the newage column.
df.show()

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    60|
|  19| Justin|    38|
+----+-------+------+



In [None]:
# Rename newage to double_age; the result is assigned back to df.
df = df.withColumnRenamed('newage','double_age')

In [None]:
# Verify the rename.
df.show()

+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+



In [None]:
# Plain SQL can be used once the DataFrame is registered as a temporary view.

df.createOrReplaceTempView('people')  # expose df to SQL under the name 'people'
result = spark.sql('select * from people')  # the query result is itself a DataFrame
result.show()

+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+



In [None]:
# Open Colab's upload widget to bring appl_stock.csv into the runtime.
from google.colab import files
uploaded = files.upload()

Saving appl_stock.csv to appl_stock.csv


In [None]:
## The Spark way of transforming data: read the CSV using its header row for column names and let Spark infer the column types.
df_stock = spark.read.csv('appl_stock.csv',header=True,inferSchema=True)
df_stock.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

In [None]:
# Filter records using a SQL-style condition string.
df_stock.filter("Close <= 200").show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-22 00:00:00|206.78000600000001|        207.499996|            197.16|            197.75|220441900|         25.620401|
|2010-01-28 00:00:00|        204.930004|        205.500004|        198.699995|        199.289995|293375600|25.819922000000002|
|2010-01-29 00:00:00|        201.079996|        202.199995|        190.250002|        192.060003|311488100|         24.883208|
|2010-02-01 00:00:00|192.36999699999998|             196.0|191.29999899999999|        194.729998|187469100|         25.229131|
|2010-02-02 00:00:00|        195.909998|        196.319994|193.37999299999998|        195.859997|174585600|25.3

In [None]:
# Filter with a SQL-style condition string, then keep only three columns.
(
    df_stock
    .filter("Close<=200")
    .select('Date', 'Open', 'Close')
    .show()
)

+-------------------+------------------+------------------+
|               Date|              Open|             Close|
+-------------------+------------------+------------------+
|2010-01-22 00:00:00|206.78000600000001|            197.75|
|2010-01-28 00:00:00|        204.930004|        199.289995|
|2010-01-29 00:00:00|        201.079996|        192.060003|
|2010-02-01 00:00:00|192.36999699999998|        194.729998|
|2010-02-02 00:00:00|        195.909998|        195.859997|
|2010-02-03 00:00:00|        195.169994|        199.229994|
|2010-02-04 00:00:00|        196.730003|        192.050003|
|2010-02-05 00:00:00|192.63000300000002|        195.460001|
|2010-02-08 00:00:00|        195.690006|194.11999699999998|
|2010-02-09 00:00:00|        196.419996|196.19000400000002|
|2010-02-10 00:00:00|        195.889997|195.12000700000002|
|2010-02-11 00:00:00|        194.880001|        198.669994|
|2010-02-23 00:00:00|        199.999998|        197.059998|
|2014-06-09 00:00:00|         92.699997|

In [None]:
# Filter with a Column expression instead of a SQL string, then keep two columns.
(
    df_stock
    .filter(df_stock['Close'] >= 300)
    .select('Open', 'High')
    .show()
)

+------------------+------------------+
|              Open|              High|
+------------------+------------------+
|        300.200008|        301.959995|
|        301.690002|        302.469994|
|307.43998700000003|             315.0|
|        318.470013|        319.000011|
|303.40000200000003|        313.770012|
|        308.999996|        314.249996|
|312.35999300000003|314.73999399999997|
|309.07001099999997|310.03999300000004|
|        309.090012|        311.600002|
|306.86998700000004|        309.740013|
|307.65000499999996|         309.89999|
|        307.950012|             308.0|
|        304.230007|        305.880005|
|302.22000099999997|305.59999799999997|
|        307.000004|310.19001000000003|
|311.37001000000004|        312.880005|
|        315.449997|320.18001200000003|
|        317.990002|319.57001099999997|
|        317.199997|        319.769989|
|        321.049992|        321.300011|
+------------------+------------------+
only showing top 20 rows



In [None]:
# Filter on several columns at once. Each comparison is parenthesized because
# & binds more tightly than > in Python.
df_stock.filter(
    (df_stock['Close'] > 300) & (df_stock['Open'] > 350)
).show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2011-02-08 00:00:00|        353.680004|        355.519993|        352.150009|        355.200012| 95260200|         46.019554|
|2011-02-09 00:00:00|        355.189999|        358.999992|        354.869991|            358.16|120686300|46.403048999999996|
|2011-02-10 00:00:00|        357.389996|        359.999989|        348.000008|        354.539997|232137500|         45.934043|
|2011-02-11 00:00:00|        354.749989|        357.799992|353.54000099999996|         356.85001| 91893200|         46.233327|
|2011-02-14 00:00:00|        356.790009|359.48000299999995|356.71000699999996|        359.179996| 77604100|    

In [None]:
# Filter on one exact value.
# NOTE(review): this is an equality test on a floating-point column, which is fragile in general.
df_stock.filter(df_stock['Low'] == 197.16).show()

+-------------------+------------------+----------+------+------+---------+---------+
|               Date|              Open|      High|   Low| Close|   Volume|Adj Close|
+-------------------+------------------+----------+------+------+---------+---------+
|2010-01-22 00:00:00|206.78000600000001|207.499996|197.16|197.75|220441900|25.620401|
+-------------------+------------------+----------+------+------+---------+---------+



In [None]:
result = df_stock.filter(df_stock['Low'] == 197.16).collect() ## collect() returns the matching rows as a list of Row objects

In [None]:
# Convert the first collected Row to a dictionary.
type(result[0]) ## this is a Row object (no output is shown for this line; the cell ends with an assignment)
dic = result[0].asDict()

In [None]:
# Look up a single field of the row by column name.
dic['Volume']

220441900

In [None]:
# Open Colab's upload widget to bring sales_info.csv into the runtime.
from google.colab import files
uploaded = files.upload()

Saving sales_info.csv to sales_info.csv


In [None]:
# Load the sales data. Note: this rebinds df, which previously held the people data.
df = spark.read.csv('sales_info.csv',header=True,inferSchema=True)

In [None]:
# Print the sales rows.
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [None]:
## Group-by followed by an aggregation: average per company.

df.groupBy('Company').mean().show()

+-------+-----------------+
|Company|       avg(Sales)|
+-------+-----------------+
|   APPL|            370.0|
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+



In [None]:

# Number of rows per company.
df.groupBy('Company').count().show()

+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+



In [None]:
# Aggregate over the whole DataFrame (no grouping); the dict maps a column name to an aggregate function name.

df.agg({"Sales":"max"}).show()

+----------+
|max(Sales)|
+----------+
|     870.0|
+----------+



In [None]:
# Keep the grouped object in a variable, then aggregate it: total sales per company.
group_data = df.groupBy('Company')
group_data.agg({'Sales':'sum'}).show()

+-------+----------+
|Company|sum(Sales)|
+-------+----------+
|   APPL|    1480.0|
|   GOOG|     660.0|
|     FB|    1220.0|
|   MSFT|     967.0|
+-------+----------+



In [None]:
## Import pyspark functions 

from pyspark.sql.functions import avg,countDistinct,stddev 

In [None]:
# Number of distinct companies.
df.select(countDistinct('Company')).show()

+-----------------------+
|count(DISTINCT Company)|
+-----------------------+
|                      4|
+-----------------------+



In [None]:
# Average of Sales over all rows.
df.select(avg('Sales')).show()

+-----------------+
|       avg(Sales)|
+-----------------+
|360.5833333333333|
+-----------------+



In [None]:
# Standard deviation of Sales, with the result column named via alias(); nothing is displayed by this cell.
df_dev=df.select(stddev('Sales').alias('STD dev Sales'))

In [None]:
## Import pyspark functions 

from pyspark.sql.functions import format_number

In [None]:
# Format the standard deviation with 2 decimal places for display.
df_dev.select(format_number('STD dev Sales',2)).show()

+-------------------------------+
|format_number(STD dev Sales, 2)|
+-------------------------------+
|                         250.09|
+-------------------------------+



In [None]:
# Sort the data by Sales (ascending order, as the output shows).
df.orderBy('Sales').show()


+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



In [None]:
# Sort by Sales in descending order.
df.orderBy(df['Sales'].desc()).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|870.0|
|   APPL|   Mike|750.0|
|   MSFT|   Tina|600.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   GOOG|  Frank|340.0|
|   APPL|   John|250.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   APPL|  Linda|130.0|
|   MSFT|    Amy|124.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+



In [3]:
# Upload appl_stock.csv again (the same file was also uploaded earlier in this notebook).
from google.colab import files
uploaded = files.upload()

Saving appl_stock.csv to appl_stock.csv


In [6]:
# Dates and timestamps: reload the stock data (this rebinds df) and look at two columns.
df = spark.read.csv('appl_stock.csv', header=True, inferSchema=True)
df.select('Date', 'Open').show()

+-------------------+------------------+
|               Date|              Open|
+-------------------+------------------+
|2010-01-04 00:00:00|        213.429998|
|2010-01-05 00:00:00|        214.599998|
|2010-01-06 00:00:00|        214.379993|
|2010-01-07 00:00:00|            211.75|
|2010-01-08 00:00:00|        210.299994|
|2010-01-11 00:00:00|212.79999700000002|
|2010-01-12 00:00:00|209.18999499999998|
|2010-01-13 00:00:00|        207.870005|
|2010-01-14 00:00:00|210.11000299999998|
|2010-01-15 00:00:00|210.92999500000002|
|2010-01-19 00:00:00|        208.330002|
|2010-01-20 00:00:00|        214.910006|
|2010-01-21 00:00:00|        212.079994|
|2010-01-22 00:00:00|206.78000600000001|
|2010-01-25 00:00:00|202.51000200000001|
|2010-01-26 00:00:00|205.95000100000001|
|2010-01-27 00:00:00|        206.849995|
|2010-01-28 00:00:00|        204.930004|
|2010-01-29 00:00:00|        201.079996|
|2010-02-01 00:00:00|192.36999699999998|
+-------------------+------------------+
only showing top

In [8]:
from pyspark.sql.functions import dayofyear, hour, month, year, weekofyear,format_number,date_format

In [None]:
# Extract the day of the year from the Date column.
# year('Date') and month('Date') can be selected in the same way.
df.select(dayofyear('Date')).show()

+---------------+
|dayofyear(Date)|
+---------------+
|              4|
|              5|
|              6|
|              7|
|              8|
|             11|
|             12|
|             13|
|             14|
|             15|
|             19|
|             20|
|             21|
|             22|
|             25|
|             26|
|             27|
|             28|
|             29|
|             32|
+---------------+
only showing top 20 rows



In [9]:
# Add a Year column extracted from Date; the result goes into a new variable, so df itself is not rebound.
newdf = df.withColumn("Year",year(df['Date']))
newdf.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+----+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|Year|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+----+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|2010|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|2010|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|2010|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|2010|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.0600050000000

In [10]:
# Average closing price per year. Passing 'Close' to mean() aggregates only that
# column, instead of averaging every numeric column and then discarding all but
# one. The resulting columns are still 'Year' and 'avg(Close)'.
result = newdf.groupBy('Year').mean('Close')

In [17]:
# Give the aggregate column a readable name, then display it.
new = result.withColumnRenamed('avg(Close)','Average Closing Price')
new.show()


+----+---------------------+
|Year|Average Closing Price|
+----+---------------------+
|2015|   120.03999980555547|
|2013|    472.6348802857143|
|2014|    295.4023416507935|
|2012|    576.0497195640002|
|2016|   104.60400786904763|
|2010|    259.8424600000002|
|2011|   364.00432532142867|
+----+---------------------+



In [19]:
# Show the yearly averages formatted with 2 decimal places under a shorter column name.
new.select(
    'Year',
    format_number('Average Closing Price', 2).alias('Avg Close'),
).show()

+----+---------+
|Year|Avg Close|
+----+---------+
|2015|   120.04|
|2013|   472.63|
|2014|   295.40|
|2012|   576.05|
|2016|   104.60|
|2010|   259.84|
|2011|   364.00|
+----+---------+

