<a href="https://colab.research.google.com/github/VictorL85/pyspark/blob/main/agregacao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [None]:
spark = SparkSession.builder.getOrCreate()

In [None]:
df_video = spark.read.parquet('/content/videos-preparados.snappy.parquet', header=True, inferSchema=True)

In [None]:
df_video.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)
 |-- Interaction: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Keyword Index: double (nullable = true)
 |-- Features PCA: vector (nullable = true)
 |-- Features Normal: vector (nullable = true)
 |-- Features: vector (nullable = true)



In [None]:
df_video.groupBy('Keyword').count().show()

+----------------+-----+
|         Keyword|count|
+----------------+-----+
|computer science|   48|
|            lofi|   40|
|         finance|   39|
|             cnn|   50|
|           apple|   42|
|            news|   39|
|         mukbang|   45|
|       education|   24|
|       interview|   50|
|          crypto|   50|
|   mathchemistry|   15|
|            food|   48|
|    data science|   50|
|        trolling|   50|
|        tutorial|   50|
|      literature|   46|
|             sat|   49|
|         history|   49|
|           cubes|   49|
|           music|   46|
+----------------+-----+
only showing top 20 rows



In [None]:
df_video.groupBy('Keyword').agg(format_number(avg('Interaction'), 2).alias('media Interaction')).show()

+----------------+-----------------+
|         Keyword|media Interaction|
+----------------+-----------------+
|computer science|     1,226,793.02|
|            lofi|     4,167,085.88|
|         finance|       708,542.95|
|             cnn|       570,650.86|
|           apple|    10,873,628.21|
|            news|       251,688.72|
|         mukbang|    11,053,630.38|
|       education|     2,750,838.62|
|       interview|     3,044,867.04|
|          crypto|       413,676.20|
|   mathchemistry|     3,427,342.73|
|            food|     5,352,944.10|
|    data science|       562,465.28|
|        trolling|     1,484,584.88|
|        tutorial|     6,936,688.30|
|      literature|       881,726.50|
|             sat|     1,098,927.00|
|         history|    15,652,692.57|
|           cubes|    15,043,961.22|
|           music|    29,691,370.30|
+----------------+-----------------+
only showing top 20 rows



In [None]:
df_video.groupBy('Keyword').agg(max('Interaction').alias('Rank Interactions')).orderBy(desc('Rank Interactions')).show()

+--------+-----------------+
| Keyword|Rank Interactions|
+--------+-----------------+
| animals|       1593623628|
|   music|        922551152|
|     bed|        532691631|
| history|        440187490|
|   apple|        429916936|
| mrbeast|        300397699|
|  google|        239385460|
|business|        210025196|
|   cubes|        170925917|
|  sports|        106924567|
| mukbang|         87433858|
|    lofi|         86445177|
|tutorial|         69616442|
|  movies|         65253870|
|  marvel|         56247330|
|  how-to|         53053975|
|    food|         48754479|
| physics|         43463298|
|    asmr|         34411125|
|nintendo|         32268486|
+--------+-----------------+
only showing top 20 rows



In [None]:
df_video.groupBy('Keyword').agg(format_number(avg('Likes'), 2).alias('media Likes'),
                                format_number(var_samp('Likes'), 2).alias('variância Likes')).show()

+----------------+-----------+--------------------+
|         Keyword|media Likes|     variância Likes|
+----------------+-----------+--------------------+
|computer science|  33,663.54|    2,030,136,676.30|
|            lofi|  75,263.68|   67,951,488,729.40|
|         finance|  13,235.49|    1,285,092,542.36|
|             cnn|   8,835.74|       52,239,414.93|
|           apple| 118,078.90|  408,226,900,985.16|
|            news|   3,013.10|       23,296,795.36|
|         mukbang| 141,406.64|   87,236,124,034.19|
|       education|  62,292.33|   12,301,056,627.97|
|       interview|  73,326.94|   12,322,827,963.40|
|          crypto|   8,521.36|      838,950,927.62|
|   mathchemistry|  93,848.67|   25,924,833,047.10|
|            food|  95,014.65|   16,164,496,897.04|
|    data science|  17,175.78|      501,319,417.89|
|        trolling|  61,387.04|   10,309,316,974.98|
|        tutorial| 167,720.88|   62,167,859,559.05|
|      literature|  17,825.41|      347,735,617.05|
|           

In [None]:
df_video.groupBy('Keyword').agg(format_number(avg('Likes'), 0).alias('media Likes'),
                                format_number(min('Likes'), 0).alias('mínimo Likes'),
                                format_number(max('Likes'), 0).alias('máximo Likes')).show()

+----------------+-----------+------------+------------+
|         Keyword|media Likes|mínimo Likes|máximo Likes|
+----------------+-----------+------------+------------+
|computer science|     33,664|          32|     202,969|
|            lofi|     75,264|          53|   1,638,369|
|         finance|     13,235|          13|     213,398|
|             cnn|      8,836|         544|      31,563|
|           apple|    118,079|         184|   4,144,389|
|            news|      3,013|          82|      23,828|
|         mukbang|    141,407|          -1|   1,221,507|
|       education|     62,292|         324|     431,284|
|       interview|     73,327|          97|     493,471|
|          crypto|      8,521|          66|     182,663|
|   mathchemistry|     93,849|           0|     542,855|
|            food|     95,015|         774|     672,749|
|    data science|     17,176|          71|     118,846|
|        trolling|     61,387|         344|     455,277|
|        tutorial|    167,721| 

In [None]:
df_video.groupBy('Keyword').agg(first('Published At').alias('primeira publicação'),
                                last('Published At').alias('última publicação')).show()

+----------------+-------------------+-----------------+
|         Keyword|primeira publicação|última publicação|
+----------------+-------------------+-----------------+
|computer science|         2022-02-08|       2020-09-08|
|            lofi|         2022-06-07|       2020-07-19|
|         finance|         2020-09-23|       2017-12-31|
|             cnn|         2022-08-17|       2022-08-13|
|           apple|         2022-08-22|       2022-08-02|
|            news|         2022-08-22|       2022-08-23|
|         mukbang|         2020-04-18|       2022-08-24|
|       education|         2015-02-06|       2010-10-14|
|       interview|         2021-08-03|       2018-10-05|
|          crypto|         2022-08-23|       2022-08-22|
|   mathchemistry|         2020-08-11|       2019-10-04|
|            food|         2022-07-17|       2022-08-20|
|    data science|         2019-08-18|       2021-08-06|
|        trolling|         2022-08-23|       2022-07-20|
|        tutorial|         2018

In [None]:
df_video.groupBy('Title').agg(count('Title').alias('Normal'),countDistinct('Title').alias('Distinto')).show()

+--------------------+------+--------+
|               Title|Normal|Distinto|
+--------------------+------+--------+
|Military Tank Exp...|     1|       1|
|Heart Of The Fore...|     1|       1|
|My College Advice...|     1|       1|
|Physicists At CER...|     1|       1|
|How To ACTUALLY G...|     1|       1|
|SEAFOOD MUKBANG |...|     1|       1|
|Nintendo Switch S...|     1|       1|
|Numbaa 7 - Chess ...|     1|       1|
|Music Mix 2022 🎧...|     1|       1|
|🔥5 CRYPTO COINS ...|     1|       1|
|50 Cent Talks &qu...|     1|       1|
|Representing Numb...|     1|       1|
|Punishing Beginne...|     1|       1|
|World&#39;s *WEIR...|     1|       1|
|Trolling Jet Grie...|     1|       1|
|What Is Data Scie...|     1|       1|
|I Built MrBeast’s...|     1|       1|
|Music Mix 2022 🎧...|     1|       1|
|Apple September 6...|     1|       1|
|Fulham v. Brentfo...|     1|       1|
+--------------------+------+--------+
only showing top 20 rows



In [None]:
df_video.groupBy('Year').agg(count('Year').alias('Registros')).orderBy('Year').show()

+----+---------+
|Year|Registros|
+----+---------+
|2007|        2|
|2008|        1|
|2009|        9|
|2010|        6|
|2011|        4|
|2012|       12|
|2013|        6|
|2014|       10|
|2015|       15|
|2016|       34|
|2017|       47|
|2018|       57|
|2019|       86|
|2020|      158|
|2021|      229|
|2022|     1193|
+----+---------+



In [None]:
df_video.groupBy('Year').agg(count('Year').alias('Registros')).orderBy('Registros').show()

+----+---------+
|Year|Registros|
+----+---------+
|2008|        1|
|2007|        2|
|2011|        4|
|2013|        6|
|2010|        6|
|2009|        9|
|2014|       10|
|2012|       12|
|2015|       15|
|2016|       34|
|2017|       47|
|2018|       57|
|2019|       86|
|2020|      158|
|2021|      229|
|2022|     1193|
+----+---------+



In [None]:
df_video.groupBy('Year', 'Month').agg(count('Year').alias('Registros')).orderBy('Year', 'Month').show()

+----+-----+---------+
|Year|Month|Registros|
+----+-----+---------+
|2007|    7|        1|
|2007|   12|        1|
|2008|    7|        1|
|2009|    2|        2|
|2009|    6|        2|
|2009|    7|        1|
|2009|    8|        1|
|2009|   10|        1|
|2009|   12|        2|
|2010|    3|        1|
|2010|    5|        2|
|2010|    6|        1|
|2010|    9|        1|
|2010|   10|        1|
|2011|    2|        1|
|2011|    5|        1|
|2011|    9|        1|
|2011|   10|        1|
|2012|    1|        1|
|2012|    2|        3|
+----+-----+---------+
only showing top 20 rows



In [None]:
df_video.groupBy('Year', 'Month').agg(count('Year').alias('Registros')).orderBy('Registros').show()

+----+-----+---------+
|Year|Month|Registros|
+----+-----+---------+
|2011|    2|        1|
|2010|    9|        1|
|2016|    7|        1|
|2015|   12|        1|
|2012|    8|        1|
|2012|    4|        1|
|2015|    8|        1|
|2014|   12|        1|
|2013|    3|        1|
|2013|    6|        1|
|2010|    6|        1|
|2015|    4|        1|
|2012|   10|        1|
|2015|    2|        1|
|2015|   11|        1|
|2016|   10|        1|
|2009|   10|        1|
|2008|    7|        1|
|2011|    5|        1|
|2016|   12|        1|
+----+-----+---------+
only showing top 20 rows



In [None]:
janela = Window.partitionBy('Keyword').orderBy('Year')
df_video = df_video.withColumn('Likes por ano', format_number(avg('Likes').over(janela),2))
df_video.select('Keyword', 'Year', 'Likes por ano').show()

+-------+----+-------------+
|Keyword|Year|Likes por ano|
+-------+----+-------------+
|animals|2009| 1,357,197.00|
|animals|2010|   587,977.00|
|animals|2010|   587,977.00|
|animals|2013| 3,197,276.75|
|animals|2014| 3,258,727.83|
|animals|2014| 3,258,727.83|
|animals|2019| 2,950,868.57|
|animals|2020| 1,723,934.31|
|animals|2020| 1,723,934.31|
|animals|2020| 1,723,934.31|
|animals|2020| 1,723,934.31|
|animals|2020| 1,723,934.31|
|animals|2020| 1,723,934.31|
|animals|2020| 1,723,934.31|
|animals|2020| 1,723,934.31|
|animals|2020| 1,723,934.31|
|animals|2021| 1,186,866.12|
|animals|2021| 1,186,866.12|
|animals|2021| 1,186,866.12|
|animals|2021| 1,186,866.12|
+-------+----+-------------+
only showing top 20 rows



In [None]:
spark.stop()