In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz

In [18]:
import os
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!pip install findspark



In [19]:
import findspark
findspark.init()
!pip3 install pyspark==3.0.0



<h1><center>Инициализация</center></h1>

In [6]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import DoubleType, IntegerType, ArrayType, StringType
from pyspark.sql.functions import udf, explode, rank, col, max, sum, desc, countDistinct
import re
from typing import List
import pyspark.sql as sql

In [7]:
# Create (or reuse) the Spark session; the spark-xml package is requested
# through spark.jars.packages so that the 'xml' read format is available.
spark = (
    SparkSession.builder
    .appName("L2_reports_with_apache_spark")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.13.0")
    .getOrCreate()
)

<h1><center>Загрузка</center></h1>

In [8]:
import os

# Input locations: the programming-languages list and the posts sample.
prog_path = '/content/programming-languages.csv'
posts_path = '/content/posts_sample.xml'

# Read the posts XML using 'row' as the row tag.
posts = (
    spark.read
    .format('xml')
    .options(rowTag='row')
    .load(posts_path)
)

In [10]:
# Load the programming-languages CSV with a header row and inferred column types.
# NOTE(review): "DateTimeFormat" does not appear among the documented CSV reader
# options (those are "timestampFormat" / "dateFormat"); presumably it has no
# effect - confirm before relying on it.
program = (
    spark.read
    .options(header=True, inferSchema=True)
    .option("DateTimeFormat", 'M/d/y H:m')
    .csv(prog_path)
)

<h1><center>Решение</center></h1>

In [16]:
def get_year(date_and_time):
    """Return the ``year`` attribute of a date/datetime-like value.

    Args:
        date_and_time: an object exposing a ``year`` attribute, or ``None``.

    Returns:
        The year, or ``None`` when ``date_and_time`` is ``None``. Returning
        ``None`` (instead of raising ``AttributeError``) keeps a single missing
        date from failing the whole job when this function is used as a UDF.
    """
    if date_and_time is None:
        return None
    return date_and_time.year

def get_tags(tags_string):
    """Extract tag names from a string shaped like ``<tag1><tag2>...``.

    Returns the texts found between angle brackets, in order of appearance,
    or an empty list when ``tags_string`` is ``None``.
    """
    if tags_string is not None:
        # Non-greedy match so every <...> group yields its own tag.
        return re.findall(r'<(.+?)>', tags_string)
    return []

# Wrap the plain-Python helpers as Spark UDFs with explicit return types.
get_tags_udf = udf(get_tags, ArrayType(StringType()))
get_year_udf = udf(get_year, IntegerType())

# Derive the tag list and the year of last activity for every post, then keep
# only the columns the report needs.
posts_data_simplified = posts \
                    .withColumn("tags", get_tags_udf(posts["_Tags"])) \
                    .withColumn("year", get_year_udf(posts["_LastActivityDate"]))
posts_data_simplified = posts_data_simplified.select(col("tags"), col("year"), col("_ViewCount").alias("views"))

# Preview the first ten simplified rows.
first_rows = posts_data_simplified.head(10)
for i, row in enumerate(first_rows):
    print(i+1, row)


# One row per (post, tag) pair.
posts_data_sort = posts_data_simplified.select("year", explode("tags").alias("tag"), "views")

# Group by year of last activity and tag, summing the views of each tag within
# a year (`sum` here is the pyspark.sql.functions aggregate, not the builtin).
# NOTE(review): the original comment speaks of programming languages, but no
# filtering against the `program` list is visible here - every tag is counted.
posts_data_sort = posts_data_sort.groupBy("year", "tag").agg(sum("views").alias("total_views"))

posts_data_sort = posts_data_sort.orderBy("year", desc("total_views"))
posts_data_sort.show()


# Rank tags inside each year by total views, highest first.
window = Window.partitionBy("year").orderBy(posts_data_sort["total_views"].desc())

# Add the rank column and keep ranks 1..5; rank() gives tied totals the same
# rank, so a year can contribute more than five rows.
rank_df = posts_data_sort.withColumn("rank", rank().over(window))
res_df = rank_df.filter(rank_df["rank"] <= 5)
res_df = res_df.select("year", "tag", "total_views")

posts_data_sort_res = res_df.orderBy("year", desc("total_views"))
posts_data_sort_res.show()
# Overwrite any output left by a previous run; without an explicit mode the
# write fails when the target directory already exists.
posts_data_sort_res.write.mode("overwrite").parquet("posts_data_sorted_result.parquet")

1 Row(tags=['c#', 'floating-point', 'type-conversion', 'double', 'decimal'], year=2019, views=42817)
2 Row(tags=['html', 'css', 'internet-explorer-7'], year=2019, views=18214)
3 Row(tags=[], year=2017, views=None)
4 Row(tags=['c#', '.net', 'datetime'], year=2019, views=555183)
5 Row(tags=['c#', 'datetime', 'time', 'datediff', 'relative-time-span'], year=2019, views=149445)
6 Row(tags=[], year=2018, views=None)
7 Row(tags=['html', 'browser', 'timezone', 'user-agent', 'timezone-offset'], year=2019, views=176405)
8 Row(tags=['.net', 'math'], year=2018, views=123231)
9 Row(tags=[], year=2010, views=None)
10 Row(tags=[], year=2010, views=None)
+----+--------------------+-----------+
|year|                 tag|total_views|
+----+--------------------+-----------+
|2008|                  c#|      25401|
|2008|                .net|      24321|
|2008|            database|      19682|
|2008|               local|      19682|
|2008|                java|      11532|
|2008|         inheritance|      

In [14]:
import shutil

directory_path = "posts_data_sorted_result.parquet"
# Remove the output directory of the report; a missing directory (fresh kernel,
# cell run twice) is not an error - there is simply nothing to clean up.
try:
    shutil.rmtree(directory_path)
except FileNotFoundError:
    pass