In [1]:
!pip install pyspark
import os
import sys
from pyspark.sql import SparkSession
from typing import NamedTuple
from datetime import datetime

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
app_name = "lr2"
spark = SparkSession.builder \
    .appName(app_name) \
    .master("local[1]") \
    .getOrCreate()
sc = spark.sparkContext
sc

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=e70229887cfe150563239115a6dc1d344791307a6d94e9f3da717c8abb35ae35
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
# Shut down the SparkContext and the SparkSession created in the first cell.
# NOTE(review): this cell is placed before the cells that call spark.read /
# spark.createDataFrame, so executing the notebook top to bottom would stop the
# session before it is used. It looks like a manual clean-up cell meant to be run
# last (it has no execution count) — confirm and consider moving it to the end.
sc.stop()
spark.stop()

In [2]:
from pyspark.sql.functions import col, explode, regexp_extract, lower, desc, split, regexp_replace
from pyspark.sql.types import IntegerType

In [3]:
def parse_xml(row):
    """Turn one XML element string into an ``(Id, CreationDate, Tags)`` tuple.

    Args:
        row: Text of a single XML element carrying ``Id``, ``CreationDate`` and,
            optionally, ``Tags`` attributes.

    Returns:
        ``(int(Id), CreationDate, Tags)`` where ``Tags`` falls back to ``''``
        when the attribute is absent, or ``None`` if anything goes wrong while
        parsing (malformed XML, missing ``Id``/``CreationDate``, non-integer
        ``Id``, ...).
    """
    import xml.etree.ElementTree as ET

    try:
        attrs = ET.fromstring(row).attrib
        post_id = int(attrs['Id'])
        created = attrs['CreationDate']
        tags = attrs.get('Tags', '')
    except Exception:
        # Best effort: an unparseable line is reported as None so the caller can drop it.
        return None
    return (post_id, created, tags)

In [7]:
# Load the data: read the posts file as an RDD of plain strings, one per line.
posts_lines = (
    spark.read.text("data/posts_sample.xml")
    .rdd
    .map(lambda text_row: text_row[0])
)
total_lines = posts_lines.count()

# Keep only the lines strictly between the first and the last line of the file
# (presumably the XML declaration and the closing root tag — verify against the data).
posts_data = (
    posts_lines.zipWithIndex()
    .filter(lambda pair: 0 < pair[1] < total_lines - 1)
    .map(lambda pair: pair[0])
)

In [8]:
# Parse every line into an (Id, CreationDate, Tags) tuple; lines for which
# parse_xml returns None are dropped.
posts_rdd = posts_data.map(parse_xml).filter(lambda x: x is not None)

# Extract the year and the individual tags.
# The year pattern is a raw string: "\d" inside a normal string literal is an
# invalid escape sequence (a warning today, scheduled to become an error).
# Splitting on the spaces that replace "<" and ">" also yields empty strings;
# they are expected to disappear in the inner join below (assuming the CSV has
# no empty language name).
posts_df = (
    spark.createDataFrame(posts_rdd, schema="Id INT, CreationDate STRING, Tags STRING")
    .withColumn("Year", regexp_extract("CreationDate", r"(\d{4})", 1).cast(IntegerType()))
    .filter((col("Year") >= 2010) & (col("Year") <= 2020))
    .withColumn("Language", explode(split(regexp_replace("Tags", r"<|>", " "), " ")))
)

# Load the list of programming languages.
languages_df = spark.read.csv("data/programming-languages.csv", header=True).select(col("name").alias("LanguageName"))

# Match tags against language names, ignoring case.
result_df = posts_df.join(languages_df, lower(posts_df.Language) == lower(languages_df.LanguageName), "inner")

# Count how often each language is mentioned per year.
language_popularity = result_df.groupBy("Year", "LanguageName").count().orderBy("Year", desc("count"))


In [9]:
# Print the top 10 languages of every year and save each ranking to Parquet,
# overwriting any previous output.
for year in range(2010, 2021):
    year_df = (
        language_popularity
        .filter(col("Year") == year)
        # show() and write below are separate Spark actions that each re-run
        # this query. Without a tie-breaker, languages with equal counts could
        # be ordered (and cut off by limit) differently between the printed
        # table and the saved file, so ties are resolved by language name.
        .orderBy(desc("count"), "LanguageName")
        .limit(10)
    )
    print(f"Top 10 Languages for {year}:")
    year_df.show()
    year_df.write.mode("overwrite").parquet(f"output/language_popularity_{year}.parquet")

Top 10 Languages for 2010:
+----+------------+-----+
|Year|LanguageName|count|
+----+------------+-----+
|2010|        Java|   52|
|2010|         PHP|   46|
|2010|  JavaScript|   44|
|2010|      Python|   26|
|2010| Objective-C|   23|
|2010|           C|   20|
|2010|        Ruby|   12|
|2010|      Delphi|    8|
|2010|        Bash|    3|
|2010|           R|    3|
+----+------------+-----+

Top 10 Languages for 2011:
+----+------------+-----+
|Year|LanguageName|count|
+----+------------+-----+
|2011|         PHP|  102|
|2011|        Java|   93|
|2011|  JavaScript|   83|
|2011|      Python|   37|
|2011| Objective-C|   34|
|2011|           C|   24|
|2011|        Ruby|   20|
|2011|        Perl|    9|
|2011|      Delphi|    8|
|2011|        Bash|    7|
+----+------------+-----+

Top 10 Languages for 2012:
+----+------------+-----+
|Year|LanguageName|count|
+----+------------+-----+
|2012|         PHP|  154|
|2012|  JavaScript|  132|
|2012|        Java|  124|
|2012|      Python|   69|
|2012| 