In [1]:
import pyspark
import os

from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StructType, StructField, StringType, DateType

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.13.0 pyspark-shell'

app_name = "lab2"

sc = SparkSession.builder.appName(app_name).master("local[*]").getOrCreate()



In [2]:
data = sc.read.csv("programming-languages.csv")
languages = [str(x[0]) for x in data.collect()]
languages[0:10]

['name',
 'A# .NET',
 'A# (Axiom)',
 'A-0 System',
 'A+',
 'A++',
 'ABAP',
 'ABC',
 'ABC ALGOL',
 'ABSET']

In [3]:
posts_samples = sc.read.format("xml").options(rowTag="row").load('posts_sample.xml')
posts_samples.first()

Row(_AcceptedAnswerId=7, _AnswerCount=13, _Body="<p>I want to use a track-bar to change a form's opacity.</p>\n\n<p>This is my code:</p>\n\n<pre><code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code></pre>\n\n<p>When I build the application, it gives the following error:</p>\n\n<blockquote>\n  <p>Cannot implicitly convert type <code>'decimal'</code> to <code>'double'</code></p>\n</blockquote>\n\n<p>I tried using <code>trans</code> and <code>double</code> but then the control doesn't work. This code worked fine in a past VB.NET project.</p>\n", _ClosedDate=None, _CommentCount=2, _CommunityOwnedDate=datetime.datetime(2012, 10, 31, 16, 42, 47, 213000), _CreationDate=datetime.datetime(2008, 7, 31, 21, 42, 52, 667000), _FavoriteCount=48, _Id=4, _LastActivityDate=datetime.datetime(2019, 7, 19, 1, 39, 54, 173000), _LastEditDate=datetime.datetime(2019, 7, 19, 1, 39, 54, 173000), _LastEditorDisplayName='Rich B', _LastEditorUserId=3641067, _OwnerDisplayName=None, _OwnerUse

In [4]:
def isDateValid(x, year):
    start = datetime(year=year, month=1, day=1)
    end = datetime(year=year, month=12, day=31)
    CreationDate = x._CreationDate
    return CreationDate >= start and CreationDate <= end

def findLanguages(x):
    tag = None

    for language in languages:
        if "<" + language.lower() + ">" in x._Tags.lower():
            tag = language
            break

    if tag is None:
        return None

    return (x._Id, tag)

In [5]:
result = {}

for year in range(2010, 2020):
    result[year] = posts_samples.rdd\
        .filter(lambda x: x._Tags is not None and isDateValid(x, year))\
        .map(findLanguages)\
        .filter(lambda x: x is not None)\
        .keyBy(lambda x: x[1])\
        .aggregateByKey(
            0, lambda x, y: x + 1, lambda x1, x2: x1 + x2,)\
        .sortBy(lambda x: x[1], ascending=False)\
        .toDF()
    result[year] = result[year].select(col("_1").alias("Language"), col("_2").alias(f"Mentions_count_{year}")).limit(10)
    result[year].show()

+-----------+-------------------+
|   Language|Mentions_count_2010|
+-----------+-------------------+
|       Java|                 52|
| JavaScript|                 44|
|        PHP|                 42|
|     Python|                 25|
|Objective-C|                 22|
|          C|                 20|
|       Ruby|                 11|
|     Delphi|                  7|
|          R|                  3|
|       Bash|                  3|
+-----------+-------------------+

+-----------+-------------------+
|   Language|Mentions_count_2011|
+-----------+-------------------+
|        PHP|                 97|
|       Java|                 92|
| JavaScript|                 82|
|     Python|                 35|
|Objective-C|                 33|
|          C|                 24|
|       Ruby|                 17|
|     Delphi|                  8|
|       Perl|                  8|
|       Bash|                  7|
+-----------+-------------------+

+-----------+-------------------+
|   Language

In [6]:
for i in result.keys():
    result[i].write.format("parquet").save(f"{i}")