# Mining words from Wikipedia

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Analysing Wikipedia") \
    .getOrCreate()

In [2]:
df = spark.read.json("./nowiki-20210111-cirrussearch-general.json")

## Cleaning the dataset

Looking at the schema just to explore the dataset. Found [a description of the JSON dump format on Wikipedia](https://meta.wikimedia.org/wiki/Data_dumps/Misc_dumps_format)

In [7]:
df.printSchema()

root
 |-- auxiliary_text: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- content_model: string (nullable = true)
 |-- coordinates: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- coord: struct (nullable = true)
 |    |    |    |-- lat: double (nullable = true)
 |    |    |    |-- lon: double (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- dim: long (nullable = true)
 |    |    |-- globe: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- primary: boolean (nullable = true)
 |    |    |-- region: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- create_timestamp: string (nullable = true)
 |-- defaultsort: string (nullable = true)
 |-- display_title: string (nullable = true)
 |-- external_link: array (nullable = true)
 |    |-- element: strin

### Find columns to filter on

In [23]:
# We are only interested in "wikitext"
df.select("content_model").distinct().show()

+-------------+
|content_model|
+-------------+
|   flow-board|
|    Scribunto|
|         null|
|         json|
|sanitized-css|
|     wikitext|
|   javascript|
|          css|
+-------------+



In [26]:
# We are only interested in nb
df.select("language").distinct().show()

+--------+
|language|
+--------+
|      en|
|      nb|
|    null|
|      de|
|      nn|
|      sv|
|      nl|
|      se|
|      da|
+--------+



In [12]:
# We are only insterested in Wikipedia
df.select("namespace_text").distinct().show(21)

+-------------------+
|     namespace_text|
+-------------------+
|Wikipedia-diskusjon|
|    Brukerdiskusjon|
|               null|
|                Mal|
|       Fildiskusjon|
|MediaWiki-diskusjon|
|          Wikipedia|
|             Bruker|
|  Kategoridiskusjon|
|                Sak|
|          MediaWiki|
|    Portaldiskusjon|
|     Moduldiskusjon|
|          Diskusjon|
|       Maldiskusjon|
|           Kategori|
|     Hjelpdiskusjon|
|                Fil|
|              Modul|
|             Portal|
|              Hjelp|
+-------------------+



In [23]:
filtered_df = df.filter( \
    (df["content_model"] == "wikitext") & \
    (df["language"] == "nb") & \
    (df["namespace_text"] == "Wikipedia")) \
    .drop("content_model", "language", "category", "coordinates", "defaultsort", \
        "external_link", "heading", "incoming_links", "namespace", "namespace_text", \
        "outgoing_link", "redirect", "text_bytes", "template", "wiki", \
        "wikibase_item", "version_type", "file_bits", "file_height", "file_media_type", \
        "file_resolution", "file_size", "file_text", "file_width", "index", \
        "file_mime", "ores_articletopic", "ores_articletopics")

In [24]:
filtered_df.show()

+--------------------+--------------------+-------------+------------+--------------------+--------------------+--------------------+--------------------+--------+
|      auxiliary_text|    create_timestamp|display_title|opening_text|         source_text|                text|           timestamp|               title| version|
+--------------------+--------------------+-------------+------------+--------------------+--------------------+--------------------+--------------------+--------+
|[Se også: Wikiped...|2012-07-14T19:11:08Z|         null|        null|<noinclude>
{| wi...|                    |2016-11-01T19:38:14Z| Hva er en artikkel?|16865915|
|                  []|2012-08-09T14:23:13Z|         null|        null|<noinclude>
===[[...|Referanseløs stub...|2012-08-10T08:27:01Z|Sletting/Bobby Bo...|10999240|
|[Det er selvsagt ...|2013-11-27T20:04:38Z|         null|        null|<noinclude>
===[[...|Nominasjonen oven...|2013-12-03T13:40:28Z|Sletting/Mattangr...|12804653|
|               

In [32]:
df.filter(lambda x: x.array_contains("Kategori:Sletting")).count()

TypeError: condition should be string or Column