# Mining words from Wikipedia

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import array_contains

spark = SparkSession \
    .builder \
    .appName("Analysing Wikipedia") \
    .getOrCreate()

In [3]:
df = spark.read.json("./nowiki-20210111-cirrussearch-general.json")

## Cleaning the dataset

Looking at the schema just to explore the dataset. Found [a description of the JSON dump format on Wikipedia](https://meta.wikimedia.org/wiki/Data_dumps/Misc_dumps_format)

In [4]:
df.printSchema()

root
 |-- auxiliary_text: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- content_model: string (nullable = true)
 |-- coordinates: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- coord: struct (nullable = true)
 |    |    |    |-- lat: double (nullable = true)
 |    |    |    |-- lon: double (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- dim: long (nullable = true)
 |    |    |-- globe: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- primary: boolean (nullable = true)
 |    |    |-- region: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- create_timestamp: string (nullable = true)
 |-- defaultsort: string (nullable = true)
 |-- display_title: string (nullable = true)
 |-- external_link: array (nullable = true)
 |    |-- element: strin

### Find columns to filter on

In [5]:
# We are only interested in "wikitext"
df.select("content_model").distinct().show()

+-------------+
|content_model|
+-------------+
|   flow-board|
|    Scribunto|
|         null|
|         json|
|sanitized-css|
|     wikitext|
|   javascript|
|          css|
+-------------+



In [6]:
# We are only interested in nb
df.select("language").distinct().show()

+--------+
|language|
+--------+
|      en|
|      nb|
|    null|
|      de|
|      nn|
|      sv|
|      nl|
|      se|
|      da|
+--------+



In [27]:
# About namespaces https://en.wikipedia.org/wiki/Wikipedia:Namespace
# Articles have no namespace (no prefix), so we are interested in "null"
df.select("namespace_text").distinct().show(21)

+-------------------+
|     namespace_text|
+-------------------+
|Wikipedia-diskusjon|
|    Brukerdiskusjon|
|               null|
|                Mal|
|       Fildiskusjon|
|MediaWiki-diskusjon|
|          Wikipedia|
|             Bruker|
|  Kategoridiskusjon|
|                Sak|
|          MediaWiki|
|    Portaldiskusjon|
|     Moduldiskusjon|
|          Diskusjon|
|       Maldiskusjon|
|           Kategori|
|     Hjelpdiskusjon|
|                Fil|
|              Modul|
|             Portal|
|              Hjelp|
+-------------------+



In [49]:
filtered_df = df.filter( \
    # (df["content_model"] == "wikitext") & \
    # (df["language"] == "nb") & \
    (df["namespace_text"].isNull())) \
    .drop("content_model", "language", "category", "coordinates", "defaultsort", \
        "external_link", "heading", "incoming_links", "namespace", "namespace_text", \
        "outgoing_link", "redirect", "text_bytes", "template", "wiki", \
        "wikibase_item", "version_type", "file_bits", "file_height", "file_media_type", \
        "file_resolution", "file_size", "file_text", "file_width", "index", \
        "file_mime", "ores_articletopic", "ores_articletopics")

In [51]:
filtered_df.show()

+--------------+----------------+-------------+------------+-----------+----+---------+-----+-------+
|auxiliary_text|create_timestamp|display_title|opening_text|source_text|text|timestamp|title|version|
+--------------+----------------+-------------+------------+-----------+----+---------+-----+-------+
|          null|            null|         null|        null|       null|null|     null| null|   null|
|          null|            null|         null|        null|       null|null|     null| null|   null|
|          null|            null|         null|        null|       null|null|     null| null|   null|
|          null|            null|         null|        null|       null|null|     null| null|   null|
|          null|            null|         null|        null|       null|null|     null| null|   null|
|          null|            null|         null|        null|       null|null|     null| null|   null|
|          null|            null|         null|        null|       null|null|     

In [54]:
df.where(df["namespace"] == 0).show()

+--------------+--------+-------------+-----------+----------------+-----------+-------------+-------------+---------+-----------+---------------+---------+---------------+---------+---------+----------+-------+--------------+-----+--------+---------+--------------+------------+-----------------+------------------+-------------+--------+-----------+--------+----+----------+---------+-----+-------+------------+----+-------------+
|auxiliary_text|category|content_model|coordinates|create_timestamp|defaultsort|display_title|external_link|file_bits|file_height|file_media_type|file_mime|file_resolution|file_size|file_text|file_width|heading|incoming_links|index|language|namespace|namespace_text|opening_text|ores_articletopic|ores_articletopics|outgoing_link|redirect|source_text|template|text|text_bytes|timestamp|title|version|version_type|wiki|wikibase_item|
+--------------+--------+-------------+-----------+----------------+-----------+-------------+-------------+---------+-----------+----