# Mining words from Wikipedia

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (getOrCreate reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("Analysing Wikipedia")
    .getOrCreate()
)

In [2]:
# Load the JSON dump into a DataFrame. The path is relative to the notebook's
# working directory; the file name suggests a CirrusSearch dump of nowiki from 2021-01-11.
df = spark.read.json("./nowiki-20210111-cirrussearch-general.json")

## Cleaning the dataset

We start by looking at the schema to explore the dataset. The JSON dump format is described [on Wikimedia Meta-Wiki](https://meta.wikimedia.org/wiki/Data_dumps/Misc_dumps_format).

In [7]:
# Print the DataFrame's schema as a tree to see which columns are available.
df.printSchema()

root
 |-- auxiliary_text: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- content_model: string (nullable = true)
 |-- coordinates: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- coord: struct (nullable = true)
 |    |    |    |-- lat: double (nullable = true)
 |    |    |    |-- lon: double (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- dim: long (nullable = true)
 |    |    |-- globe: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- primary: boolean (nullable = true)
 |    |    |-- region: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- create_timestamp: string (nullable = true)
 |-- defaultsort: string (nullable = true)
 |-- display_title: string (nullable = true)
 |-- external_link: array (nullable = true)
 |    |-- element: strin

### Find columns to filter on

In [23]:
# List the distinct content models present in the dump.
# Only "wikitext" pages are of interest for this analysis.
df.select("content_model").distinct().show()

+-------------+
|content_model|
+-------------+
|   flow-board|
|    Scribunto|
|         null|
|         json|
|sanitized-css|
|     wikitext|
|   javascript|
|          css|
+-------------+



In [26]:
# List the distinct page languages present in the dump.
# Only pages whose language is "nb" are of interest for this analysis.
df.select("language").distinct().show()

+--------+
|language|
+--------+
|      en|
|      nb|
|    null|
|      de|
|      nn|
|      sv|
|      nl|
|      se|
|      da|
+--------+



In [35]:
# Peek at the "heading" column; show() prints only the first 20 rows here.
df.select("heading").show()

+--------------------+
|             heading|
+--------------------+
|                null|
|                  []|
|                null|
|                  []|
|                null|
|                  []|
|                null|
|[Endret eksterne ...|
|                null|
|                  []|
|                null|
|                  []|
|                null|
|                  []|
|                null|
|        [Velkommen!]|
|                null|
|                  []|
|                null|
|[Endret eksterne ...|
+--------------------+
only showing top 20 rows



In [32]:
# Keep only rows that are wikitext pages with language "nb".
# The call's parentheses already permit line breaks, so no backslashes are needed.
filtered_df = df.filter(
    (df["content_model"] == "wikitext")
    & (df["language"] == "nb")
)

In [20]:
# Show the titles of the filtered pages without truncating long values.
# NOTE(review): the saved output below still shows titles cut off with "...",
# so it appears to come from an earlier run without truncate=False; re-run to refresh.
filtered_df.select("title").show(truncate=False)

+--------------------+
|               title|
+--------------------+
|    Franske musikere|
|          Hvalfisken|
|     Greske gudinner|
|    Gomez' hamburger|
|Greske mytologisk...|
|           Slørtåken|
|Personer fra Hama...|
|           Fisflicka|
| Personer fra Aomori|
|          Nya Varvet|
|Bidrag til Eurovi...|
|        Wasim~nowiki|
| Grever av Barcelona|
|Hans Hansen Grime...|
| Fotballag i Kamerun|
|Barclay James Har...|
|           Databaser|
|Bodø/Utvalgt bodø...|
|     Pasiphaëgruppen|
|        31.45.40.230|
+--------------------+
only showing top 20 rows

