<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/tim-updates/notebooks/pipelines/final-competition-pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install pinned PySpark and Spark NLP versions using the interpreter that
# runs this kernel (sys.executable), so the packages land in the same
# environment the notebook imports from. `-q` keeps pip output short.
import sys

!{sys.executable} -m pip install pyspark==3.1.2 -q
!{sys.executable} -m pip install spark-nlp==4.2.0 -q

[K     |████████████████████████████████| 212.4 MB 63 kB/s 
[K     |████████████████████████████████| 198 kB 20.1 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 641 kB 4.2 MB/s 
[?25h

In [2]:
from google.colab import drive
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from sparknlp.base import DocumentAssembler, EmbeddingsFinisher
from sparknlp.annotator import SentenceDetector, Tokenizer, Normalizer, Lemmatizer
import sparknlp
from pyspark.sql.types import StringType, ArrayType, FloatType, StructType
import numpy as np
import string
import pandas as pd

In [8]:
# Mount Google Drive; the model file and the submission CSV used further
# down live under /content/drive/MyDrive.
drive.mount('/content/drive')

# Fetch the train/test CSVs, the lemma dictionary, and the readability word
# lists. No output path is given, so wget saves each file into the current
# working directory; later cells read them from /content and "./".
!wget https://raw.githubusercontent.com/Tstrebe2/predicting-text-difficulty/main/assets/WikiLarge_Train.csv -q
!wget https://raw.githubusercontent.com/Tstrebe2/predicting-text-difficulty/main/assets/WikiLarge_Test.csv -q
!wget https://raw.githubusercontent.com/mahavivo/vocabulary/master/lemmas/AntBNC_lemmas_ver_001.txt -q
!wget https://raw.githubusercontent.com/Tstrebe2/predicting-text-difficulty/main/assets/dale_chall.txt -q
!wget https://raw.githubusercontent.com/Tstrebe2/predicting-text-difficulty/main/assets/AoA_51715_words.csv -q
!wget https://raw.githubusercontent.com/Tstrebe2/predicting-text-difficulty/main/assets/Concreteness_ratings_Brysbaert_et_al_BRM.txt -q
# GloVe download is disabled; nothing in the visible cells reads it.
# !wget http://nlp.stanford.edu/data/glove.6B.zip -q
# !unzip glove*.zip

In [4]:
# Create (or reuse) a local Spark session named "Colab" that requests the
# Spark NLP 4.2.0 package and serves its UI on port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.0")
    .getOrCreate()
)

## Readability Datasets (Provided by MADS)

In [5]:
def get_readability_datasets(data_dir='/content'):
  """Load the word-level readability lookups used to build features.

  Args:
    data_dir: Directory containing ``AoA_51715_words.csv``,
      ``Concreteness_ratings_Brysbaert_et_al_BRM.txt`` and
      ``dale_chall.txt``. Defaults to ``/content``, the location this
      notebook has always read them from.

  Returns:
    A tuple ``(aoa, conc, d_chall)``:
      * ``aoa``: dict mapping a lemma (``Lemma_highest_PoS``) to the first
        non-missing ``AoA_Kup_lem`` value listed for it.
      * ``conc``: dict mapping a word (``str``) or, for rows flagged as
        bigrams, a tuple of its space-separated words to ``Conc.M``.
      * ``d_chall``: set of the words listed in ``dale_chall.txt``.

    Entries without a word or without a rating are left out, so a missing
    rating behaves like an unknown word instead of injecting NaN into the
    downstream per-sentence aggregates.
  """
  aoa_frame = pd.read_csv(
      f'{data_dir}/AoA_51715_words.csv',
      encoding_errors='ignore',
      usecols=['Lemma_highest_PoS', 'AoA_Kup_lem'],
  ).rename(columns={'Lemma_highest_PoS': 'lemma', 'AoA_Kup_lem': 'aoa'})

  # Several surface forms can share one lemma; keep the first rated row.
  aoa = (aoa_frame
         .dropna(subset=['lemma', 'aoa'])
         .groupby('lemma')['aoa']
         .first()
         .to_dict())

  conc_frame = pd.read_csv(
      f'{data_dir}/Concreteness_ratings_Brysbaert_et_al_BRM.txt',
      sep='\t',
      usecols=['Word', 'Bigram', 'Conc.M'],
  ).dropna(subset=['Word', 'Conc.M'])

  # Bigram rows are keyed by a tuple of words so callers can look up adjacent
  # token pairs directly; every other row is keyed by the word itself.
  # Columns are selected explicitly because `usecols` keeps file order.
  conc = {
      (word if is_bigram == 0 else tuple(word.split(' '))): rating
      for word, is_bigram, rating in conc_frame[['Word', 'Bigram', 'Conc.M']]
      .itertuples(index=False, name=None)
  }

  d_chall = set(pd.read_csv(f'{data_dir}/dale_chall.txt', names=['word'])['word'])
  return aoa, conc, d_chall

## Core Datasets
* Standardize unit abbreviations (e.g. km, mph, °C, %) into full words
* Clean punctuation
* Lower-case
* Lemmatize
* Create readability feature representations using MADS provided datasets


In [6]:
def get_clean_data_frame(data_path):
  """Build the per-sentence readability feature table for one CSV file.

  The text is cleaned (unit abbreviations spelled out), tokenized,
  lower-cased, stripped of punctuation and lemmatized with Spark NLP; the
  lemmas are then scored against the lookups returned by
  ``get_readability_datasets()``.

  Args:
    data_path: Path of a CSV file with a header row and ``original_text``
      and ``label`` columns, readable by the global ``spark`` session.

  Returns:
    A pandas DataFrame with the columns ``original_text``,
    ``lemmatized_text``, ``d_chall_score``, ``aoa_mean``, ``aoa_min``,
    ``aoa_max``, ``conc_rating_mean``, ``conc_rating_min``,
    ``conc_rating_max``, ``num_lemmas`` and ``label``.

  Side effects:
    Registers the UDFs ``get_d_chall``, ``get_aoa``, ``get_conc_rating``,
    ``get_num_lemmas``, ``get_joined_text`` and ``array_mean`` on the global
    ``spark`` session and replaces the temp view ``wiki``.
  """
  aoa, conc, d_chall = get_readability_datasets()

  df = spark.read.csv(data_path, header=True).select('original_text', 'label')

  # (pattern, replacement) pairs, applied in order. Replacements keep the
  # surrounding spaces so the expanded unit is never glued to its neighbours,
  # and the numeric form keeps the number via the `$1` group reference.
  unit_replacements = (
      (r" km ", " kilometers "),
      (r"([0-9]+)km ", "$1 kilometers "),
      (r" mph ", " miles per hour "),
      (r" ?° ?C ", " degrees celsius "),
      (r" ?° ?F ", " degrees fahrenheit "),
      (r" ?°", " degrees"),
      (r" %", " percent"),
      (r" cm\b", " centimeters"),  # \b: do not rewrite words that start with "cm"
      (r" kg ", " kilograms "),
  )

  # Column API instead of string-built SQL: patterns reach the regex engine
  # without going through SQL string-literal escaping.
  for pattern, replacement in unit_replacements:
    df = df.withColumn('original_text',
                       F.regexp_replace('original_text', pattern, replacement))

  documentAssembler = DocumentAssembler()\
      .setInputCol("original_text")\
      .setOutputCol("document")

  tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")

  normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized") \
      .setLowercase(True) \
      .setCleanupPatterns([r"[^\w\d\s]"]) # remove punctuations (keep alphanumeric chars)
  # if we don't set CleanupPatterns, it will only keep alphabet letters ([^A-Za-z])

  lemmatizer = Lemmatizer() \
      .setInputCols(["normalized"]) \
      .setOutputCol("lemma") \
      .setDictionary("./AntBNC_lemmas_ver_001.txt", value_delimiter ="\t", key_delimiter = "->")

  nlp_model = Pipeline(stages=[documentAssembler,
                               tokenizer,
                               normalizer,
                               lemmatizer,]).fit(df)
  df = nlp_model.transform(df)
  df.createOrReplaceTempView('wiki')

  punctuation_chars = set(string.punctuation)

  def is_punctuation(token):
    # True when every character is punctuation. all() is also True for the
    # empty string, so empty tokens are skipped as well.
    return all(ch in punctuation_chars for ch in token)

  def get_d_chall(x):
    """Dale-Chall-style score computed from the lemmas of one text.

    0.1579 * (percent of words not in the Dale-Chall list)
      + 0.0496 * (number of words); 0.0 when the text has no words.
    """
    words = [token.lower() for token in x if not is_punctuation(token)]
    word_count = len(words)
    if word_count == 0:
      return 0.0

    difficult_count = sum(1 for word in words if word not in d_chall)
    return 0.1579 * (difficult_count / word_count * 100) + 0.0496 * word_count

  def get_aoa(x):
    """Age-of-acquisition ratings of the known lemmas; [0.0] if none is known."""
    ratings = [aoa[w.lower()] for w in x if w.lower() in aoa]
    return ratings if ratings else [0.0]

  def get_conc_rating(x):
    """Concreteness ratings found in the text, preferring bigram entries.

    Walks the tokens left to right. When the current token and its successor
    form a rated bigram, that rating is taken and both tokens are consumed;
    otherwise the current token is looked up on its own. Every token,
    including the last one, is considered.
    """
    tokens = [token.lower() for token in x]
    ratings = []
    i = 0

    while i < len(tokens):
      pair = tuple(tokens[i:i + 2])
      if len(pair) == 2 and pair in conc:
        ratings.append(conc[pair])
        i += 2
        continue

      if tokens[i] in conc:
        ratings.append(conc[tokens[i]])
      i += 1

    return ratings

  def get_num_lemmas(x):
    """Number of non-punctuation tokens, as a float."""
    return float(sum(1 for token in x if not is_punctuation(token)))

  def array_mean(x):
    # None (SQL NULL) for a missing or empty array instead of np.mean([]),
    # which yields NaN and emits a RuntimeWarning.
    return float(np.mean(x)) if x else None

  spark.udf.register('get_d_chall', get_d_chall, FloatType())
  spark.udf.register('get_aoa', get_aoa, ArrayType(FloatType()))
  spark.udf.register('get_conc_rating', get_conc_rating, ArrayType(FloatType()))
  spark.udf.register('get_num_lemmas', get_num_lemmas, FloatType())
  spark.udf.register('get_joined_text', lambda x: ' '.join(x), StringType())
  spark.udf.register('array_mean', array_mean, FloatType())

  # Stage 1: per-text scores plus the raw rating arrays.
  query = r"""
  SELECT 
    original_text, 
    get_joined_text(lemma.result) as lemmatized_text, 
    get_d_chall(lemma.result) as d_chall_score,
    get_aoa(lemma.result) as aoa,
    get_conc_rating(lemma.result) as conc_rating,
    get_num_lemmas(lemma.result) as num_lemmas,
    label
  FROM wiki;
  """
  df = spark.sql(query)
  df.createOrReplaceTempView('wiki')

  # Stage 2: collapse each rating array into mean / min / max.
  query = r"""
  SELECT 
  original_text, 
  lemmatized_text,
  d_chall_score,
  array_mean(aoa) as aoa_mean, 
  array_min(aoa) as aoa_min, 
  array_max(aoa) as aoa_max, 
  array_mean(conc_rating) as conc_rating_mean, 
  array_min(conc_rating) as conc_rating_min, 
  array_max(conc_rating) as conc_rating_max,
  num_lemmas,
  label
  FROM wiki; 
  """
  df = spark.sql(query)
  df.createOrReplaceTempView('wiki')
  return df.toPandas()

In [9]:
# Build the feature table for the training split; the cell then displays
# its (rows, columns) shape.
df_train = get_clean_data_frame(data_path='/content/WikiLarge_Train.csv')
df_train.shape

(416768, 11)

In [10]:
# Build the same feature table for the test split; the cell then displays
# its (rows, columns) shape.
df_test = get_clean_data_frame(data_path='/content/WikiLarge_Test.csv')
df_test.shape

(119092, 11)

In [12]:
import joblib

# Load the saved estimator from the mounted Drive.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
clf = joblib.load('/content/drive/MyDrive/milestone-ii/Models/svc-model_final.joblib')

In [13]:
# Re-fit the loaded estimator on the full training feature table.
# NOTE(review): the whole frame, including its `label` column, is passed as
# X — presumably the estimator selects its own input columns; confirm the
# label cannot leak into the features.
clf = clf.fit(df_train, df_train.label)

In [14]:
# Predict a label for every row of the test feature table.
y_hat = clf.predict(df_test)

In [18]:
# One row per prediction: `id` is the row position, `label` the predicted
# class. `columns=` is required here — a bare mapping passed to rename()
# targets the row index and would leave the id column named 'index'.
submission = (
    pd.DataFrame(y_hat, columns=['label'])
    .reset_index()
    .rename(columns={'index': 'id'})
)

In [19]:
# Write the submission to Drive; index=False leaves out the pandas row index.
submission.to_csv('/content/drive/MyDrive/milestone-ii/submission.csv', index=False)