In [1]:
# Install PySpark and Spark NLP
! pip install -q pyspark==3.3.0 spark-nlp==4.2.8

In [2]:
import json
import pandas as pd
import numpy as np

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

In [3]:
# Start Spark through Spark NLP's helper. The returned object is kept in `spark`;
# its createDataFrame method is used below to build the frame the pipeline is fitted on.
spark = sparknlp.start()
# Optional sanity checks (uncomment to show the library version / the session object):
#print ("Spark NLP Version :", sparknlp.version())
#spark

In [4]:
# Stage 1: turn the raw "text" column into "document" annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into "token" annotations.
tokenizer = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

# Stage 3: pretrained "lemma" model for language code "am" (Amharic);
# reads the tokens and writes its output to the "lemma" column.
lemmatizer = (
    LemmatizerModel.pretrained("lemma", "am")
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

nlp_pipeline = Pipeline(stages=[document_assembler, tokenizer, lemmatizer])

# Fit on a one-row DataFrame holding an empty string in column "text", then
# wrap the fitted model in a LightPipeline so plain Python strings can be
# annotated directly (see the fullAnnotate calls further down).
empty_text_df = spark.createDataFrame([[""]]).toDF("text")
light_pipeline = LightPipeline(nlp_pipeline.fit(empty_text_df))


lemma download started this may take some time.
Approximate size to download 36 KB
[OK!]


In [5]:
# Example of how the lemmatizer works on a single sentence.
# fullAnnotate takes a list of strings and returns one dict per input; the
# displayed result contains 'document', 'token' and 'lemma' annotations.
example_sentences = ["መጽሐፉን መጽሐፍ ኡ ን አስያዛት አስያዝ ኧ ኣት ።"]
results = light_pipeline.fullAnnotate(example_sentences)
results

Before _validateStagesInputCols


[{'document': [Annotation(document, 0, 31, መጽሐፉን መጽሐፍ ኡ ን አስያዛት አስያዝ ኧ ኣት ።, {}, [])],
  'token': [Annotation(token, 0, 4, መጽሐፉን, {'sentence': '0'}, []),
   Annotation(token, 6, 9, መጽሐፍ, {'sentence': '0'}, []),
   Annotation(token, 11, 11, ኡ, {'sentence': '0'}, []),
   Annotation(token, 13, 13, ን, {'sentence': '0'}, []),
   Annotation(token, 15, 19, አስያዛት, {'sentence': '0'}, []),
   Annotation(token, 21, 24, አስያዝ, {'sentence': '0'}, []),
   Annotation(token, 26, 26, ኧ, {'sentence': '0'}, []),
   Annotation(token, 28, 29, ኣት, {'sentence': '0'}, []),
   Annotation(token, 31, 31, ።, {'sentence': '0'}, [])],
  'lemma': [Annotation(token, 0, 4, _, {'sentence': '0'}, []),
   Annotation(token, 6, 9, መጽሐፍ, {'sentence': '0'}, []),
   Annotation(token, 11, 11, ኡ, {'sentence': '0'}, []),
   Annotation(token, 13, 13, ን, {'sentence': '0'}, []),
   Annotation(token, 15, 19, _, {'sentence': '0'}, []),
   Annotation(token, 21, 24, አስያዝ, {'sentence': '0'}, []),
   Annotation(token, 26, 26, ኧ, {'sentenc

In [8]:
# Read the cleaned corpus file and split it into a list of text chunks on the
# '^^^^^^' separator. Reading and splitting in one expression means
# `cleaned_corp` is only ever bound to the list, never to the raw string.
with open('/content/Amharic_normalized_cleaned_text.txt', mode='r', encoding="utf-8") as corpus_file:
    cleaned_corp = corpus_file.read().split('^^^^^^')

In [10]:
# Annotate the first five corpus chunks and keep only the lemma strings:
# each chunk becomes one string of its lemmas joined by single spaces.
results = light_pipeline.fullAnnotate(cleaned_corp[:5])

lemmatized_text = [
    ' '.join(annotation.result for annotation in annotated['lemma'])
    for annotated in results
]



Before _validateStagesInputCols


In [11]:
# Write the lemmatized chunks to disk, one chunk per line.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("/content/amh_output_10.txt", "w", encoding="utf-8") as file:
    for text in lemmatized_text:
        file.write(text + '\n')

Let's see how the text looks before and after lemmatization.


In [23]:
# Take the same character window (indices 1 to 499) from chunk 1 of the
# original corpus and of the lemmatized output, for a before/after comparison.
# NOTE(review): the window starts at index 1, so character 0 of each string is
# skipped — confirm this is intentional.
preview_window = slice(1, 500)
initial_text = cleaned_corp[1][preview_window]
final_text = lemmatized_text[1][preview_window]

The text still contains some underscores (`_`). I believe they come from the lemmatizer, which appears to output `_` instead of a lemma for some tokens. I will remove them before embedding.

In [24]:
# Label the output, then show the excerpt of the original (pre-lemmatization)
# text; the bare expression on the last line is rendered as the cell's output.
print('Before:')
initial_text

Before:


'ምህርት የገባብ ክፍል ቆሮግቶስ በቁጥር ላይ እኀፂህ እግፂሆኀግልሀ ይህሀ አስፅፍምቅ ቅ ሲል ሞኀ ማለቱ ነው ጥናቄ በቁጥር ላይ ፋትምክህት የሚለው ምኑገ ነው ጥዩቄ በቁጥር መሰረት ሀዋርደው ወገሀጌልኀ ይሰብክ የነበረው በፈቃፅ ነበር ለፈቃፅሀ በቁጥር ላይ ወኀግጌስሀ የመስበክ ፀመወዙ ምገኀፅነው ይላል ቁጥር ክላይ ክዘረዘረቸው መብቶች በአጎገፁም እገኳ ልተጠቀመ እግፀሆነ ይመሰክረሪለል በዚህ ገዜ በአእምሮው ዋመጣበት የቆሮሀቶስ ክርስቲፎኖች ይህጎገ አሁሀ የሚናገረው ፀመወዝ ፈልጐ ነው ይሉኝ ይሆናለ የሚለው ግምት ነበር ይህም እኀፃለሆነ በክባፅ አነጋገር ዌሪጋግጥላቸዋል ማም ትምክህቴሀ ክሀቱ ከሚፀርግብኝ ሞት ይሻለናልና ትምክህቱ ወግጌለኘሀ በነአ መስበኩ ነው ግሀ አሁሀ ፃሳቡኘ ቀይሮ ፀመጩዝ መቀበስ ቢጃምር ትምክህቱ ይቀረል ሮ ትምክህቱ የትሰቢት ሳይሆገሀ ፀመወዝ በመቀበለ ሊመጣ የላነበረው'

In [25]:
# Label the output, then show the matching excerpt of the lemmatized text;
# the bare expression on the last line is rendered as the cell's output.
print('After:')
final_text

After:


'ምህርት የገባብ ክፍል ቆሮግቶስ በቁጥር ላይ እኀፂህ እግፂሆኀግልሀ ይህሀ አስፅፍምቅ ቅ ሲል ሞኀ ማለቱ _ ጥናቄ በቁጥር ላይ ፋትምክህት የሚለው ምኑገ _ ጥዩቄ በቁጥር መሰረት ሀዋርደው ወገሀጌልኀ ይሰብክ _ በፈቃፅ ነበር ለፈቃፅሀ በቁጥር ላይ ወኀግጌስሀ የመስበክ ፀመወዙ ምገኀፅነው _ ቁጥር ክላይ ክዘረዘረቸው መብቶች በአጎገፁም እገኳ ልተጠቀመ እግፀሆነ ይመሰክረሪለል _ ገዜ በአእምሮው ዋመጣበት የቆሮሀቶስ ክርስቲፎኖች ይህጎገ አሁሀ የሚናገረው ፀመወዝ ፈልጐ _ ይሉኝ ይሆናለ የሚለው ግምት ነበር ይህም እኀፃለሆነ በክባፅ አነጋገር ዌሪጋግጥላቸዋል ማም ትምክህቴሀ ክሀቱ ከሚፀርግብኝ ሞት ይሻለናልና ትምክህቱ ወግጌለኘሀ በነአ መስበኩ _ ግሀ አሁሀ ፃሳቡኘ ቀይሮ ፀመጩዝ መቀበስ ቢጃምር ትምክህቱ ይቀረል ሮ ትምክህቱ የትሰቢት ሳይሆገሀ ፀመወዝ በመቀበለ _ የላነበረውኀ ፃወገግጌልገ እኀቅፋ'