## Content Language Detection

### PySpark Setup

In [None]:
# This is only to setup PySpark and Spark NLP on Colab
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2022-11-11 16:39:10--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2022-11-11 16:39:10--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2022-11-11 16:39:11--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:44

In [None]:
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

# Start the Spark session through Spark NLP's start() helper; later cells
# use this `spark` handle to build DataFrames.
spark = sparknlp.start()

# Print the library versions in use so the run can be reproduced.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

Spark NLP version 4.2.3
Apache Spark version: 3.2.1


In [None]:
# Fetch the pre-trained pipeline, identified by its name and language code.
language_detector_pipeline = PretrainedPipeline('detect_language_21', lang='xx')

# German sample sentence. LanguageDetectorDL works best on text longer than
# 140 characters; how much it matters depends on the language (i.e. on how
# similar its characters are to those of other languages).
german_sample_text = "OpenEdu.ch ist eine 2020 von Wikimedia CH gegründete Plattform zur Unterstützung, Verbreitung und Förderung von Projekten, Ausbildungstools und News aus der Welt der offenen Bildung."

# Last expression of the cell, so the annotation dict is displayed as output.
language_detector_pipeline.annotate(german_sample_text)

detect_language_21 download started this may take some time.
Approx size to download 7.7 MB
[OK!]


{'document': ['OpenEdu.ch ist eine 2020 von Wikimedia CH gegründete Plattform zur Unterstützung, Verbreitung und Förderung von Projekten, Ausbildungstools und News aus der Welt der offenen Bildung.'],
 'sentence': ['OpenEdu.ch ist eine 2020 von Wikimedia CH gegründete Plattform zur Unterstützung, Verbreitung und Förderung von Projekten, Ausbildungstools und News aus der Welt der offenen Bildung.'],
 'language': ['de']}

### LanguageDetectorDL

In [None]:
from sparknlp.base import *
from sparknlp.annotator import *

In [None]:
# Stage 1: read the raw "text" column and emit it as the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained language detector that reads "document" and writes its
# prediction to "lang", configured with a 0.8 threshold and sentence coalescing.
language_detector = (
    LanguageDetectorDL.pretrained("ld_wiki_tatoeba_cnn_21")
    .setInputCols(["document"])
    .setOutputCol("lang")
    .setThreshold(0.8)
    .setCoalesceSentences(True)
)

# Chain both stages, in order, into a single pipeline.
languagePipeline = Pipeline(stages=[documentAssembler, language_detector])

ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[OK!]


### Tests for English and Italian Languages

In [None]:
# One single-column row per sample: an English sentence and an Italian one.
sample_rows = [
    ['OpenEdu is a platform launched in 2020 by wikimedia CH to support, disseminate and promote projects, training tools and news from the world of open education.'],
    ['OpenEdu.ch.ch è una piattaforma lanciata nel 2020 da wikimedia CH per supportare, diffondere e promuovere progetti, strumenti di formazione e novità dal mondo dell istruzione aperta.'],
]

# Name the only column "text", which is the column the DocumentAssembler reads.
test_df = spark.createDataFrame(sample_rows).toDF("text")

# Fit the pipeline on the samples, then run it over the same rows.
fitted_pipeline = languagePipeline.fit(test_df)
results = fitted_pipeline.transform(test_df)

In [None]:
# Display only the detected language code(s) for each input row.
predicted_langs = results.select("lang.result")
predicted_langs.show()

+------+
|result|
+------+
|  [en]|
|  [it]|
+------+



In [None]:
# The metadata field holds the probabilities for the other languages;
# print the first 2 rows without truncating the (very wide) column.
results.select("lang.metadata").show(n=2, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|metadata                                                                                                                                                                                                                                                                                                                                                                                                                                |
+-------------------------------------------------------------------------------------------------------------------------------------------------