# Identify the language of user reviews
---

Using Apache Spark and John Snow Labs' Spark NLP resources

## Setup

In [1]:
import os

# Install java
# A JDK is installed before pyspark / spark-nlp are used further down.
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

# Set environment
# JAVA_HOME is presumably the install location of the openjdk-8 package
# above (verify on a non-amd64 runtime). Its bin/ directory is prepended to
# PATH; note that re-running this cell prepends it again each time.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# Install packages
# Both versions are pinned; --ignore-installed reinstalls them even when a
# different version is already present in the environment.
! pip install --ignore-installed pyspark==2.4.4
! pip install --ignore-installed spark-nlp==2.6.3

Processing /root/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471/pyspark-2.4.4-py2.py3-none-any.whl
Collecting py4j==0.10.7
  Using cached https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.4
Collecting spark-nlp==2.6.3
[?25l  Downloading https://files.pythonhosted.org/packages/84/84/3f15673db521fbc4e8e0ec3677a019ba1458b2cb70f0f7738c221511ef32/spark_nlp-2.6.3-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 6.0MB/s 
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-2.6.3


### Clean data

In [10]:
import pandas as pd

# Prepare review data
def clean_data(path='reviews.csv.gz'):
    """Load the review comments and normalise them to single-line strings.

    Parameters
    ----------
    path : str, optional
        Location of the reviews CSV (anything ``pandas.read_csv`` accepts).
        The file must contain a ``comments`` column. Defaults to
        ``'reviews.csv.gz'``.

    Returns
    -------
    pandas.Series
        The ``comments`` column with missing values replaced by ``''`` and
        every ``\\n`` / ``\\r`` character removed.
    """
    df = pd.read_csv(path)
    # Build a new Series rather than calling fillna(inplace=True) on the
    # attribute-accessed column: under pandas copy-on-write that call only
    # touches a temporary, leaving NaN floats that break the string cleanup.
    # astype(str) additionally guards against non-string cell values.
    comments = df['comments'].fillna('').astype(str)
    for line_break in ('\n', '\r'):
        comments = comments.str.replace(line_break, '', regex=False)
    return comments

# Load the cleaned review texts once; the Spark DataFrame in the
# "Load data" cell is built from this module-level pandas Series.
comments = clean_data()

## Load data

In [12]:
import sparknlp
from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from pyspark.sql.types import StringType


# Launch a Spark session via the Spark NLP helper
spark = sparknlp.start()

# One string per review; with a bare StringType schema the single column
# is named "value" (see the preview below).
dfSpark = spark.createDataFrame(comments, schema=StringType())
dfSpark.show(n=5)

+--------------------+
|               value|
+--------------------+
|This apartment is...|
|we had a really g...|
|Staying in Max ap...|
|In general very g...|
|The apt was nice ...|
+--------------------+
only showing top 5 rows



## Prepare documents for classifier

In [15]:
# Turn the raw "value" strings into Spark NLP document annotations,
# which is the input column the language detector is configured to read.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("value")
    .setOutputCol("document")
)

assembled = documentAssembler.transform(dfSpark)

# Preview the raw text next to its assembled annotation
assembled.select('value', 'document').show(n=10)

+--------------------+--------------------+
|               value|            document|
+--------------------+--------------------+
|This apartment is...|[[document, 0, 39...|
|we had a really g...|[[document, 0, 32...|
|Staying in Max ap...|[[document, 0, 38...|
|In general very g...|[[document, 0, 52...|
|The apt was nice ...|[[document, 0, 11...|
|At Copacabana apa...|[[document, 0, 93...|
|A great apartment...|[[document, 0, 16...|
|Apartment is exac...|[[document, 0, 32...|
|Great apartment, ...|[[document, 0, 31...|
|Max was very orga...|[[document, 0, 39...|
+--------------------+--------------------+
only showing top 10 rows



## Detect language

In [17]:
# Detect language
# `pretrained` is a static loader, so call it on the class directly instead
# of first constructing an annotator instance that is immediately discarded.
detectLanguage = (
    LanguageDetectorDL.pretrained('ld_wiki_20', lang='xx')
    .setInputCols(['document'])
    .setOutputCol('language')
    .setCoalesceSentences(False)
    # NOTE(review): 0.3 is presumably the minimum score for accepting a
    # prediction - confirm against the Spark NLP docs before tuning.
    .setThreshold(.3)
)

# Adds a "language" annotation column next to the assembled documents
language = detectLanguage.transform(assembled)
language.select(['value', 'language']).show(10)

ld_wiki_20 download started this may take some time.
Approximate size to download 3 MB
[OK!]
+--------------------+--------------------+
|               value|            language|
+--------------------+--------------------+
|This apartment is...|[[language, 0, 39...|
|we had a really g...|[[language, 0, 32...|
|Staying in Max ap...|[[language, 0, 38...|
|In general very g...|[[language, 0, 52...|
|The apt was nice ...|[[language, 0, 11...|
|At Copacabana apa...|[[language, 0, 93...|
|A great apartment...|[[language, 0, 16...|
|Apartment is exac...|[[language, 0, 32...|
|Great apartment, ...|[[language, 0, 31...|
|Max was very orga...|[[language, 0, 39...|
+--------------------+--------------------+
only showing top 10 rows



### Inspect

In [18]:
# Print the nested schema of the annotation column to see which field holds
# the detected language code (the next cell reads `language.result`).
language.select('language').printSchema()

root
 |-- language: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)



In [25]:
import pyspark.sql.functions as F

# Flatten the array of annotation results into one language code per row,
# keeping only that single "predicted" column.
predicted_language = language.select(
    F.explode('language.result').alias('predicted')
)

predicted_language.show(n=10)

+---------+
|predicted|
+---------+
|       en|
|       en|
|       en|
|       en|
|       en|
|       pt|
|       en|
|       en|
|       en|
|       en|
+---------+
only showing top 10 rows



#### Write results

In [27]:
# Spark's DataFrameWriter.csv would create a *directory* of part files named
# "language_predictions.csv" (and fail on re-run because the path exists).
# The reviews originated from an in-memory pandas frame, so collecting the
# predictions to the driver is safe and yields one real, overwritable file.
# header=False keeps the content identical to Spark's default CSV output.
predicted_language.toPandas().to_csv(
    "language_predictions.csv", header=False, index=False)

In [28]:
import os
import shutil

from google.colab import files

output_path = 'language_predictions.csv'
if os.path.isdir(output_path):
    # A directory at this path (as produced by Spark's DataFrameWriter.csv)
    # cannot be downloaded as a file, so bundle its part files into a zip.
    # make_archive appends ".zip" to the base name and returns the new path.
    files.download(shutil.make_archive(output_path, 'zip', output_path))
else:
    files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>