In [1]:

import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp==2.6.2

openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)
Collecting pyspark==2.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)
[K     |████████████████████████████████| 215.7MB 54kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 43.6MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130389 sha256=4628362ab13f64334b2f94911e0a9563e12d058b01cfb59bd18d6b87cd544cb2
  Stored in directory: /root/.cache/pip/wheels/ab/09/4d/0d18423005

In [2]:
import sparknlp
# Start a Spark session through Spark NLP's helper and report both versions.
spark = sparknlp.start()
print(f"Version of SparkNLP: {sparknlp.version()}")
print(f"Version of Spark : {spark.version}")

Version of SparkNLP: 2.6.2
Version of Spark : 2.4.4


In [3]:
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [4]:
import pandas as pd

# After uploading the .gz file, load it with pandas and clean it so that every
# review occupies a single physical line in the CSV written below.
dfPandas = (
    pd.read_csv('reviews.csv.gz', compression='gzip')
    .dropna()  # discard rows that have a missing value in any column
    .assign(
        # Collapse each run of CR/LF characters into one space. Deleting them
        # outright (the previous behavior) glued the last word of a line onto
        # the first word of the next line, corrupting the review text.
        comments=lambda frame: frame['comments'].str.replace(
            r'[\r\n]+', ' ', regex=True)
    )
)
dfPandas.to_csv('review_data.csv', index=False)

In [5]:
# Review data (the cleaned CSV written by the pandas cell)
fileName = "review_data.csv"

# Load into Spark.
# The reader receives the path directly, so the file is not registered with
# sparkContext.addFile here: nothing in this cell reads it back through
# SparkFiles.get, and the call was the line previously flagged as failing.
#
# pandas escapes a quote inside a quoted field by doubling it (""), while
# Spark's CSV reader uses a backslash as the escape character by default.
# Declaring '"' as the escape keeps comments that contain quotes and commas
# in a single column instead of spilling into the neighbouring ones.
dfSpark = spark.read.csv(fileName, header=True, quote='"', escape='"')

dfSpark.show(5)

+----------+------+----------+-----------+-------------+--------------------+
|listing_id|    id|      date|reviewer_id|reviewer_name|            comments|
+----------+------+----------+-----------+-------------+--------------------+
|     17878| 64852|2010-07-15|     135370|          Tia|This apartment is...|
|     17878| 76744|2010-08-11|      10206|         Mimi|we had a really g...|
|     17878| 91074|2010-09-06|      80253|          Jan|Staying in Max ap...|
|     17878|137528|2010-11-12|     230449|        Orene|In general very g...|
|     17878|147594|2010-12-01|     219338|        David|The apt was nice ...|
+----------+------+----------+-----------+-------------+--------------------+
only showing top 5 rows



In [9]:
# Wrap the raw "comments" text in a "document" annotation column
documentAssembler = (
    DocumentAssembler()
    .setInputCol("comments")
    .setOutputCol("document")
)

# Detect language
# The detector has to read the assembler's output column ("document").
# It was previously wired to the raw "comments" string column, which is not an
# annotation column and does not match how the detector is used further down.
detectLanguage = (
    LanguageDetectorDL()
    .pretrained('ld_wiki_20', lang='xx')
    .setInputCols(['document'])
    .setOutputCol('language')
    .setCoalesceSentences(False)
    .setThreshold(.3)
)

# Full pipeline: assemble documents first, then run language detection on them
pipeline = Pipeline(
    stages=[documentAssembler,
            detectLanguage])


ld_wiki_20 download started this may take some time.
Approximate size to download 3 MB
[OK!]


In [21]:
# Build the assembler that turns the "comments" column into a "document" column
documentAssembler = (
    DocumentAssembler()
    .setInputCol("comments")
    .setOutputCol("document")
)

# Run it over the Spark DataFrame and preview five rows,
# cutting every cell off at 60 characters
assembled = documentAssembler.transform(dfSpark)
assembled.show(5, truncate=60)

+----------+------+----------+-----------+-------------+------------------------------------------------------------+------------------------------------------------------------+
|listing_id|    id|      date|reviewer_id|reviewer_name|                                                    comments|                                                    document|
+----------+------+----------+-----------+-------------+------------------------------------------------------------+------------------------------------------------------------+
|     17878| 64852|2010-07-15|     135370|          Tia|This apartment is in a perfect location -- two blocks fro...|[[document, 0, 396, This apartment is in a perfect locati...|
|     17878| 76744|2010-08-11|      10206|         Mimi|we had a really great experience staying in Max's apartme...|[[document, 0, 325, we had a really great experience stay...|
|     17878| 91074|2010-09-06|      80253|          Jan|Staying in Max appartment is like living in a cos

In [23]:
# Language detector: loads the pretrained 'ld_wiki_20' model, reads the
# "document" column and writes its annotations to a "language" column
detectLanguage = (
    LanguageDetectorDL()
    .pretrained('ld_wiki_20', lang='xx')
    .setInputCols(['document'])
    .setOutputCol('language')
    .setCoalesceSentences(False)
    .setThreshold(.3)
)

# Apply the detector to the assembled documents and preview five rows
results = detectLanguage.transform(assembled)
results.show(5)

ld_wiki_20 download started this may take some time.
Approximate size to download 3 MB
[OK!]
+----------+------+----------+-----------+-------------+--------------------+--------------------+--------------------+
|listing_id|    id|      date|reviewer_id|reviewer_name|            comments|            document|            language|
+----------+------+----------+-----------+-------------+--------------------+--------------------+--------------------+
|     17878| 64852|2010-07-15|     135370|          Tia|This apartment is...|[[document, 0, 39...|[[language, 0, 39...|
|     17878| 76744|2010-08-11|      10206|         Mimi|we had a really g...|[[document, 0, 32...|[[language, 0, 32...|
|     17878| 91074|2010-09-06|      80253|          Jan|Staying in Max ap...|[[document, 0, 38...|[[language, 0, 38...|
|     17878|137528|2010-11-12|     230449|        Orene|In general very g...|[[document, 0, 52...|[[language, 0, 52...|
|     17878|147594|2010-12-01|     219338|        David|The apt was

In [25]:
# Show the first five "language" annotations in full (no column truncation)
language_annotations = results.select('language')
language_annotations.show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|language                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+---------------------------------------------------------------------------------------------------------------------