In [1]:
import sparknlp 

spark = sparknlp.start()

print("Spark NLP version")
sparknlp.version()
print("Apache Spark version")
spark.version

Spark NLP version
Apache Spark version


'2.4.5'

In [3]:
import sys
!{sys.executable} -m pip install findspark



In [6]:
import sys
!{sys.executable} -m pip install pandas



In [7]:
import sys
!{sys.executable} -m pip install spark-nlp==2.5.0 pyspark==2.4.4

Processing /home/amin/.cache/pip/wheels/06/e8/1c/37ed9ed9f29a039ca301c5bf9810128334a69fde67cf5488ff/pyspark-2.4.4-py2.py3-none-any.whl
Installing collected packages: pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 2.4.5
    Can't uninstall 'pyspark'. No files were found to uninstall.
Successfully installed pyspark-2.4.4


In [2]:
spark

In [3]:
raw_tweets = spark.read.format("csv").option("inferSchema", 'true').option("header", 'false').option("sep", ",").load('tweets.csv')

In [4]:
raw_tweets.printSchema

<bound method DataFrame.printSchema of DataFrame[_c0: int, _c1: bigint, _c2: string, _c3: string, _c4: string, _c5: string]>

In [5]:
from pyspark.sql.functions import col 
raw_tweets = raw_tweets.select(col('_c0'),col('_c5'))\
.withColumnRenamed('_c0', 'Label')\
.withColumnRenamed('_c5', 'Text')\
.dropDuplicates()

In [6]:
raw_tweets.count()

1583691

In [7]:
raw_tweets.show(5)

+-----+--------------------+
|Label|                Text|
+-----+--------------------+
|    0|I feel like a com...|
|    0|@KishoreK this is...|
|    0|@InYourEyes2410 I...|
|    0|       A little sad |
|    0|I'm off too bed. ...|
+-----+--------------------+
only showing top 5 rows



In [8]:
from pyspark.ml.feature import SQLTransformer

In [9]:
import sparknlp
from sparknlp.annotator import *
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.ml import Pipeline


In [10]:
document_assembler = DocumentAssembler().setInputCol("Text").setOutputCol("document")

sentenceDetector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentences").setUseAbbreviations(True)

tokenizer = Tokenizer().setInputCols(["sentences"]).setOutputCol("token")

normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normal")

stop_word = StopWordsCleaner().setInputCols(["normal"]).setOutputCol('clean')

stemmer = Stemmer().setInputCols(['clean']).setOutputCol('stem')

glove_embeddings = WordEmbeddingsModel().pretrained().setInputCols(["document","stem"]).setOutputCol("embeddings")

embed_sentence = SentenceEmbeddings().setInputCols(["document","embeddings"]).setOutputCol("sent_embed")\
.setPoolingStrategy('AVERAGE')

embeddings_finisher = EmbeddingsFinisher().setInputCols('sent_embed').setOutputCols('finished_sentence_embeddings')

#explodeVectors = SQLTransformer(statement = 
                               #"SELECT EXPLODE(finished_sentence_embeddings) AS features, FROM__THIS__")

#finisher = Finisher().setInputCols(['stem']).setOutputCols('ntokens')\
#.setOutputAsArray(True).setCleanAnnotations(True)

nlpPipeline = Pipeline(stages=[
 document_assembler, 
 sentenceDetector,
 tokenizer,
 normalizer,
 stop_word,   
 stemmer,
 glove_embeddings,
 embed_sentence,
 embeddings_finisher,
 #explodeVectors,
 ])

pipelineModel = nlpPipeline.fit(raw_tweets)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [11]:
processed = pipelineModel.transform(raw_tweets)
print(processed.count())

1583691


In [15]:
processed.select("sentences").take(1)

[Row(sentences=[Row(annotatorType='document', begin=0, end=28, result='I feel like a complete idiot.', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='document', begin=30, end=90, result="I'm the only one who doesn't get how this shit works  help me", metadata={'sentence': '1'}, embeddings=[])])]

In [16]:
from pyspark.sql.functions import explode

data = processed.withColumn("features", explode(processed.finished_sentence_embeddings))

In [19]:
data.select('features').take(1)

[Row(features=[-0.07529754191637039, 0.13422273099422455, 0.3415910601615906, -0.3557145297527313, -0.20880037546157837, 0.03789282217621803, -0.2928003668785095, 0.042057182639837265, 0.14446181058883667, -0.09094779193401337, 0.12380874156951904, 0.09928963333368301, 0.23074579238891602, -0.12493426352739334, -0.09839699417352676, 0.0022225461434572935, 0.10741699486970901, 0.27960482239723206, -0.13010726869106293, 0.392767071723938, 0.1709136813879013, 0.3062072694301605, -0.12738752365112305, -0.2340545505285263, 0.03396772965788841, 0.11415155231952667, -0.25131019949913025, -0.30153873562812805, 0.2542422115802765, -0.21709389984607697, -0.2400454580783844, 0.27437493205070496, 0.09648680686950684, 0.11824581027030945, 0.011344101279973984, 0.32770535349845886, -0.1969478279352188, 0.2630782723426819, 0.06568169593811035, -0.23244719207286835, -0.11291618645191193, 0.06451380997896194, -0.15443290770053864, -0.2522612512111664, -0.47303545475006104, -0.2834317982196808, -0.11356

In [20]:
#from org.apache.spark.ml.linal import Vector, Vectors
from pyspark.ml.linalg import Vector, Vectors, VectorUDT
from pyspark.sql.functions import udf
from pyspark.sql.functions import explode

@udf(returnType = VectorUDT())
def convertToVectorUDF(matrix):
    return Vectors.dense(matrix.toArray.map(_.toDouble))


# Now let's explode the sentence_embeddings column and have a new feature column for Spark ML

#data = data.select('explode("sent_embed.embeddings").alias("sentence_embedding")').withColumn("features", convertToVectorUDF("sentence_embedding"))  
data = data.withColumn("features1", convertToVectorUDF("features")) 

In [25]:
data.select('features').take(2)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handl

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-480b2b2cab8c>", line 1, in <mo

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-480b2b2cab8c>", line 1, in <mo

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-480b2b2cab8c>", line 1, in <mo

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-480b2b2cab8c>", line 1, in <mo

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-480b2b2cab8c>", line 1, in <mo

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-480b2b2cab8c>", line 1, in <mo

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-480b2b2cab8c>", line 1, in <mo

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-480b2b2cab8c>", line 1, in <mo

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-480b2b2cab8c>", line 1, in <mo

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-480b2b2cab8c>", line 1, in <mo

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45055)
Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amin/anaconda3/envs/sparknlp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-480b2b2cab8c>", line 1, in <mo

Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:45055)

In [None]:
train,test = processed.randomSplit(weights=[0.7,0.3], seed = 102)

In [None]:
print(train.count())
print(test.count())

In [None]:
from pyspark.ml.classification import LogisticRegression

# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# Train model with Training Data
lrModel = lr.fit(train)

# get training summary used for eval metrics and other params
lrTrainingSummary = lrModel.summary

In [None]:
# make predictions on test data
lrPredictions = lrModel.transform(testData)

# display predictions
lrPredictions.select("label", "prediction", "probability").limit(10).toPandas()