<a href="https://colab.research.google.com/github/Vitor-Sallenave/Formacao-em-NLP/blob/main/Spark/NLP_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ***⚠️ The following code must be run on a big data platform! Use Databricks notebooks to run it:***

<br>

## [Databricks Community Edition](https://community.cloud.databricks.com/login.html)

In [None]:
!pip install pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import Tokenizer, StringIndexer, Word2Vec
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# `spark` is the SparkSession object used by every later cell; "nlp" is the
# application name. getOrCreate() reuses an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("nlp")
    .getOrCreate()
)

In [None]:
# Load every row and column of the registered table "spam" into a DataFrame
# by running a SQL query through the Spark session.
spam = spark.sql(
    "select * from spam"
)

In [None]:
# Preview the first 5 rows; long cell values are truncated in the printout
spam.show(n=5, truncate=True)

+--------+--------------------+
|Category|             Message|
+--------+--------------------+
|     ham|Go until jurong p...|
|     ham|Ok lar... Joking ...|
|    spam|Free entry in 2 a...|
|     ham|U dun say so earl...|
|     ham|Nah I don't think...|
+--------+--------------------+
only showing top 5 rows



In [None]:
# Encode the text labels in "Category" as numeric class indices in a new
# "CategoryIndex" column (in the preview below, ham -> 0.0 and spam -> 1.0).
stringIdx = StringIndexer(
    inputCol="Category",
    outputCol="CategoryIndex",
)
# Fitting learns the label -> index mapping; transforming appends the new column.
stringIdx_model = stringIdx.fit(spam)
spam_indexed = stringIdx_model.transform(spam)
spam_indexed.show(n=5)

+--------+--------------------+-------------+
|Category|             Message|CategoryIndex|
+--------+--------------------+-------------+
|     ham|Go until jurong p...|          0.0|
|     ham|Ok lar... Joking ...|          0.0|
|    spam|Free entry in 2 a...|          1.0|
|     ham|U dun say so earl...|          0.0|
|     ham|Nah I don't think...|          0.0|
+--------+--------------------+-------------+
only showing top 5 rows



In [None]:
# Split each "Message" into a list of tokens, stored in a new "MessageTokens" column
tk = Tokenizer(
    inputCol="Message",
    outputCol="MessageTokens",
)
spam_tokens = tk.transform(spam_indexed)
spam_tokens.show(n=5)

+--------+--------------------+-------------+--------------------+
|Category|             Message|CategoryIndex|       MessageTokens|
+--------+--------------------+-------------+--------------------+
|     ham|Go until jurong p...|          0.0|[go, until, juron...|
|     ham|Ok lar... Joking ...|          0.0|[ok, lar..., joki...|
|    spam|Free entry in 2 a...|          1.0|[free, entry, in,...|
|     ham|U dun say so earl...|          0.0|[u, dun, say, so,...|
|     ham|Nah I don't think...|          0.0|[nah, i, don't, t...|
+--------+--------------------+-------------+--------------------+
only showing top 5 rows



In [None]:
# Project only the token column and preview its first 5 rows
(
    spam_tokens
    .select("MessageTokens")
    .show(n=5)
)

+--------------------+
|       MessageTokens|
+--------------------+
|[go, until, juron...|
|[ok, lar..., joki...|
|[free, entry, in,...|
|[u, dun, say, so,...|
|[nah, i, don't, t...|
+--------------------+
only showing top 5 rows



In [None]:
# Creating the vectors: learn word embeddings from "MessageTokens" and write a
# feature vector for each message into the new "Messages2Vec" column.
# Word2Vec training is randomized, so a fixed seed keeps the vectors (and every
# downstream result) reproducible across kernel restarts.
word2vec = Word2Vec(inputCol="MessageTokens", outputCol="Messages2Vec", seed=42)
spam_vectors = word2vec.fit(spam_tokens).transform(spam_tokens)
spam_vectors.show(5)

In [None]:
# "Messages2Vec" is added by the Word2Vec transform, so it exists only on
# spam_vectors; spam_tokens does not have this column.
spam_vectors.select("Messages2Vec").show(5)

In [None]:
# Splitting the data into train (~70%) and test (~30%) sets.
# The seed makes the random split repeatable, so metrics are comparable between runs.
spam_train, spam_test = spam_vectors.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Creating and training the model.
# The imported class is RandomForestClassifier (there is no `RandomForest` in scope).
rf_estimator = RandomForestClassifier(
    labelCol="CategoryIndex",
    featuresCol="Messages2Vec",
    numTrees=500,
    seed=42,  # fixed seed so the trained forest is reproducible
)
# fit() returns the trained model; the estimator itself has no transform().
# Bind the fitted model to `rf` so the prediction cell's rf.transform(...) works.
rf = rf_estimator.fit(spam_train)

In [None]:
# Score the held-out test set with `rf`, then preview the first 10 scored rows
predictions = rf.transform(
    spam_test
)
predictions.show(n=10)

In [None]:
# Evaluating the model with the area under the ROC curve.
# The ROC curve must be built from the classifier's scores ("rawPrediction"),
# not from the hard 0/1 labels in "prediction", which collapse the curve to a
# single operating point and distort the AUC.
bce = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="CategoryIndex",
    metricName="areaUnderROC",
)
result = bce.evaluate(predictions)
print(result)