In [1]:
pip install spark-nlp==3.3.4

Collecting spark-nlp==3.3.4
  Downloading https://files.pythonhosted.org/packages/48/8c/0d83c7e606651d0bec6c8c9a03bf3acbe1d9fbf8f840aa115505222e6328/spark_nlp-3.3.4-py2.py3-none-any.whl (133kB)
[K    100% |████████████████████████████████| 143kB 6.5MB/s eta 0:00:01
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-3.3.4
Note: you may need to restart the kernel to use updated packages.


In [2]:
spark

In [3]:
pip install numpy pandas nltk

Collecting numpy
  Downloading https://files.pythonhosted.org/packages/45/b2/6c7545bb7a38754d63048c7696804a0d947328125d81bf12beaa692c3ae3/numpy-1.19.5-cp36-cp36m-manylinux1_x86_64.whl (13.4MB)
[K    100% |████████████████████████████████| 13.4MB 98kB/s  eta 0:00:01  4% |█▋                              | 665kB 36.2MB/s eta 0:00:01    32% |██████████▎                     | 4.3MB 36.2MB/s eta 0:00:01    58% |██████████████████▋             | 7.8MB 37.0MB/s eta 0:00:01
[?25hCollecting pandas
  Downloading https://files.pythonhosted.org/packages/c3/e2/00cacecafbab071c787019f00ad84ca3185952f6bb9bca9550ed83870d4d/pandas-1.1.5-cp36-cp36m-manylinux1_x86_64.whl (9.5MB)
[K    100% |████████████████████████████████| 9.5MB 142kB/s eta 0:00:01
[?25hCollecting nltk
  Downloading https://files.pythonhosted.org/packages/c5/ea/84c7247f5c96c5a1b619fe822fb44052081ccfbe487a49d4c888306adec7/nltk-3.6.7-py3-none-any.whl (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 921kB/s eta 0:00:01
[?25

# Imports

In [4]:
from sparknlp.annotator import *
import pyspark.sql.functions as f
from pyspark.sql import Window
import pyspark.sql.types as t
from pyspark.ml.feature import Tokenizer as pysparkTokenizer, HashingTF, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner, PerceptronModel, Chunker
from pyspark.ml.clustering import LDA
from nltk.corpus import stopwords
import pandas as pd
import nltk
import sparknlp
nltk.download('stopwords')

from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.functions import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
print("Spark NLP version", sparknlp.version())

Spark NLP version 3.3.4


## Preprocess tweeter data

In [6]:
df_tweet = spark.read.option("tableName", "Tweets").format("dynamodb").load().select(f.col("text"))
df_tweet = df_tweet.withColumn("user", regexp_extract(col("text"), r"@([A-Za-z0-9]+)", 1))
df_tweet = (df_tweet.withColumn('text', regexp_replace('text', r"@[A-Za-z0-9]+", ''))
            .withColumn('text', regexp_replace('text', "RT[\s:]+", '')))
df_tweet.show()
f"Lenght: {df_tweet.count()}"

+--------------------+--------------+
|                text|          user|
+--------------------+--------------+
| You really like ...|   TomCottonAR|
| ...eyes off you ...|   TomCottonAR|
| ...taxes for you...|   TomCottonAR|
|This is a very tr...|       tedcruz|
|_JoeManchin I app...|           Sen|
| - Nearly 1 Milli...|         POTUS|
|_JoeManchin is gi...|           Sen|
| Big Pharma compa...|  amyklobuchar|
| didn't he lie an...|HillaryClinton|
| The Name Ray Epp...|HillaryClinton|
|There is somethin...|  amyklobuchar|
|I know folks are ...|         POTUS|
|                  👀|       tedcruz|
|This is a very tr...|       tedcruz|
|A son of Searchli...|         POTUS|
|In honor of forme...|    SenSchumer|
|_JoeManchin Where...|           Sen|
| https://t.co/qvR...|      RandPaul|
| https://t.co/9kC...|      RandPaul|
| Republicans cons...|      HouseGOP|
+--------------------+--------------+
only showing top 20 rows



'Lenght: 14169'

In [8]:
df_acc = spark.read.option("header","true").csv("../accounts.csv").select("party", "username").distinct()
df_acc.show()
f"Lenght: {df_acc.count()}"

+-----+---------------+
|party|       username|
+-----+---------------+
|    D|  SenWhitehouse|
|    D| SenCoonsOffice|
|    R|     ThomTillis|
|    D|    SenStabenow|
|    R|RoundsforSenate|
|    D|       timkaine|
|    D|    SenJackReed|
|    R|  JohnnyIsakson|
|    D|      SenBooker|
|    R|  SenJohnHoeven|
|    R|  TheBushCenter|
|    R|RepublicanStudy|
|    R|SenatorLankford|
|    D| SenJeffMerkley|
|    D| MurrayCampaign|
|    R|   SenTomCotton|
|    R|     TeamCornyn|
|    R|    RogerWicker|
|    D|      Bob_Casey|
|    R|    TomCottonAR|
+-----+---------------+
only showing top 20 rows



'Lenght: 198'

In [9]:
df_tweet = df_tweet.join(df_acc, df_tweet.user == df_acc.username).select('party', 'text')
df_tweet.show()

+-----+--------------------+
|party|                text|
+-----+--------------------+
|    R| You really like ...|
|    R| ...eyes off you ...|
|    R| ...taxes for you...|
|    R|This is a very tr...|
|    D| - Nearly 1 Milli...|
|    D| Big Pharma compa...|
|    D| didn't he lie an...|
|    D| The Name Ray Epp...|
|    D|There is somethin...|
|    D|I know folks are ...|
|    R|                  👀|
|    R|This is a very tr...|
|    D|A son of Searchli...|
|    D|In honor of forme...|
|    R| https://t.co/qvR...|
|    R| https://t.co/9kC...|
|    R| Republicans cons...|
|    R|How to steal an e...|
|    D| A day after Pres...|
|    D| F**k.  Off. http...|
+-----+--------------------+
only showing top 20 rows



## Preprocess reddit data

In [10]:
df_reddit = spark.read.option("tableName", "RedditPosts").format("dynamodb").load().select(f.col("submission_id").alias("text"), f.col("subreddit"))
df_reddit = df_reddit.withColumn("party", initcap(col('subreddit')).substr(1,1)).select("party", "text")
df_reddit.show()
f"Lenght: {df_reddit.count()}"

+-----+--------------------+
|party|                text|
+-----+--------------------+
|    D|Lincoln County lo...|
|    D|Harris charts her...|
|    D|Rudy Giuliani and...|
|    D|Thousands of Russ...|
|    D|A Capitol rioter ...|
|    D|19-year-old charg...|
|    D|Harris says Ameri...|
|    D|Harris says Ameri...|
|    D|America is now in...|
|    D|Alleged ‘dead’ Ge...|
|    D|America is now in...|
|    D|Alleged ‘dead’ Ge...|
|    D|#TBT: The First K...|
|    D|#TBT: The First K...|
|    D|Joe Biden's admin...|
|    D|Joe Biden's admin...|
|    D|Georgia Republica...|
|    D|What Commitment t...|
|    D|Supreme Court: Ju...|
|    D|Georgia Republica...|
+-----+--------------------+
only showing top 20 rows



'Lenght: 178'

## Join both sources

In [11]:
df = df_tweet.limit(1000).union(df_reddit)
df.show()
f"Lenght: {df.count()}"

+-----+--------------------+
|party|                text|
+-----+--------------------+
|    R| You really like ...|
|    R| ...eyes off you ...|
|    R| ...taxes for you...|
|    R|This is a very tr...|
|    D| - Nearly 1 Milli...|
|    D| Big Pharma compa...|
|    D| didn't he lie an...|
|    D| The Name Ray Epp...|
|    D|There is somethin...|
|    D|I know folks are ...|
|    R|                  👀|
|    R|This is a very tr...|
|    D|A son of Searchli...|
|    D|In honor of forme...|
|    R| https://t.co/qvR...|
|    R| https://t.co/9kC...|
|    R| Republicans cons...|
|    R|How to steal an e...|
|    D| A day after Pres...|
|    D| F**k.  Off. http...|
+-----+--------------------+
only showing top 20 rows



'Lenght: 1178'

## Train test dataset

In [12]:
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [13]:
train.groupBy("party") \
    .count() \
    .show()

+-----+-----+
|party|count|
+-----+-----+
|    D|  539|
|    R|  330|
+-----+-----+



In [14]:
test.groupBy("party") \
    .count() \
    .show()

+-----+-----+
|party|count|
+-----+-----+
|    D|  198|
|    R|  111|
+-----+-----+



In [15]:
# actual content is inside description column
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

In [16]:
embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L8_512") \
      .setInputCols("document") \
      .setOutputCol("sentence_embeddings")

sent_small_bert_L8_512 download started this may take some time.
Approximate size to download 149.1 MB
[OK!]


In [17]:
# bert_sent = BertSentenceEmbeddings.pretrained('sent_small_bert_L8_512')\
#     .setInputCols( ["document"])\
#     .setOutputCol("sentence_embeddings")

In [18]:
# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
    .setInputCols (["sentence_embeddings"])\
    .setOutputCol("class")\
    .setLabelColumn("party")\
    .setMaxEpochs (20)\
    .setEnableOutputLogs (True)

In [19]:
bert_sent_clf_pipeline = Pipeline(
    stages = [
        document
        #embeddings,
        #classsifierdl
    ])

In [None]:
bert_Model = bert_sent_clf_pipeline.fit(train)

In [None]:
preds = bert_Model.transform(test)

In [None]:
preds.show()