In [1]:
import numpy as np
import pandas as pd
import time
import math
import threading
from nltk.corpus import stopwords

from pyspark import SparkContext
from pyspark import Row
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml.feature import Word2Vec
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors, DenseVector
from pyspark.ml.feature import PCA
from pyspark.sql.functions import split, udf, struct, array, col, lit

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
import word2vecUtilities as wvu
from pyspark.streaming import StreamingContext
from pyspark.sql.types import IntegerType

In [3]:
# Helper thread in order to have a stream running in the background in Jupyter

from threading import Thread

class StreamingThread(Thread):
    """Run a Spark ``StreamingContext`` on a background thread.

    Starting the context on its own thread keeps the notebook cell from
    blocking on ``awaitTermination()``.

    Parameters
    ----------
    ssc : the streaming context this thread starts and later stops.
    """

    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Start the wrapped context and block this thread until it terminates."""
        # Use the context given to the constructor; relying on a notebook-global
        # `ssc` would ignore the argument and fail when no such global exists.
        self.ssc.start()
        self.ssc.awaitTermination()

    def stop(self):
        """Stop the stream gracefully while leaving the SparkContext running."""
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

# Word2Vec and XGBoost with Tweets

## Note: C++ 2019 is needed to load the gensim package (for Google's word2vec)


In [9]:
# Location of the collected tweets.
# NOTE(review): this is a machine-specific absolute path; adjust it for your environment.
TWEETS_PATH = 'file:///C:/Users/ignac/Documents/GitHub/Advanced-Analytics-in-Business/spark/code_id_idf/lots_of_data/tweets'

# `sc` and `spark` are not created in this notebook; presumably the PySpark
# kernel provides them -- confirm before running on a fresh kernel.
data = sc.textFile(TWEETS_PATH)

# Parse the text lines as JSON records and preview the first 20 rows.
df = spark.read.json(data)
df.show(20)

+--------+-------------------+--------------------+
|   label|           tweet_id|          tweet_text|
+--------+-------------------+--------------------+
|  #biden|1380150333767262211|#███████ bubbles ...|
|  #biden|1380150113650274308|Nothing to see he...|
|#vaccine|1380150618275389448|Well that was qui...|
|#vaccine|1380150554974904321|Good morning, Twi...|
|#vaccine|1380150530526306314|Here for my secon...|
|#vaccine|1380150486339366928|You don't have a ...|
|#vaccine|1380150434208358402|Because only cert...|
|#vaccine|1380150386863013894|#███████
#███████...|
|#vaccine|1380150373399339009|Proud to see thou...|
|  #covid|1380150776299937795|The pandemic is n...|
|  #covid|1380150698948575232|Tandon's next Emp...|
|  #covid|1380150671123623942|2 days virtual Co...|
|  #covid|1380150657823469573|Toronto ICU doc: ...|
|  #covid|1380150622796742658|India records 4th...|
|  #covid|1380150957300969473|When #███████ hit...|
|  #covid|1380150954503381000|With low literacy...|
|  #covid|13

In [10]:
from pyspark.sql.functions import when,regexp_replace
# Hashtag -> integer code that replaces the textual label.
hashtag_codes = [
    ("#biden", 1),
    ("#inflation", 2),
    ("#china", 3),
    ("#stopasianhate", 4),
    ("#covid", 5),
    ("#vaccine", 6),
]

# Build the CASE WHEN expression one branch at a time, in table order.
first_tag, first_code = hashtag_codes[0]
label_expr = when(df.label == first_tag, first_code)
for tag, code in hashtag_codes[1:]:
    label_expr = label_expr.when(df.label == tag, code)

# Null labels become an empty string; any other hashtag is kept unchanged.
label_expr = label_expr.when(df.label.isNull(), "").otherwise(df.label)
df2 = df.withColumn("label", label_expr)

# Strip hashtags and @mentions from the text
# (presumably so the label hashtag cannot leak into the features -- confirm).
df2 = df2.withColumn('tweet_text', regexp_replace('tweet_text', r'[#@][^\s#@]+', ''))
df2.show(20, 100)

+-----+-------------------+----------------------------------------------------------------------------------------------------+
|label|           tweet_id|                                                                                          tweet_text|
+-----+-------------------+----------------------------------------------------------------------------------------------------+
|    1|1380150333767262211|  bubbles under surface....

|    1|1380150113650274308|Nothing to see here...Five people killed in  including 2 children, by a gunman, who has been capt...|
|    6|1380150618275389448|                                Well that was quick!! Poked by Pfizer...     https://t.co/7jJfkHc0xf|
|    6|1380150554974904321|        Good morning, Twitterverse. My kids are getting their first shots today, and I am ecstatic! |
|    6|1380150530526306314|      Here for my second   jab and the place is hopping!! Excellent!! Yay    https://t.co/9RuTTOwwLo|
|    6|1380150486339366928|You don't have

In [11]:
# Tokenize the tweet texts

from pyspark.ml.feature import  Tokenizer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import  IDF

# Add a `tweet_text_tokens` column holding the tokens of each cleaned tweet.
tokenizer = Tokenizer(outputCol="tweet_text_tokens", inputCol="tweet_text")
data_words = tokenizer.transform(df2)

data_words.show()

+-----+-------------------+--------------------+--------------------+
|label|           tweet_id|          tweet_text|   tweet_text_tokens|
+-----+-------------------+--------------------+--------------------+
|    1|1380150333767262211| bubbles under su...|[, bubbles, under...|
|    1|1380150113650274308|Nothing to see he...|[nothing, to, see...|
|    6|1380150618275389448|Well that was qui...|[well, that, was,...|
|    6|1380150554974904321|Good morning, Twi...|[good, morning,, ...|
|    6|1380150530526306314|Here for my secon...|[here, for, my, s...|
|    6|1380150486339366928|You don't have a ...|[you, don't, have...|
|    6|1380150434208358402|Because only cert...|[because, only, c...|
|    6|1380150386863013894|





Ocugen: Pot...|[, , , , , , ocug...|
|    6|1380150373399339009|Proud to see thou...|[proud, to, see, ...|
|    5|1380150776299937795|The pandemic is n...|[the, pandemic, i...|
|    5|1380150698948575232|Tandon's next Emp...|[tandon's, next, ...|
|    5|1380150671123

In [12]:
# Collect the tokenised tweets into a pandas frame and make `label` categorical.
# Note: `df` is rebound here from the Spark frame to the pandas frame.
df = data_words.toPandas().assign(
    label=lambda frame: frame['label'].astype('category')
)

print(df.dtypes)

label                category
tweet_id                int64
tweet_text             object
tweet_text_tokens      object
dtype: object


In [13]:
# Keep the category values so integer codes can be mapped back to labels later,
# then replace the categorical column by its integer codes.
label_column = df['label']
label_mapping = label_column.cat.categories
df['label'] = label_column.cat.codes

# Features are the token lists; the target is the encoded label.
X, y = df['tweet_text_tokens'], df['label']


In [14]:
import os
from subprocess import call
# Hold out 20% of the tweets; the fixed seed keeps the split reproducible.
test_size, random_state = 0.2, 1234

# Stratify on the label so each class keeps its share in both subsets.
split = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
    stratify=y,
)
X_train, X_test, y_train, y_test = split


In [15]:
# Number of non-null documents in the training split.
X_train.count()


1038

In [90]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec


class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """
    scikit-learn transformer that embeds tokenised documents with gensim Word2Vec.

    `fit` trains a Word2Vec model on the documents; `transform` averages the
    vectors of each document's in-vocabulary words to create the document-level
    features. Documents with no in-vocabulary word map to the zero vector.

    gensim's own gensim.sklearn_api.W2VTransformer doesn't support out of
    vocabulary words, hence we roll out our own.

    All the parameters are gensim.models.Word2Vec's parameters, using the
    gensim 3.x names (`size`, `iter`); they are translated to `vector_size` /
    `epochs` when the installed gensim expects those names.
    https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
    """

    def __init__(self, size=300, alpha=0.1, window=10, min_count=10, max_vocab_size=None,
                 sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                 ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=10, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=1000, compute_loss=False,
                 callbacks=(), max_final_vocab=None):
        # Only store the hyper-parameters, as scikit-learn's get_params/clone expect.
        self.size = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative
        self.ns_exponent = ns_exponent
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.iter = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

    def fit(self, X, y=None):
        """Train the Word2Vec model on `X`, an iterable of token lists.

        `y` is ignored; it is accepted for scikit-learn pipeline compatibility.
        Returns `self` with the trained model stored in `model_`.
        """
        import inspect
        # Import locally under an unambiguous alias: this notebook also imports
        # pyspark.ml.feature.Word2Vec, so the bare global name `Word2Vec` depends
        # on which cell was executed last.
        from gensim.models import Word2Vec as GensimWord2Vec

        w2v_kwargs = dict(
            sentences=X, corpus_file=None,
            alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab,
            batch_words=self.batch_words, compute_loss=self.compute_loss,
            callbacks=self.callbacks, max_final_vocab=self.max_final_vocab)

        # gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`;
        # pass whichever pair the installed constructor accepts.
        accepted = inspect.signature(GensimWord2Vec.__init__).parameters
        if 'vector_size' in accepted:
            w2v_kwargs.update(vector_size=self.size, epochs=self.iter)
        else:
            w2v_kwargs.update(size=self.size, iter=self.iter)

        self.model_ = GensimWord2Vec(**w2v_kwargs)
        return self

    def transform(self, X):
        """Return a float32 array of shape (n_documents, size) of averaged vectors."""
        rows = [self._get_embedding(words) for words in X]
        if not rows:
            # Keep the result two-dimensional even when there are no documents.
            return np.empty((0, self.size), dtype=np.float32)
        return np.array(rows)

    def _vocabulary(self):
        """Return the word -> entry mapping of the trained model (gensim 3.x or 4.x)."""
        wv = self.model_.wv
        # gensim 4 replaced `wv.vocab` with `wv.key_to_index`.
        return wv.key_to_index if hasattr(wv, 'key_to_index') else wv.vocab

    def _get_embedding(self, words):
        """Average the vectors of the in-vocabulary `words`; zeros if there are none."""
        vocab = self._vocabulary()
        valid_words = [word for word in words if word in vocab]
        if not valid_words:
            # Same dtype as the real embeddings so transform() has a
            # data-independent output dtype.
            return np.zeros(self.size, dtype=np.float32)
        # Indexing the keyed vectors with a list returns one row per word.
        return np.mean(self.model_.wv[valid_words], axis=0)



In [91]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

# Document embedding step: 200-dimensional vectors, words seen fewer than 7 times are ignored.
gensim_word2vec_tr = GensimWord2VecVectorizer(
    size=200,
    alpha=0.25,
    min_count=7,
    sg=1,
    iter=10,
)

# Gradient-boosted classifier trained on the averaged embeddings.
xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, n_jobs=-1)

# Chain both steps so fit/predict run embedding and classification together.
w2v_xgb = Pipeline(steps=[('w2v', gensim_word2vec_tr), ('xgb', xgb)])
w2v_xgb

Pipeline(steps=[('w2v',
                 GensimWord2VecVectorizer(alpha=0.25, min_count=7, sg=1,
                                          size=200)),
                ('xgb',
                 XGBClassifier(base_score=None, booster=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None, gamma=None, gpu_id=None,
                               importance_type='gain',
                               interaction_constraints=None, learning_rate=0.1,
                               max_delta_step=None, max_depth=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, n_estimators=100,
                               n_jobs=-1, num_parallel_tree=None,
                               random_state=None, reg_alpha=None,
                               reg_lambda=None, scale_pos_weight=None,
                               subsample=None, tree_metho

In [92]:

# Train both pipeline steps on the training split; fit() returns the pipeline
# itself, so it is displayed as the cell output.
w2v_xgb.fit(X_train, y_train)







Pipeline(steps=[('w2v',
                 GensimWord2VecVectorizer(alpha=0.25, min_count=7, sg=1,
                                          size=200)),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='', learning_rate=0.1,
                               max_delta_step=0, max_depth=6,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=-1, num_parallel_tree=1,
                               objective='multi:softprob', random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
                               subsample=1, tree_method='exact'

In [93]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluate on the data the model was fitted on (an optimistic estimate).
y_train_pred = w2v_xgb.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print('Training set accuracy %s' % train_accuracy)

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_train, y_train_pred)

Training set accuracy 0.9951830443159922


array([[141,   0,   0,   0,   0,   0],
       [  0,  59,   0,   1,   0,   0],
       [  0,   0, 245,   1,   0,   0],
       [  0,   0,   0,  87,   0,   0],
       [  0,   1,   0,   1, 229,   0],
       [  0,   0,   0,   1,   0, 272]], dtype=int64)

In [94]:


# Evaluate on the held-out split.
y_test_pred = w2v_xgb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test set accuracy %s' % test_accuracy)

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_test_pred)




Test set accuracy 0.4807692307692308


array([[16,  1,  6,  1,  6,  5],
       [ 0,  1,  7,  1,  2,  4],
       [ 1,  0, 41,  0, 10,  9],
       [ 0,  0,  2,  4,  6, 10],
       [ 3,  0,  8,  1, 27, 19],
       [ 4,  0, 13,  2, 14, 36]], dtype=int64)

In [22]:


# Inspect the keyed vectors of the Word2Vec model trained inside the pipeline.
word_vectors = w2v_xgb.named_steps['w2v'].model_.wv

# gensim 4 renamed `index2word` to `index_to_key`; support both versions.
if hasattr(word_vectors, 'index_to_key'):
    vocab_words = word_vectors.index_to_key
else:
    vocab_words = word_vectors.index2word

vocab_size = len(vocab_words)
print('vocabulary size:', vocab_size)

# Nearest neighbours of "biden" in the embedding space.
word_vectors.most_similar(positive=['biden'])



vocabulary size: 428


[('east', 0.4596766233444214),
 ('it’s', 0.412503719329834),
 ('plan', 0.3902943432331085),
 ('never', 0.38893717527389526),
 ('every', 0.38804322481155396),
 ('trust', 0.3701404333114624),
 ('actually', 0.3690882921218872),
 ('come', 0.36514076590538025),
 ("doesn't", 0.35984739661216736),
 ('white', 0.3423382639884949)]

In [23]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType
 

In [24]:
# Module-level placeholders; nothing in the visible cells reads them.
models_loaded = False
my_model = None

from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.classification import LogisticRegressionModel
from  pyspark.ml.feature import CountVectorizerModel
from pyspark.ml.feature import  IDFModel 
import numpy.core.defchararray as np_f

    
def process(time, rdd):
    """Classify one streaming micro-batch of tweets and show the predictions.

    Intended as the callback passed to ``DStream.foreachRDD``.

    Parameters
    ----------
    time : batch timestamp supplied by Spark Streaming; only printed.
    rdd : RDD of JSON strings with `label`, `tweet_id` and `tweet_text` fields.

    Returns
    -------
    None. Results are shown on stdout; empty batches are skipped silently.

    Relies on notebook globals: `spark`, `when`, `regexp_replace`, `Tokenizer`,
    `pd` and the fitted `w2v_xgb` pipeline.
    """
    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    # Convert to data frame
    df = spark.read.json(rdd)
    df.show()

    # Same label encoding and text cleaning as in the training cells. The
    # encoded label is only displayed; it is not fed to the model.
    df2 = df.withColumn("label", when(df.label == "#biden", 1)
                        .when(df.label == "#inflation", 2)
                        .when(df.label == "#china", 3)
                        .when(df.label == "#stopasianhate", 4)
                        .when(df.label == "#covid", 5)
                        .when(df.label == "#vaccine", 6)
                        .when(df.label.isNull(), "")
                        .otherwise(df.label))
    df2 = df2.withColumn('tweet_text', regexp_replace('tweet_text', r'[#@][^\s#@]+', ''))

    tokenizer = Tokenizer(inputCol="tweet_text", outputCol="tweet_text_tokens")
    data_words = tokenizer.transform(df2)
    data_words.show()

    # The scikit-learn pipeline works on pandas data, so collect the batch locally.
    tweets_pd = data_words.toPandas()
    predictions = w2v_xgb.predict(tweets_pd['tweet_text_tokens'])

    # Name the column explicitly. The previous `selectExpr("0 as prediction")`
    # selected the integer literal 0 rather than the column named "0", so every
    # row was reported as class 0 whatever the model predicted.
    # NOTE(review): predictions are the model's class codes (the pandas category
    # codes used for training), not the 1-6 values shown in the `label` column.
    result_pd = pd.DataFrame({'prediction': [int(p) for p in predictions]})
    result = spark.createDataFrame(result_pd)
    result.show()



In [25]:
# Micro-batch interval in seconds.
BATCH_INTERVAL_SECONDS = 30
ssc = StreamingContext(sc, batchDuration=BATCH_INTERVAL_SECONDS)

In [26]:
# Read text lines from the tweet socket and hand every micro-batch to process().
lines = ssc.socketTextStream(hostname="seppe.net", port=7778)
lines.foreachRDD(process)

In [27]:
# Start the stream on a helper thread so it runs in the background of the notebook.
ssc_t = StreamingThread(ssc)
ssc_t.start()

In [28]:
# Stop the stream; StreamingThread.stop() requests a graceful stop and keeps the SparkContext alive.
ssc_t.stop()

----- Stopping... this may take a few seconds -----
+----------+-------------------+--------------------+
|     label|           tweet_id|          tweet_text|
+----------+-------------------+--------------------+
|#inflation|1393082618384502784|The next five yea...|
+----------+-------------------+--------------------+

+-----+-------------------+--------------------+--------------------+
|label|           tweet_id|          tweet_text|   tweet_text_tokens|
+-----+-------------------+--------------------+--------------------+
|    2|1393082618384502784|The next five yea...|[the, next, five,...|
+-----+-------------------+--------------------+--------------------+

+----------+
|prediction|
+----------+
|         0|
+----------+

