# Step 1: Producing labelled tweets data

In [1]:
# Parameters passed to producers/csvproducer.py

# Kafka topic that receives the rows
TOPIC = 'labelledtweets'
# URL where the dataset is stored
URL = 'https://drive.google.com/uc?id=1Lf4ievO9SP1DD-KIOXL8pOc36yZ88lqr'
# Place to store the dataset after download
OUTPUT = 'data/labelledtweets.csv'
# Encoder used in the csv
ENCODER = 'ISO-8859-1'
# The csv does not have headers
HEADERS = 'false'
# Column names supplied manually, joined into a single comma-separated string
NAMES = ",".join(['target', 'id', 'date', 'flag', 'user', 'text'])
# Column used as key
IDCOL = "id"
# Delay between messages
DELAY = 0

In [None]:
# Run the CSV producer script: download the file and start producing the data to the topic.
# The parameter variables are interpolated into the command by IPython ($NAME syntax),
# and the script's standard output is discarded (> /dev/null).
# Careful: this takes about 3 hours.
!python producers/csvproducer.py \
                --topic $TOPIC \
                --url $URL \
                --output $OUTPUT \
                --encoder $ENCODER \
                --headers $HEADERS \
                --names $NAMES \
                --idcol $IDCOL \
                --delay $DELAY \
                > /dev/null

# Step 2: Building Spark Streaming DF to create parquet files

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.sql.types import MapType, StringType, IntegerType, ArrayType, StructType, \
                              StructField, LongType, DoubleType, BooleanType, FloatType

In [2]:
# Create (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName('kafka')
    .getOrCreate()
)

In [3]:
# Schema of the labelled tweets data (JSON payload of each Kafka message).
# All fields are nullable so that a missing field does not fail the parse.
schema_tweet = StructType([
    StructField("target", IntegerType(),  True),
    # Tweet ids are 64-bit values and can exceed the 32-bit IntegerType range;
    # an overflowing id makes from_json treat the record as malformed, so the
    # id is parsed as a long instead.
    StructField("id", LongType(),  True),
    StructField("date", StringType(),  True),
    StructField("flag", StringType(),  True),
    StructField("user", StringType(),  True),
    StructField("text", StringType(), True)
    ]
)

In [4]:
# Streaming DataFrame over the Kafka topic, reading from the earliest offset
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("subscribe", "labelledtweets")
    .option("startingOffsets","earliest")
    .option("includeHeaders", "true")
    .load()
)

In [5]:
# Transform data of the topic:
# cast the Kafka key/value to strings, parse the JSON value with the tweet
# schema, flatten the parsed struct and keep only the label and the text.
kafka_strings = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
parsed_tweets = kafka_strings.withColumn("value", F.from_json("value", schema_tweet))
ds = parsed_tweets.select(F.col('value.*')).select('target', 'text')

In [6]:
# Start a streaming query that appends the parsed tweets to parquet files.
# NOTE(review): no visible cell waits for or stops this query before Step 3
# reads data/parquet/ - confirm that is intended.
output_ds = (
    ds.writeStream
    .format("parquet")
    .option("path", "data/parquet/")
    .option("checkpointLocation", "checkpoint/data")
    .outputMode("Append")
    .start()
)

# Step 3: Training the model

In [8]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import NGram

from pyspark.ml.tuning import TrainValidationSplit

from pyspark.ml import Pipeline, PipelineModel

from pyspark.ml.classification import NaiveBayes

from pyspark.ml.evaluation import BinaryClassificationEvaluator

import cld3

In [9]:
## Session options, all switched on:
##  - eagerEval: always show the results of DataFrames and improve the formatting of the output
##  - arrow: allow future conversion of Spark DataFrame into Pandas DataFrame
for conf_key in ("spark.sql.repl.eagerEval.enabled",
                 "spark.sql.execution.arrow.enabled"):
    spark.conf.set(conf_key, True)

##  UDFs

### Remove words that start with a given character

In [10]:
def remove_start(text, start_chr):
    '''
    It returns the string but removing words that start with start_chr

    Words are the whitespace-separated chunks of text; the kept words are
    joined back with single spaces.

    text: string to clean. None is returned unchanged so that null rows do
          not crash the UDF built on top of this function.
    start_chr: prefix that marks a word for removal (one or more characters).
               When it is empty or None no word is removed.
    '''
    if text is None:
        return None
    words = text.split()
    if not start_chr:
        # Nothing to match against: only the whitespace gets normalised
        return " ".join(words)
    return " ".join(word for word in words if not word.startswith(start_chr))

In [11]:
# Spark UDF wrapping remove_start (takes the text and the prefix, returns a string)
UDF_remove_start = F.udf(remove_start, StringType())

### Detect language

In [12]:
def detect_lang(text):
    '''
    It returns the language of the text if possible

    text: string whose language is guessed with cld3.
    Returns the first element of the cld3 prediction, or None when the text
    is empty/None or when no prediction could be obtained.
    '''
    if not text:
        # Nothing to analyse
        return None
    try:
        return cld3.get_language(text)[0]
    except Exception:
        # Best effort: any detection failure (including a missing prediction)
        # yields None, but KeyboardInterrupt/SystemExit are no longer swallowed
        return None

In [13]:
# Spark UDF wrapping detect_lang (takes the text, returns a string)
UDF_detect_lang = F.udf(detect_lang, StringType())

## Cleaning text

In [14]:
# Explicit schema for the parquet files (label and tweet text, both nullable)
parquet_fields = [
    StructField("target", IntegerType(), True),
    StructField("text", StringType(), True),
]
parquet_schema = StructType(parquet_fields)

In [15]:
# Cleaning steps:
# 1) Read the tweets data from the parquet files
# 2) Drop rows with a missing label or text (both columns are nullable and
#    the Python UDFs below call string methods on the text)
# 3) Remove words that start with @
# 4) Remove words that start with #
# 5) Convert to lowercase
# 6) Add language column
# 7) Keep only English tweets
# 8) Remove lang column
# 9) Remove numbers
# 10) Remove symbols
# 11) Keep observations with at least one letter

df = (
    spark.read.format('parquet').schema(parquet_schema).load('data/parquet/')
    .dropna(subset=['target', 'text'])
    .withColumn('text', UDF_remove_start(F.col('text'), F.lit('@')))
    .withColumn('text', UDF_remove_start(F.col('text'), F.lit('#')))
    .withColumn('text', F.lower(F.col('text')))
    .withColumn('lang', UDF_detect_lang(F.col('text')))
    .filter(F.col('lang') == 'en')
    .select('target', 'text')
    .withColumn("text", F.regexp_replace(F.col("text"), r'[0-9]', ''))
    .withColumn("text", F.regexp_replace(F.col("text"), r'[$-/:-?{-~!"^_`\[\]]', ''))
    .filter(F.col('text').rlike("^.*[a-zA-Z]+.*$"))
)

In [16]:
# Preview the cleaned tweets (rendered because eager evaluation is enabled on the session)
df

target,text
0,is upset that he ...
0,i dived many time...
0,my whole body fee...
0,no its not behavi...
0,not the whole crew
0,hey long time no ...
0,nope they didnt h...
0,i couldnt bear to...
0,it it counts idk ...
0,i wouldve been th...


## Training

In [308]:
# Splitting data into train (70%) and test (30%) sets with a fixed seed
train, test = df.randomSplit(weights=[0.7, 0.3], seed=12345)

In [309]:
# Stop words: the default English list plus a few extra tokens
# (not used: the stop-word stage of the pipeline is disabled)
stopwords = [
    *StopWordsRemover.loadDefaultStopWords('english'),
    "http", "https", "amp", "rt", "t", "c", "the",
]

In [310]:
# Separating strings into tokens (split on runs of non-word characters)
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W+")

# Stop words (not used)
# stopwordsRemover = StopWordsRemover(inputCol="tokens", outputCol="filtered").setStopWords(stopwords)

# Creating bigrams from the tokens
ngram = NGram(n=2, inputCol="tokens", outputCol="ngrams")

# Bigram counts per tweet: vocabulary capped at 10000 bigrams, each of which
# must appear in at least 6 different tweets (minDF is a document frequency)
countVectors = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize = 10000, minDF=6.0)

# The variable stages stores all the transformations, in execution order
stages = [regexTokenizer, ngram, countVectors]

In [313]:
# Setting the model: multinomial Naive Bayes on the bigram counts
naiveBayes = NaiveBayes(featuresCol="features", labelCol='target', smoothing=1.0, modelType="multinomial")

# Put the classifier at the end of the stages. Any classifier left over from
# a previous run of this cell is dropped first, so re-running the cell never
# stacks a second NaiveBayes stage in the pipeline.
stages = [stage for stage in stages if not isinstance(stage, NaiveBayes)] + [naiveBayes]

# Joining the stages
pipeline = Pipeline(stages=stages)

In [314]:
# Train the model: fit every stage of the pipeline on the training split
model = pipeline.fit(train)

In [321]:
# Save the model. A plain save() raises when the path already exists, which
# breaks every re-run of the notebook, so the previous model is overwritten.
model.write().overwrite().save('models/naivebayes.model')

## Evaluation

In [315]:
# Get predictions: apply the fitted pipeline to the held-out test split
predictions = model.transform(test)

In [316]:
# Evaluate model on the test predictions with a binary classification evaluator.
# NOTE(review): this evaluator is meant for binary labels; confirm that `target`
# is encoded as 0/1 - an AUC close to 0.5 may point to a label-encoding mismatch.
evaluator_auc = BinaryClassificationEvaluator(
    labelCol='target',
    rawPredictionCol='rawPrediction',
)

auc = evaluator_auc.evaluate(predictions)
print("Area under the curve (AUC) on test data = %g" % auc)

Area under the curve (AUC) on test data = 0.570551
