## Data Preprocessing

### Import packages and load data

In [0]:
# Import packages
from pyspark.sql.functions import udf, col, array_contains, concat, lit, monotonically_increasing_id, regexp_replace, lower, explode, split, collect_list, instr
from pyspark.ml.feature import Tokenizer, StopWordsRemover, IDF, Normalizer, CountVectorizer, StringIndexer, IndexToString
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import ArrayType, StringType
from nltk.stem import PorterStemmer
from pyspark.sql.window import Window
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer

# Read the raw posts dataset (Parquet) from the data lake into a Spark DataFrame
posts = (
    spark.read
    .format("parquet")
    .load("/tmp/project/posts")
)

In [0]:
# Preview the raw posts to inspect the available columns before cleaning.
# NOTE(review): `display` is not defined in this notebook — presumably the
# Databricks notebook built-in; confirm the runtime provides it.
display(posts)

In [0]:
# Data cleaning
# Keep only the columns used downstream, and only posts whose Tags value
# contains a "<" (i.e. at least one "<tag>" entry).
df = posts[['id', 'Body', 'Title', 'Tags']]
df = df.filter(instr(col("Tags"), "<") > 0)

# Concatenate title and body to introduce highly relevant features
# NOTE(review): concat() returns null if either Title or Body is null —
# confirm that every tagged post carries both fields.
df = df.withColumn("text", concat(col("Title"), lit(" "), col("Body")))

# Normalise the text:
#   1. Replace every run of non-word characters *and underscores* with a space.
#      Underscores must be handled here, before step 2: "_" is a word character,
#      so splitting "a_b" afterwards would re-introduce one-letter tokens.
#   2. Drop single-character words.
#   3. Collapse repeated whitespace.
#   4. Trim leading/trailing whitespace so the whitespace tokenizer applied
#      later does not emit an empty-string token for texts that start with
#      punctuation.
#   5. Lower-case.
df = df.withColumn("text", regexp_replace("text", r"[\W_]+", " ")) \
     .withColumn("text", regexp_replace("text", r"\b\w\b", "")) \
     .withColumn("text", regexp_replace("text", r"\s+", " ")) \
     .withColumn("text", regexp_replace("text", r"^\s+|\s+$", "")) \
     .withColumn("text", lower("text"))

# Turn "<tag1><tag2>" into a space-separated, lower-cased string
df = df.withColumn("Tags", regexp_replace("Tags", r"[<>]", " ")) \
     .withColumn("Tags", regexp_replace("Tags", r"\s+", " ")) \
     .withColumn("Tags", lower("Tags"))

# One output row per (post, tag) pair
df = df.withColumn("Tags", split(df.Tags, " "))
df = df.select("id", "text", explode("Tags").alias("tags"))

# The split leaves empty strings at the start/end of the array; drop them
df = df[df['tags'] != '']

# data shuffling: redistribute the rows across 10 partitions
df = df.repartition(10)

In [0]:
# Label encoder: map every distinct tag string to a numeric index in "label".
# Rows with a tag the fitted model does not know are skipped.
label_encoder = StringIndexer(
    inputCol="tags",
    outputCol="label",
    handleInvalid="skip",
)
label_model = label_encoder.fit(df)
labels = label_model.transform(df)

# Collect, per post, the list of all label indices attached to it.
# The id column is renamed first so the join below has unambiguous key names.
temp = (
    labels
    .withColumnRenamed("id", "id_dup")
    .groupBy("id_dup")
    .agg(collect_list("label").alias("true_labels"))
)

# Attach the full label list to every (post, tag) row.
true_labels = labels.join(
    temp,
    on=labels["id"] == temp["id_dup"],
    how="left_outer",
)

In [0]:
# Preview the joined frame: one row per (post, tag) pair, with the post's
# collected label list in the "true_labels" column.
display(true_labels)

In [0]:
# Tokenization
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
tokenized = tokenizer.transform(true_labels)

# Removing stop words (StopWordsRemover's default word list)
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="stop_words")
stopword = stopword_remover.transform(tokenized)

# Stemming
# Use stemming because of the computer science jargon. Words can have very specific meanings and usages
def _stem_tokens(words):
    """Return the Porter stem of every token in `words`.

    A single PorterStemmer is created per call (i.e. per row) instead of one
    per word. A null token array is passed through as null.
    """
    if words is None:
        return None
    stem = PorterStemmer().stem
    return [stem(word) for word in words]

stemmer_func = udf(_stem_tokens, ArrayType(StringType()))
stemmed = stopword.withColumn("stemmed", stemmer_func(col("stop_words")))

# Removing additional stop words (highly frequent words based on EDA).
# This remover runs on the "stemmed" column, so the entries are written in
# their stemmed form (e.g. 'tri', 'thank').
custom_stop_words = ['code', 'use', 'pre', 'get', 'want', 'like', 'thank', 'tri', 'work', 'way', 'need']

stopword_remover = StopWordsRemover(inputCol="stemmed", outputCol="filtered", stopWords = custom_stop_words)
filtered = stopword_remover.transform(stemmed)

# Count vectorizer
# There are ~8000 unique words
# NOTE(review): `filtered` holds one row per (post, tag) pair, so a post with
# k tags contributes k times to the term/document counts fitted below —
# confirm this weighting is intended.
cv = CountVectorizer(vocabSize= 8000, inputCol="filtered", outputCol='cv') 
cv_model = cv.fit(filtered)
text_cv = cv_model.transform(filtered)

# TF-IDF
# minDocFreq=2: terms that occur in fewer than 2 rows get an IDF weight of 0
# (they stay in the vector but no longer contribute)
idf = IDF(inputCol='cv', outputCol="features", minDocFreq = 2)
idf_model = idf.fit(text_cv)
text_idf = idf_model.transform(text_cv)

In [0]:
# Persist the fully preprocessed frame as Parquet so the next phase can start
# from it; any previous output at this path is replaced.
text_idf.write.parquet("/tmp/project/preprocessed_data", mode="overwrite")

In [0]:
# Save preprocessing models
# overwrite() replaces any model already stored at the path; without it the
# save raises when the path exists, so the notebook could not be re-run.
# This matches the overwrite mode used for the preprocessed-data checkpoint.
label_model.write().overwrite().save('/mnt/bd-project/Models/stringindexer')
cv_model.write().overwrite().save('/mnt/bd-project/Models/cv_model')
idf_model.write().overwrite().save("/mnt/bd-project/Models/tfidf")
