In [1]:
from pyspark.sql import *
import pyspark.sql.functions as F
import pyspark.sql.types as T 
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
get_ipython().run_line_magic('matplotlib', 'inline')

In [2]:
from pyspark.sql import SparkSession
import pyspark
sc = pyspark.SparkContext(appName="ToxicTwitterComments")
spark = pyspark.sql.SQLContext(sc)

# 1. Load data

In [3]:
trainDF = spark.read.csv('train.csv', 
                         header=True, 
                         multiLine=True, 
                         encoding="UTF-8",
                         sep=',',
                         escape='"',
                         inferSchema=True)

In [4]:
trainDF.printSchema()

root
 |-- id: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- toxic: integer (nullable = true)
 |-- severe_toxic: integer (nullable = true)
 |-- obscene: integer (nullable = true)
 |-- threat: integer (nullable = true)
 |-- insult: integer (nullable = true)
 |-- identity_hate: integer (nullable = true)



In [5]:
trainDF.createOrReplaceTempView('train')

In [6]:
spark.sql('''
SELECT * FROM train
limit(10)
''').toPandas()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


# 2. Data Cleaning

In [7]:
def clean(lines):
    lines = lines.lower().translate(str.maketrans('', '', string.punctuation))
    stop_words = list(set(stopwords.words('english')))
    stop_words.remove('not')
    word_tokens = word_tokenize(lines)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    cleaned_line = []
    for w in filtered_sentence:
        word = PorterStemmer().stem(w)
        cleaned_line.append(word)
    return ' '.join(cleaned_line)

In [8]:
type(clean("'Hey man, I'm really not trying to edit war.'"))

str

In [9]:
spark.udf.register("clean", clean, T.StringType())

<function __main__.clean>

In [10]:
df_train=spark.sql("""
SELECT clean(comment_text) cleaned_comment, toxic, severe_toxic,obscene,threat,insult,identity_hate
FROM train
limit 1000
""")

In [11]:
# get a column of array<string> for word2vec
from pyspark.sql.functions import *
df_train2=df_train.withColumn("cleaned_comment2",split(df_train['cleaned_comment'], ' '))

In [12]:
df_train2.show(2)

+--------------------+-----+------------+-------+------+------+-------------+--------------------+
|     cleaned_comment|toxic|severe_toxic|obscene|threat|insult|identity_hate|    cleaned_comment2|
+--------------------+-----+------------+-------+------+------+-------------+--------------------+
|explan edit made ...|    0|           0|      0|     0|     0|            0|[explan, edit, ma...|
|daww match backgr...|    0|           0|      0|     0|     0|            0|[daww, match, bac...|
+--------------------+-----+------------+-------+------+------+-------------+--------------------+
only showing top 2 rows



In [13]:
df_train2.printSchema()

root
 |-- cleaned_comment: string (nullable = true)
 |-- toxic: integer (nullable = true)
 |-- severe_toxic: integer (nullable = true)
 |-- obscene: integer (nullable = true)
 |-- threat: integer (nullable = true)
 |-- insult: integer (nullable = true)
 |-- identity_hate: integer (nullable = true)
 |-- cleaned_comment2: array (nullable = true)
 |    |-- element: string (containsNull = true)



# 3. Feature Engineering

## 3.1 Word2Vec

In [14]:
from pyspark.ml.feature import Word2Vec
word2Vec = Word2Vec(vectorSize=1, seed=42, inputCol="cleaned_comment2", outputCol="features")
model = word2Vec.fit(df_train2)
#model.getVectors().show()
wvfeature = model.transform(df_train2)


In [15]:
wvfeature.show(2)

+--------------------+-----+------------+-------+------+------+-------------+--------------------+--------------------+
|     cleaned_comment|toxic|severe_toxic|obscene|threat|insult|identity_hate|    cleaned_comment2|            features|
+--------------------+-----+------------+-------+------+------+-------------+--------------------+--------------------+
|explan edit made ...|    0|           0|      0|     0|     0|            0|[explan, edit, ma...|[0.16769594788827...|
|daww match backgr...|    0|           0|      0|     0|     0|            0|[daww, match, bac...|[0.06638946570456...|
+--------------------+-----+------------+-------+------+------+-------------+--------------------+--------------------+
only showing top 2 rows



Now we get the Word2Vec features， we can follow the procedure like TF-IDF to build another model and compare the performance. 

## 3.2 TF-IDF

In [16]:
#from pyspark.mllib.feature import HashingTF, IDF
from __future__ import print_function
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
tokenizer = Tokenizer(inputCol="cleaned_comment", outputCol="words")
wordsData = tokenizer.transform(df_train)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select("features").show()


+--------------------+
|            features|
+--------------------+
|(262144,[13426,25...|
|(262144,[21432,72...|
|(262144,[1353,497...|
|(262144,[5173,796...|
|(262144,[19553,33...|
|(262144,[19247,72...|
|(262144,[34343,49...|
|(262144,[87273,96...|
|(262144,[976,9481...|
|(262144,[102641,1...|
|(262144,[4977,588...|
|(262144,[24918,50...|
|(262144,[5145,213...|
|(262144,[5463,636...|
|(262144,[13781,27...|
|(262144,[1353,314...|
|(262144,[70028,87...|
|(262144,[12225,30...|
|(262144,[4977,268...|
|(262144,[5595,138...|
+--------------------+
only showing top 20 rows



# 4. Data Spliting 

In [161]:
class_name="toxic"
table_toxic=rescaledData.select("features",class_name)
# table_toxic=wvfeature.select("features",class_name)
train, test = table_toxic.randomSplit([0.7, 0.3])
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 715
Test Dataset Count: 285


# 5. Modelling (LR)

## 5.1 Train

Adjust weight to balance data

In [162]:
#get weight to adjust unbalanced
def weight_ratio(predictions,class_name):
    pos_lab=predictions[predictions[class_name]==1].count()
    neg_lab=predictions[predictions[class_name]==0].count()
    return neg_lab/pos_lab

In [163]:
ratio=weight_ratio(train,class_name)
train=train.withColumn("weight", F.when(train[class_name]==1,ratio).otherwise(1))

In [164]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol = 'features',weightCol="weight",labelCol = class_name,maxIter=10)
lrModel = lr.fit(train)

## 5.2 Prediction

In [165]:
predictions = lrModel.transform(test)
predictions.select(class_name, 'rawPrediction', 'prediction', 'probability').show(10)

+-----+--------------------+----------+--------------------+
|toxic|       rawPrediction|prediction|         probability|
+-----+--------------------+----------+--------------------+
|    0|[15.4693295751181...|       0.0|[0.99999980868217...|
|    0|[28.4578665260417...|       0.0|[0.99999999999956...|
|    0|[1.08383327509183...|       0.0|[0.74721870939531...|
|    0|[53.0803120305009...|       0.0|[1.0,8.8616253985...|
|    0|[20.8773927680332...|       0.0|[0.99999999914283...|
|    0|[12.8276604647968...|       0.0|[0.99999731455242...|
|    0|[4.42092757086431...|       0.0|[0.98811976213226...|
|    0|[8.79687933254632...|       0.0|[0.99984881866162...|
|    0|[2.08399842937656...|       0.0|[0.88933815493915...|
|    0|[19.5575406115810...|       0.0|[0.99999999679175...|
+-----+--------------------+----------+--------------------+
only showing top 10 rows



In [166]:
predictions=predictions.select(predictions[class_name].alias("label"), 'rawPrediction', 'prediction', 'probability')

In [167]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.905882352941178


In [171]:
def recall_precision(predictions):
    TP=predictions[(predictions.label==1)&(predictions.prediction==1)].count()
    FP=predictions[(predictions.label==0)&(predictions.prediction==1)].count()
    TN=predictions[(predictions.label==0)&(predictions.prediction==0)].count()
    FN=predictions[(predictions.label==1)&(predictions.prediction==0)].count()
    recall=TP/(TP+FN)
    precision=TP/(TP+FP)
    f1=2 * precision * recall / (precision + recall)
    return recall,precision,f1

In [172]:
recall, precision, f1=recall_precision(predictions)
print("Pricision: " + str(precision))
print("Recall: " + str(recall))
print("F1 score: " + str(f1))

Pricision: 0.6818181818181818
Recall: 0.5
F1 score: 0.576923076923077
