In [1]:
from pyspark.sql import *
import pyspark.sql.functions as F
import pyspark.sql.types as T 
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
get_ipython().run_line_magic('matplotlib', 'inline')

In [None]:
#sc.stop()

In [2]:
from pyspark.sql import SparkSession
import pyspark
sc = pyspark.SparkContext(appName="ToxicTwitterComments")
spark = pyspark.sql.SQLContext(sc)

# 1. Load data

In [57]:
##Modify path to train.csv
trainDF = spark.read.csv('test2/train.csv', 
                         header=True, 
                         multiLine=True, 
                         encoding="UTF-8",
                         sep=',',
                         escape='"',
                         inferSchema=True)

# 1.1 Split data

In [58]:
train, test = trainDF.randomSplit([0.6, 0.4], seed = 2018)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 95557
Test Dataset Count: 64014


In [59]:
train.createOrReplaceTempView('train')
test.createOrReplaceTempView('test')

In [13]:
train.printSchema()

root
 |-- id: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- toxic: integer (nullable = true)
 |-- severe_toxic: integer (nullable = true)
 |-- obscene: integer (nullable = true)
 |-- threat: integer (nullable = true)
 |-- insult: integer (nullable = true)
 |-- identity_hate: integer (nullable = true)



In [15]:
spark.sql('''
SELECT * FROM train
limit(10)
''').toPandas()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
4,00024b59235015f3,Virgin\nMy only warning? You'll block me? Well...,1,0,1,0,1,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,0002bfc2abe2a51f,"""*::::::::I believe that you're confusing """"pr...",0,0,0,0,0,0
8,0002eeaf4c0cdf35,But isnt it against the rules to edit if you a...,0,0,0,0,0,0
9,00030003d620f7a8,"""\nseems about right. nableezy - """,0,0,0,0,0,0


# 2. Data Cleaning

In [16]:
stop_words = list(set(stopwords.words('english')))
stop_words.remove('not')

In [17]:
def clean(lines):
    lines = lines.lower().translate(str.maketrans('', '', string.punctuation))
    lines = lines.split(' ')
    filtered_sentence = [w for w in lines if not w in stop_words]
    cleaned_line = []
    for w in filtered_sentence:
        word = PorterStemmer().stem(w)
        cleaned_line.append(word)
    return ' '.join(cleaned_line)

In [18]:
type(clean("'Hey man, I'm really not trying to edit war.'"))

str

In [19]:
spark.udf.register("clean", clean, T.StringType())

<function __main__.clean(lines)>

In [64]:
#Get rid of limit
df_train=spark.sql("""
SELECT clean(comment_text) cleaned_comment, toxic, severe_toxic,obscene,threat,insult,identity_hate
FROM train
LIMIT 20000
""")

In [65]:
#cleaning test data
df_test=spark.sql("""
SELECT clean(comment_text) cleaned_comment, toxic, severe_toxic,obscene,threat,insult,identity_hate
FROM test
LIMIT 15000
""")

In [66]:
# get a column of array<string> for word2vec
from pyspark.sql.functions import *
df_train2=df_train.withColumn("cleaned_comment2",split(df_train['cleaned_comment'], ' '))
df_test2 = df_test.withColumn("cleaned_comment2",split(df_test['cleaned_comment'], ' '))

In [None]:
df_train2.show(2)

In [None]:
df_train2.printSchema()

# 3. Feature Engineering

## 3.1 Word2Vec

In [67]:
from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline

word2Vec = Word2Vec(vectorSize=30, seed=42, inputCol="cleaned_comment2", outputCol="features")
#model = word2Vec.fit(df_train2)
#model.getVectors().show()
#wvfeature = model.transform(df_train2)


pipeline_w2v = Pipeline(stages=[word2Vec])
pipelineModel_w2v = pipeline_w2v.fit(df_train2)
wvfeature = pipelineModel_w2v.transform(df_train2)
wvfeature_test = pipelineModel_w2v.transform(df_test2)

In [None]:
wvfeature.show(2)

Now we get the Word2Vec features， we can follow the procedure like TF-IDF to build another model and compare the performance. 

## 3.2 TF-IDF

In [None]:
#import sys
#sys.getrecursionlimit()

In [None]:
#sys.setrecursionlimit(30000)

In [68]:
from __future__ import print_function
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

In [69]:
#data processing pipeline
tokenizer = Tokenizer(inputCol="cleaned_comment", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf])
pipelineModel = pipeline.fit(df_train)
rescaledData = pipelineModel.transform(df_train)
rescaledData_test = pipelineModel.transform(df_test)

In [None]:
#Save as Parquet for faster read
#rescaledData.write.parquet("rescaledData.parquet")

In [None]:
#rescaledData = spark.read.parquet("rescaledData.parquet")

# 4. Modelling (LR)

## 4.1 Training, Testing and Evaluation

Adjust weight to balance data

In [None]:
#get weight to adjust unbalanced
def weight_ratio(predictions,class_name):
    pos_lab=predictions[predictions[class_name]==1].count()
    neg_lab=predictions[predictions[class_name]==0].count()
    return neg_lab/pos_lab

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
evaluator = BinaryClassificationEvaluator()

def recall_precision(predictions):
    TP=predictions[(predictions.label==1)&(predictions.prediction==1)].count()
    FP=predictions[(predictions.label==0)&(predictions.prediction==1)].count()
    TN=predictions[(predictions.label==0)&(predictions.prediction==0)].count()
    FN=predictions[(predictions.label==1)&(predictions.prediction==0)].count()
    recall=TP/(TP+FN)
    precision=TP/(TP+FP)
    f1=2 * precision * recall / (precision + recall)
    return recall, precision, f1

In [70]:
class_name = ["toxic", 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [71]:
for name in class_name:
    train=rescaledData.select("features",name)
    test = rescaledData_test.select("features",name)
    ratio=weight_ratio(train,name)
    train=train.withColumn("weight", F.when(train[name]==1,ratio).otherwise(1))
    ##changed maxIter to 5 for faster calculation
    lr = LogisticRegression(featuresCol = 'features',weightCol="weight",labelCol = name, maxIter=10)
    lrModel = lr.fit(train)
    #save model, not yet tested
    #lrModel.save([spark_context], [file_path])
    predictions = lrModel.transform(test)
    #predictions.select(class_name, 'rawPrediction', 'prediction', 'probability').show(10)
    predictions=predictions.select(predictions[name].alias("label"), 'rawPrediction', 'prediction', 'probability')
    print('Classification model for ' + str(name))
    print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
    recall, precision, f1 = recall_precision(predictions)
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("F1 score: " + str(f1))

Classification model for toxic
Test Area Under ROC: 0.8285462173184769
Precision: 0.4675925925925926
Recall: 0.5422818791946309
F1 score: 0.5021752641392169
Classification model for severe_toxic
Test Area Under ROC: 0.8220228136342548
Precision: 0.12786885245901639
Recall: 0.5131578947368421
F1 score: 0.2047244094488189
Classification model for obscene
Test Area Under ROC: 0.8169618887586468
Precision: 0.2989771833202203
Recall: 0.4791929382093317
F1 score: 0.3682170542635658
Classification model for threat
Test Area Under ROC: 0.7855831651632627
Precision: 0.11764705882352941
Recall: 0.18181818181818182
F1 score: 0.14285714285714285
Classification model for insult
Test Area Under ROC: 0.8316064261375564
Precision: 0.3494874184529357
Recall: 0.49407114624505927
F1 score: 0.40938864628820953
Classification model for identity_hate
Test Area Under ROC: 0.8021161983026731
Precision: 0.20100502512562815
Recall: 0.30303030303030304
F1 score: 0.24169184290030213


In [72]:
for name in class_name:
    train=wvfeature.select("features",name)
    test = wvfeature_test.select("features",name)
    ratio=weight_ratio(train,name)
    train=train.withColumn("weight", F.when(train[name]==1,ratio).otherwise(1))
    ##changed maxIter to 5 for faster calculation
    lr = LogisticRegression(featuresCol = 'features',weightCol="weight",labelCol = name,maxIter=10)
    lrModel = lr.fit(train)
    #save model, not yet tested
    #lrModel.save([spark_context], [file_path])
    predictions = lrModel.transform(test)
    #predictions.select(class_name, 'rawPrediction', 'prediction', 'probability').show(10)
    predictions=predictions.select(predictions[name].alias("label"), 'rawPrediction', 'prediction', 'probability')
    print('Classification model for ' + str(name))
    print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
    recall, precision, f1 = recall_precision(predictions)
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("F1 score: " + str(f1))

Classification model for toxic
Test Area Under ROC: 0.9232691170845194
Precision: 0.4505791505791506
Recall: 0.7832214765100671
F1 score: 0.5720588235294117
Classification model for severe_toxic
Test Area Under ROC: 0.9746527974704784
Precision: 0.1499460625674218
Recall: 0.9144736842105263
F1 score: 0.25764596848934196
Classification model for obscene
Test Area Under ROC: 0.9540816557492106
Precision: 0.38648180242634317
Recall: 0.8436317780580076
F1 score: 0.5301109350237718
Classification model for threat
Test Area Under ROC: 0.9215167521699208
Precision: 0.02328508495909377
Recall: 0.8409090909090909
F1 score: 0.0453153704837722
Classification model for insult
Test Area Under ROC: 0.9420444819690497
Precision: 0.33496999454446263
Recall: 0.8089591567852438
F1 score: 0.4737654320987654
Classification model for identity_hate
Test Area Under ROC: 0.9339312210076061
Precision: 0.06905370843989769
Recall: 0.8181818181818182
F1 score: 0.12735849056603774
