In [20]:
from pyspark.sql import *
import pyspark.sql.functions as F
import pyspark.sql.types as T 
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# Equivalent of the `%matplotlib inline` magic: render figures in the cell output.
get_ipython().run_line_magic('matplotlib', 'inline')

In [23]:
from pyspark.sql import SparkSession
import pyspark

# Build the session through the builder so this cell is safe to re-run:
# getOrCreate() reuses an already-running session instead of raising the
# "multiple SparkContexts" error that constructing SparkContext directly does.
# SparkSession also replaces the deprecated SQLContext entry point while still
# exposing .read, .sql and .udf, which are what the cells below use.
spark = SparkSession.builder.appName("ToxicTwitterComments").getOrCreate()

# Keep `sc` available for any code that needs the underlying SparkContext.
sc = spark.sparkContext

In [24]:
# Load the training set. Comments contain embedded newlines and quotes, so the
# reader is told that records may span lines (multiLine) and that a double
# quote is the escape character inside quoted fields. Column types are inferred.
trainDF = (
    spark.read
    .options(
        header=True,
        multiLine=True,
        encoding="UTF-8",
        sep=',',
        escape='"',
        inferSchema=True,
    )
    .csv('train.csv')
)

In [25]:
# Print the column names and types of the loaded DataFrame.
trainDF.printSchema()

root
 |-- id: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- toxic: integer (nullable = true)
 |-- severe_toxic: integer (nullable = true)
 |-- obscene: integer (nullable = true)
 |-- threat: integer (nullable = true)
 |-- insult: integer (nullable = true)
 |-- identity_hate: integer (nullable = true)



In [26]:
# Expose the DataFrame to Spark SQL under the view name `train`.
trainDF.createOrReplaceTempView('train')

In [27]:
# Preview the first 10 rows of the `train` view as a pandas DataFrame.
spark.sql('''
SELECT * FROM train
limit(10)
''').toPandas()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [28]:
# Holds the (stop-word set, stemmer) pair once it has been built, so the
# per-row UDF calls do not reload the corpus or re-create the stemmer.
_CLEAN_CACHE = {}


def _clean_resources():
    """Return the (stop-word frozenset, PorterStemmer) pair, building it on first use."""
    if 'resources' not in _CLEAN_CACHE:
        stop_words = set(stopwords.words('english'))
        # 'not' is intentionally kept as a token (presumably to preserve negation).
        stop_words.discard('not')
        # Stored as a single tuple so the cache is never observed half-filled.
        _CLEAN_CACHE['resources'] = (frozenset(stop_words), PorterStemmer())
    return _CLEAN_CACHE['resources']


def clean(lines):
    """Normalize a comment into a list of stemmed, stop-word-free tokens.

    Steps: lowercase, strip ASCII punctuation, tokenize with NLTK, drop English
    stop words (except 'not'), then Porter-stem each remaining token.

    Note: punctuation is removed before tokenizing, so contractions collapse
    into a single token (e.g. "I'm" becomes "im").

    Args:
        lines: The raw comment text. May be None or empty (Spark passes None
            for null column values).

    Returns:
        A list of stemmed tokens; an empty list for None or empty input.
    """
    # Guard first: a null comment would otherwise raise AttributeError on
    # .lower() and fail the whole Spark job.
    if not lines:
        return []

    text = lines.lower().translate(str.maketrans('', '', string.punctuation))
    stop_words, stemmer = _clean_resources()
    return [stemmer.stem(w) for w in word_tokenize(text) if w not in stop_words]

In [29]:
# Sanity-check the cleaner on a single sample sentence.
clean("'Hey man, I'm really not trying to edit war.'")

['hey', 'man', 'im', 'realli', 'not', 'tri', 'edit', 'war']

In [30]:
# `clean` returns a list of tokens, so declare the UDF's return type as an
# array of strings. Declaring StringType makes Spark hand back the list's
# string rendering ("[a, b, ...]") instead of a real array column.
spark.udf.register("clean", clean, T.ArrayType(T.StringType()))

<function __main__.clean(lines)>

In [31]:
# Run the registered `clean` UDF over a few comments through Spark SQL
# and show the result as a pandas DataFrame.
spark.sql("""
SELECT clean(comment_text) cleaned_comment 
FROM train
LIMIT 5
""").toPandas()

Unnamed: 0,cleaned_comment
0,"[explan, edit, made, usernam, hardcor, metalli..."
1,"[daww, match, background, colour, im, seemingl..."
2,"[hey, man, im, realli, not, tri, edit, war, gu..."
3,"[cant, make, real, suggest, improv, wonder, se..."
4,"[sir, hero, chanc, rememb, page, that]"


In [None]:
##test