In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import re
import gc



%matplotlib inline

In [2]:
DATA_PATH = '../data/'
SUBMISSION_PATH = '../sub/'

In [26]:
df_train = pd.read_csv(DATA_PATH+"train.csv", sep = ",", header=0)
df_test = pd.read_csv(DATA_PATH+"test.csv", sep = ",", header=0)

In [39]:
# load different spellings of bad words in a dictionary
# key: different spelling form, val: root form if any
def bad_word_normalize(file_path):
    bad_word = {}
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip().lower().split(", ")
            if len(line) == 1:
                bad_word[line[0].strip()] = line[0].strip()
            elif len(line) == 2: # different spelling of bad words return to dict
                bad_word[line[0].strip()] = line[1].strip()
            else:
                print("badwords.txt contains error line at:\n {}".format(line))
    return(bad_word)

BAD_WORD_DICT = bad_word_normalize(DATA_PATH+"badwords.txt")

## Feature Engineering
1. **Number of (Unique) Words**
2. **Number of Puncuations**
3. **Number of Sentences**
4. **Number of Cap Words**
5. **Mis-spelling**
6. **Number of Bad Words**

In [59]:
def feature_engineer(df):
    
    # 1. number of words && number of unique words
    df['word_count'] = df['comment_text'].apply(lambda x: len(x.lower().split(' ')))
    df['uni_word_count'] = df['comment_text'].apply(lambda x: len(set(x.lower().split(' '))))

    # 2. number of puncuations: ! or ?
    punc_re = re.compile(r'[?!]+')
    df['punc_counts'] = df['comment_text'].apply(lambda x: len(punc_re.findall(x)))

    # 3. number of sentences
    sentence_re = re.compile(r'[.!?]+[ ]')
    df['sentence_count'] = df['comment_text'].apply(lambda x: len(sentence_re.split(x)))

    # 4. number of Caps
    cap_re = re.compile(r'\b[A-Z]{2,}\b') # all cap words with 2+ chars, (get rid of 'I', 'A')
    df['caps_count'] = df['comment_text'].apply(lambda x: len(cap_re.findall(x)))

    # 5. mis-spelling ?????
    
    # 6. number of bad words
    bad_word_set = set(BAD_WORD_DICT.keys())
    df['badword_count'] = df['comment_text'].apply(lambda x: sum(
        [len(re.findall(r'\b'+badword+r'\b', x.lower())) for badword in bad_word_set]))
    return(df)

In [60]:
df_train_test = pd.concat([df_train.iloc[:,:2], df_test.iloc[:,:2]], axis=0)
df_train_test = feature_engineer(df_train_test)

df_train_test.head(10)

error: nothing to repeat at position 2

In [50]:
re.findall(r'\btest\b', 'test traintest Test'.lower())

['test', 'test']

In [53]:
pattern = 'test'
r'\b'+pattern+r'\b'

'\\btest\\b'

In [58]:
for badword in set(BAD_WORD_DICT.keys()):
    print(badword)

helvete
feg
pecker
biatch
butt-pirate
b!tch
cipa
motha fukker
spierdalaj
fuck
splooge
cunts
fukkah
jism
fuk*
teets
puto
spic
dike*
nigger*
paska*
basterds
penus
knobs
schmuck
s t f u
motha fuker
assh0lez
faget
bollock*
vullva
blow job
blowjob
nigger
h00r
niigr
tits
injun
qweerz
shits
buceta
mother fukker
packy
sh1tter
fanny
felcher
assholes
wop*
qweir
futkretzn
orgasim
muschi
hoorem whore
vulva
polak
sadist
gay
queef*
ekrem*
hore
kraut
kuk
bi+ch
@ss
hells
guiena
faig
slutty
b17ch
cocks
chink
testical
sh#t
chuj
titt*
hui
teez
gays
pussee
basterdz
sh!t
asswipe
prost!tute
penis
ekto
wh00r. whore
dildo
ahole
chraa
jerk-off
sh1tz
picka
fukin
masterbate
wetback*
shyte
sluts
orgasum
titt
b00bs
poontsee
h0r
g00k
queers
gaygirl
cock*
jackoff
dild0s
honkey
yed
azz
8ss
fagit
sphencter
retard
lipshitz
hoer*
assface
mother fukkah
motherfucker
b00b*
cunt
f u c k e r
pr1ck
son-of-a-bitch
fotze
shi+
hoar
screwing
cock-head
japs
fanculo
fux0r
piss*
c0ck
cawk
arschloch
sh1ter
jizz
pussy
peenus
ayir
reck

In [62]:
bad_word_set = set(BAD_WORD_DICT.keys())
df_train.iloc[:5, 1].apply(lambda x: )

error: nothing to repeat at position 2

In [63]:
sum([len(re.findall(r'\b'+badword+r'\b', df_train.iloc[5,1].lower())) for badword in bad_word_set])

error: nothing to repeat at position 2

In [64]:
df_train.iloc[5,1].lower()

'"\n\ncongratulations from me as well, use the tools well. \xa0· talk "'

In [66]:
[re.findall(r'\b'+badword+r'\b', df_train.iloc[5,1].lower()) for badword in bad_word_set]

error: nothing to repeat at position 2

In [71]:
for badword in bad_word_set:
    print(r'\b'+badword+r'\b')
    print(re.findall(r'\b'+badword+r'\b', df_train.iloc[5,1].lower()))

\bhelvete\b
[]
\bfeg\b
[]
\bpecker\b
[]
\bbiatch\b
[]
\bbutt-pirate\b
[]
\bb!tch\b
[]
\bcipa\b
[]
\bmotha fukker\b
[]
\bspierdalaj\b
[]
\bfuck\b
[]
\bsplooge\b
[]
\bcunts\b
[]
\bfukkah\b
[]
\bjism\b
[]
\bfuk*\b
[]
\bteets\b
[]
\bputo\b
[]
\bspic\b
[]
\bdike*\b
[]
\bnigger*\b
[]
\bpaska*\b
[]
\bbasterds\b
[]
\bpenus\b
[]
\bknobs\b
[]
\bschmuck\b
[]
\bs t f u\b
[]
\bmotha fuker\b
[]
\bassh0lez\b
[]
\bfaget\b
[]
\bbollock*\b
[]
\bvullva\b
[]
\bblow job\b
[]
\bblowjob\b
[]
\bnigger\b
[]
\bh00r\b
[]
\bniigr\b
[]
\btits\b
[]
\binjun\b
[]
\bqweerz\b
[]
\bshits\b
[]
\bbuceta\b
[]
\bmother fukker\b
[]
\bpacky\b
[]
\bsh1tter\b
[]
\bfanny\b
[]
\bfelcher\b
[]
\bassholes\b
[]
\bwop*\b
[]
\bqweir\b
[]
\bfutkretzn\b
[]
\borgasim\b
[]
\bmuschi\b
[]
\bhoorem whore\b
[]
\bvulva\b
[]
\bpolak\b
[]
\bsadist\b
[]
\bgay\b
[]
\bqueef*\b
[]
\bekrem*\b
[]
\bhore\b
[]
\bkraut\b
[]
\bkuk\b
[]
\bbi+ch\b
[]
\b@ss\b
[]
\bhells\b
[]
\bguiena\b
[]
\bfaig\b
[]
\bslutty\b
[]
\bb17ch\b
[]
\bcocks\b
[]
\bchink\b
[]
\btest

error: nothing to repeat at position 2