# Profanity in Quora questions
This notebook investigates the use and distribution of profanity (offensive language) among the Quora dataset of questions. Perhaps detection of profaniity in a question could help determine whether the question is insincere. We will use the full list of words blacklisted by Google, found [here](https://www.freewebheaders.com/full-list-of-bad-words-banned-by-google/).


## Import packages and data

In [13]:
# Import packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import pickle
from nltk.tokenize import word_tokenize
from tqdm.auto import tqdm

In [14]:
# Import training and test data from Quora questions dataset
train = pd.read_csv('/Volumes/SDMemory/Datasets/Quora/train.csv')
test = pd.read_csv('/Volumes/SDMemory/Datasets/Quora/test.csv')

# Import list of bad words (banned by Google)
bad_words = pd.read_csv('/Volumes/SDMemory/Datasets/Quora/bad_words.csv', header=None)

In [15]:
# Sample of bad words - some are actually not swear words as such
bad_words.sample(3)

Unnamed: 0,0
1543,tard
968,kike
1006,leather restraint


## Function to detect and output bad words in text

In [16]:
# Returns a list (possibly empty) of bad words in text
def detect_badwords(text):
    # tokenize the text
    tokens = word_tokenize(text)
    bad_found = []
    for word in tokens:
        for bad_word in bad_words[0]:
            if bad_word == word:
                bad_found.append(word)
    return bad_found

In [17]:
# Simple examples
s1 = 'This politician speaks nothing but bullshit, what a twat'
s2 = 'It is a nice day today'
[detect_badwords(s) for s in [s1,s2]]

[['bullshit', 'twat'], []]

In [18]:
# Simple use of tqdm
from time import sleep
for i in tqdm(range(100), desc = 'Running through i'):
    sleep(0.01)




## Apply to the Quora dataset


In [None]:
# Register `pandas.progress_apply' with `tqdm`
tqdm.pandas()
# Run detect_badwords over all entries in the training data
temp_size = 10000
train_bad_words = train['question_text'].progress_apply(detect_badwords)
# Add as a column to the DataFrame
train['bad_words'] = train_bad_words

In [None]:
# Export the new dataframe as a pickle file for import at a later date
train.to_pickle('/Volumes/SDMemory/Datasets/Quora/train_bw.pkl')

In [None]:
# Import training data with bad words detected
train = pd.read_pickle('/Volumes/SDMemory/Datasets/Quora/train_bw.pkl')

In [34]:
train.head()

Unnamed: 0,qid,question_text,target,bad_words
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,[]
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,[]
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,[]
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,[]
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,[]


In [41]:
# Collect all bad words used in the sincere data
bools_bw_nonempty = [train['bad_words'].iloc[i] != [] for i in range(train.shape[0])]
df_bw = train[bools_bw_nonempty]
# Sincere set
df_bw_sincere = train_bw[train_bw['target'] == 0]
# Insincere set
df_bw_insincere = train_bw[train_bw['target'] ==1 ]


In [60]:
df_bw_sincere.head()

Unnamed: 0,qid,question_text,target,bad_words
20,0000dd973dfd35508c16,How I know whether a girl had done sex before ...,0,"[sex, sex]"
48,000209378782897bbd75,What is that movie in which a kid is fooled in...,0,[kill]
58,00029d76717deaff60f6,Someone breaks into your house you shoot and k...,0,[kill]
181,00083b5c34d0b450557f,Can I kill myself now?,0,[kill]
242,000b5eb278abb46ace82,What does it mean that my boyfriend of 3 years...,0,[erection]


In [None]:
# Collect all bad words and their freqencies in the sincere questions
import collections
list_bw_sincere = df_bw_sincere['bad_words'].iloc[:temp_size].tolist()


In [None]:
# Define a function to flatten a list of lists
def flatten(l):
    x = []
    for sublist in l:
        for element in sublist:
            x.append(element)
    return x

# import collections library which has functions to compute frequency of elements
import collections

In [47]:
df_bw_sincere = train[bools_sincere & bools_bw_nonempty]

In [54]:
bw_sincere = set().union(*df_bw_sincere['bad_words'].tolist())
bw_sincere

TypeError: 'float' object is not iterable

In [None]:
temp=train[(train['bad_words'].iloc[i] == [] for i in range(train.shape[0])) and (train['target'] == 1)]

In [None]:
# Number of each type of question
[len(q) for q in [q_sincere, q_insincere]]

In [None]:
temp.iloc[11]

In [None]:
temp['question_text'].iloc[15]

In [None]:
# Extract sincere posts with a non-empty bad_words list
bools_badwords = [train['bad_words'].iloc[i] != [] for i in range(train.shape[0]) ]
sincere_bad = train[(train['target'] == 0) & bools_badwords]

In [None]:
sincere_bad.head()

In [None]:
# Collect all those bad words
sincere_badwords = set().union(*sincere_bad['bad_words'].tolist())
sincere_badwords

In [None]:
# Find sincere Quora post(s) with 'dickhead','bugger'
bools = ['fucking' in sincere_bad['bad_words'].iloc[i] for i in range(sincere_bad.shape[0])]
dh_posts = sincere_bad[bools]

In [None]:
print(len(dh_posts))
dh_posts['question_text'].iloc[2]

In [None]:
# Collect all bad words used in the sincere Quora questions
# Entries with sincere questions
bad_sincere = train[train['target'] == 0]['bad_words']
bad_insincere = train[train['target'] == 1]['bad_words']

In [None]:
bad_sincere_words = set().union(*bad_sincere.tolist())
bad_insincere_words = set().union(*bad_insincere.tolist())

In [None]:
bad_sincere_words

In [None]:
len(bad_sincere_words)

In [None]:
len(bad_insincere_words)