https://github.com/tianqwang/Toxic-Comment-Classification-Challenge/blob/master/Toxic_Comment_Classification.ipynb
https://github.com/katwegner/DeepToxic/blob/master/clean_data.ipynb

In [10]:
import pandas as pd
import numpy as np
import re
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import fbeta_score
from statistics import mean
from sklearn.metrics import hamming_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

from sklearn.metrics import roc_auc_score, confusion_matrix
import statistics
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from wordcloud import WordCloud
from collections import Counter

from sklearn.pipeline import Pipeline

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
#import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
from utils import *
%matplotlib inline

In [11]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True,
                            reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    # remove stop words and stemm
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

In [12]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
WNlemma = nltk.WordNetLemmatizer()
stemmer = nltk.PorterStemmer()




[nltk_data] Downloading package wordnet to /Users/kw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/kw/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [13]:
train = pd.read_csv("../data/train_wikipedia_pre_clean.csv")
test = pd.read_csv("../data/test_wikipedia_pre_clean.csv")
test_y = pd.read_csv("../data/test_labels_wikipedia_pre_clean.csv")


In [14]:
stopwords_english = stopwords.words('english')


## Additional Cleaning

Get rid of nontext

In [15]:
train["comment_text"]  = train["comment_text"] .apply(lambda x: x.encode("latin-1","ignore").decode('ISO-8859-1'))
test["comment_text"]  = test["comment_text"] .apply(lambda x: x.encode("latin-1","ignore").decode('ISO-8859-1'))


Get rid of numbers

In [16]:
train["comment_text"]  = train["comment_text"] .apply(lambda x: x.encode("ascii","ignore").decode('ISO-8859-1'))
test["comment_text"]  = test["comment_text"] .apply(lambda x: x.encode("ascii","ignore").decode('ISO-8859-1'))


remove stock market tickers like $GE


In [17]:
train["comment_text"]  = train["comment_text"] .apply(lambda x: re.sub(r'\$\w*', '', x))
test["comment_text"]  = test["comment_text"] .apply(lambda x: re.sub(r'\$\w*', '', x))

Remove Hashtags

In [18]:
train["comment_text"]  = train["comment_text"] .apply(lambda x: re.sub(r'#', '', x))
test["comment_text"]  = test["comment_text"] .apply(lambda x: re.sub(r'#', '', x))


# Preprocessing
### Tokenizer
    * nltk.tokenize.TweetTokenizer
    * Twikenizer
    * Twokenizer by ARK at CMU
    * twokenize.


In [20]:
X = train["comment_text"]
Y = train['toxic']
X_test = test["comment_text"]
#Y_test = test['toxic']

Tweettokenizer

In [21]:
# create a function for the tweet tokenizer from NLTK
def tweettokenizer(text):
    tt = TweetTokenizer(preserve_case=True, strip_handles=True,
                            reduce_len=True)
    return tt.tokenize(text)
 

twikenizer

In [71]:
#import twikenizer as twk
#def twi(text):
#    twk = twk.Twikenizer()
#    return twk.tokenize(text)


Twokenizer

In [72]:
#def two(text):


In [102]:
def nlp_preprocess(X, tokenizer, vectorizer, min_df=1, stop_words={}, ngram=1, lowercase=True, test=0):
    print('Vectorizing...')
    if vectorizer == 'count':
        analyzer = CountVectorizer(min_df=min_df, 
                            ngram_range=(1,ngram), stop_words=stop_words, 
                            tokenizer=tokenizer, 
                            lowercase=lowercase).build_analyzer()
        vect = CountVectorizer(analyzer=analyzer)
    elif vectorizer == 'tfidf':
        analyzer = TfidfVectorizer(min_df=min_df,
                        ngram_range=(1,ngram), stop_words=stop_words, 
                        tokenizer=tweettokenizer, 
                        lowercase=lowercase).build_analyzer()        
        vect = TfidfVectorizer(analyzer=vectorizer)
    print('Done!')
    print(' ')
    print('Creating X...')
    if test == 0:
        X_vect_= vect.fit_transform(X)
    else: 
        X_vect_= vect.transform(X)
    print('Done!')
    return X_vect

def fit_nlp(X, Y, tokenizer, vectorizer, min_df=1, stop_words={}, ngram=1, lowercase=True):
    X_vect = nlp_preprocess(X, tokenizer, vectorizer, min_df=min_df, stop_words=stop_words, ngram=ngram, lowercase=lowercase)
    
    # Creating classifiers with default parameters initially.
    clf1 = MultinomialNB()
    clf2 = LogisticRegression()
    clf3 = LinearSVC()
    # Calculating the cross validation F1 and Recall score for our 3 baseline models.
    methods1_cv = pd.DataFrame(cross_validation_score(clf1, X_vect, Y))
    methods2_cv = pd.DataFrame(cross_validation_score(clf2, X_vect, Y))
    #methods3_cv = pd.DataFrame(cross_validation_score(clf3, X_vect, Y))

    # Creating a dataframe to show summary of results.
    print('Used Vectorizer: ' + tokenizer + ' ' + vectorizer)
    print(' ')
    print(f"Used parameters: Minimim Occurrence: {min_df}, stop words: {stop_words}, ngram: {ngram}, lower case: {lowercase}") 
    methods_cv = pd.concat([methods1_cv, methods2_cv])
    methods_cv.columns = ['Model', 'Recall', 'F1', 'ROC']
    meth_cv = methods_cv.reset_index()
    meth_cv[['Model', 'Recall', 'F1', 'ROC']]


In [25]:
# parameter
from nltk.tokenize import TweetTokenizer

min_df = 20 # 1
stop_words = 'english' # {}
ngram = 1 # 1, 2
lowercase = True # False
vectorizer = 'lemmatize_word' # 'stem_word'
#fit_nlp(X, Y, 'count', vectorizer, min_df=1, stop_words={}, ngram=1, lowercase=True)

### Vectorizer

#### CountVectorizer

In [27]:
# Fit the CountVectorizer to the training data
analyzer = CountVectorizer(min_df=min_df, 
                        ngram_range=(1,ngram), stop_words=stop_words, 
                        tokenizer=tweettokenizer, 
                        lowercase=False).build_analyzer()
# analyzer = TfidfVectorizer().build_analyzer()
def lemmatize_word(doc):
    return (WNlemma.lemmatize(t) for t in analyzer(doc))

def stemmer_word(doc):
    return (stemmer.stem(t) for t in analyzer(doc))
# transform the documents in the training data to a document-term matrix
# X_train_cv = cv.transform(X)
# print()
# print('CV Tokenized string:')
# print(X_train_cv[1])
lemm_vectorizer = CountVectorizer(analyzer=lemmatize_word)
X_train_cv_lemm = lemm_vectorizer.fit_transform(X[:100])
#X_test_cv_lemm = lemm_vectorizer.fit_transform(X_test)

#stemmer_vectorizer = CountVectorizer(analyzer=stemmer_word)
#X_train_cv_stem = stemmer_vectorizer.fit_transform(X)



In [28]:
print(X_train_cv_lemm)

  (0, 234)	1
  (0, 601)	1
  (0, 1016)	1
  (0, 1866)	1
  (0, 296)	1
  (0, 395)	1
  (0, 244)	1
  (0, 1611)	1
  (0, 85)	1
  (0, 554)	1
  (0, 1872)	1
  (0, 5)	1
  (0, 1271)	1
  (0, 843)	1
  (0, 271)	1
  (0, 314)	2
  (0, 1883)	1
  (0, 416)	1
  (0, 617)	1
  (0, 214)	1
  (0, 236)	1
  (0, 8)	3
  (0, 111)	1
  (0, 1587)	1
  (0, 1802)	1
  :	:
  (98, 1824)	1
  (98, 163)	1
  (98, 38)	1
  (99, 1866)	2
  (99, 85)	1
  (99, 5)	1
  (99, 314)	1
  (99, 8)	2
  (99, 111)	1
  (99, 2)	1
  (99, 3)	1
  (99, 618)	1
  (99, 1515)	1
  (99, 628)	1
  (99, 1618)	1
  (99, 15)	1
  (99, 1645)	1
  (99, 1635)	1
  (99, 1923)	1
  (99, 1787)	1
  (99, 350)	1
  (99, 992)	1
  (99, 1214)	1
  (99, 1078)	1
  (99, 1791)	1


TFIDF

In [29]:
analyzer = TfidfVectorizer(min_df=20,
                        ngram_range=(1,ngram), stop_words=stop_words, 
                        tokenizer=tweettokenizer, 
                        lowercase=False).build_analyzer()   
def lemmatize_word(doc):
    return (WNlemma.lemmatize(t) for t in analyzer(doc))

def stemmer_word(doc):
    return (stemmer.stem(t) for t in analyzer(doc))     
lemm_vectorizer = TfidfVectorizer(analyzer=lemmatize_word)
X_train_tf_lemm = lemm_vectorizer.fit_transform(X[:100])
print()
print('Tfidf Tokenized string:')
print(X_train_tf_lemm[1])


#stemmer_vectorizer = TfidfVectorizer(analyzer=stemmer_word)
#X_train_tf_stem = stemmer_vectorizer.fit_transform(X)
#print()
#print('Tfidf Tokenized string:')
#print(X_train_tf_stem[1])



Tfidf Tokenized string:
  (0, 571)	0.20056983923748828
  (0, 52)	0.2690495506761683
  (0, 20)	0.2690495506761683
  (0, 338)	0.2690495506761683
  (0, 53)	0.2690495506761683
  (0, 3)	0.23161924080269516
  (0, 2)	0.2349833796844818
  (0, 547)	0.16673133392805883
  (0, 1746)	0.2468856343568275
  (0, 1649)	0.2690495506761683
  (0, 849)	0.2690495506761683
  (0, 744)	0.2690495506761683
  (0, 1347)	0.23116008546768646
  (0, 299)	0.20056983923748828
  (0, 0)	0.15538115505072275
  (0, 190)	0.2690495506761683
  (0, 1790)	0.14051645770966564
  (0, 8)	0.13078436229586188
  (0, 314)	0.09095240351109517
  (0, 5)	0.14186481364233


Here:  merge lemmatizer/stemmer into vectorizer

# Test Performance

In [97]:
# Creating classifiers with default parameters initially.
clf1 = MultinomialNB()
clf2 = LogisticRegression()
clf3 = LinearSVC()

In [98]:
def cross_validation_score(classifier, X_train, y_train):
    '''
    Iterate though each label and return the cross validation F1 and Recall score 
    '''
    methods = []
    name = classifier.__class__.__name__.split('.')[-1]

    
    recall = cross_val_score(
        classifier, X_train, y_train, cv=10, scoring='recall')
    f1 = cross_val_score(classifier, X_train,
                        y_train, cv=10, scoring='f1')
    roc = cross_val_score(classifier, X_train,
                        y_train, cv=10, scoring='roc_auc')
    methods.append([name, recall.mean(), f1.mean(), roc.mean()])

    return methods

## Model Fitting with train data

CountVectorizer with Lemma

In [109]:
# Calculating the cross validation F1 and Recall score for our 3 baseline models.
methods1_cv = pd.DataFrame(cross_validation_score(clf1, X_train_tf_lemm, Y))
methods2_cv = pd.DataFrame(cross_validation_score(clf2, X_train_tf_lemm, Y))
#methods3_cv = pd.DataFrame(cross_validation_score(clf3, X_train_cv, Y))

# Creating a dataframe to show summary of results.
print('Used Vectorizer: CountVectorizer Lemmatization')
print(' ')
print(f"Used parameters: ") 
methods_cv = pd.concat([methods1_cv, methods2_cv])
methods_cv.columns = ['Model', 'Recall', 'F1', 'ROC']
meth_cv = methods_cv.reset_index()
meth_cv[['Model', 'Recall', 'F1', 'ROC']]


Used Vectorizer: CountVectorizer Lemmatization
 
Used parameters: 


Unnamed: 0,Model,Recall,F1,ROC
0,MultinomialNB,0.168397,0.276981,0.864504
1,LogisticRegression,0.562334,0.68394,0.966155


In [79]:
# Calculating the cross validation F1 and Recall score for our 3 baseline models.
methods1_cv = pd.DataFrame(cross_validation_score(clf1, X_train_cv_stem, Y))
methods2_cv = pd.DataFrame(cross_validation_score(clf2, X_train_cv_stem, Y))
#methods3_cv = pd.DataFrame(cross_validation_score(clf3, X_train_cv, Y))
# Creating a dataframe to show summary of results.
print('Used Vectorizer: CountVectorizer Stemming')
methods_cv = pd.concat([methods1_cv, methods2_cv])
methods_cv.columns = ['Model', 'Recall', 'F1', 'ROC']
meth_cv = methods_cv.reset_index()
meth_cv[['Model', 'Recall', 'F1', 'ROC']]

Used Vectorizer: CountVectorizer Stemming


Unnamed: 0,Model,Recall,F1,ROC
0,MultinomialNB,0.62252,0.627507,0.891245
1,LogisticRegression,0.623176,0.704004,0.943985


TfIDF

In [80]:
# Calculating the cross validation F1 and Recall score for our 3 baseline models.
methods1_tf = pd.DataFrame(cross_validation_score(clf1, X_train_tf_lemm, Y))
methods2_tf = pd.DataFrame(cross_validation_score(clf2, X_train_tf_lemm, Y))
#methods3_cv = pd.DataFrame(cross_validation_score(clf3, X_train_cv, Y))

# Creating a dataframe to show summary of results.
print('Used Vectorizer: tfidf Lemmatization')
methods_tf = pd.concat([methods1_tf, methods2_tf])
methods_tf.columns = ['Model', 'Recall', 'F1', 'ROC']
meth_tf = methods_tf.reset_index()
meth_tf[['Model', 'Recall', 'F1', 'ROC']]

Used Vectorizer: tfidf Lemmatization


Unnamed: 0,Model,Recall,F1,ROC
0,MultinomialNB,0.168397,0.276981,0.864504
1,LogisticRegression,0.562334,0.68394,0.966155


In [81]:
# Calculating the cross validation F1 and Recall score for our 3 baseline models.
methods1_tf = pd.DataFrame(cross_validation_score(clf1, X_train_tf_stem, Y))
methods2_tf = pd.DataFrame(cross_validation_score(clf2, X_train_tf_stem, Y))
#methods3_cv = pd.DataFrame(cross_validation_score(clf3, X_train_cv, Y))

# Creating a dataframe to show summary of results.
print('Used Vectorizer: tfidf Stemm')
methods_tf = pd.concat([methods1_tf, methods2_tf])
methods_tf.columns = ['Model', 'Recall', 'F1', 'ROC']
meth_tf = methods_tf.reset_index()
meth_tf[['Model', 'Recall', 'F1', 'ROC']]

Used Vectorizer: tfidf Stemm


Unnamed: 0,Model,Recall,F1,ROC
0,MultinomialNB,0.105966,0.187853,0.860785
1,LogisticRegression,0.593806,0.703538,0.969227


## Model Evaluation with test data

In [82]:
test_labels = ["toxic"]

In [83]:
def score(classifier, X_train, y_train, X_test, y_test):
    """
    Calculate Hamming-loss, F1, Recall for each label on test dataset.
    """
    methods = []
    hloss = []
    name = classifier.__class__.__name__.split('.')[-1]
    predict_df = pd.DataFrame()
    predict_df['id'] = test_y['id']

    for label in test_labels:
        classifier.fit(X_train, y_train[label])
        predicted = classifier.predict(X_test)

        predict_df[label] = predicted

        recall = recall_score(y_test[y_test[label] != -1][label],
                              predicted[y_test[label] != -1],
                              average="weighted")
        f1 = f1_score(y_test[y_test[label] != -1][label],
                      predicted[y_test[label] != -1],
                      average="weighted")
        roc_auc = roc_auc_score(y_test[y_test[label] != -1][label],
                      predicted[y_test[label] != -1],
                      average="weighted")
        conf_mat = confusion_matrix(y_test[y_test[label] != -1][label],
                                    predicted[y_test[label] != -1])

        methods.append([name, label, recall, f1, conf_mat])



    return methods

In [84]:
# Calculating tF1 and Recall score for our 3 baseline models.
methods1 = score(clf1, X_train, train, X_test, test_y)
methods2 = score(clf2, X_train, train, X_test, test_y)
methods3 = score(clf3, X_train, train, X_test, test_y)

NameError: name 'X_train' is not defined

In [None]:
# Creating a dataframe to show summary of results.
methods1 = pd.DataFrame(methods1)
methods2 = pd.DataFrame(methods2)
methods3 = pd.DataFrame(methods3)
methods = pd.concat([methods1, methods2, methods3])
methods.columns = ['Model', 'Label', 'Recall', 'F1', 'Confusion_Matrix']
meth = methods.reset_index()
meth[['Model', 'Label', 'Recall', 'F1']]

Unnamed: 0,Model,Label,Recall,F1
0,MultinomialNB,toxic,0.938538,0.932841
1,LogisticRegression,toxic,0.948855,0.947624
2,LinearSVC,toxic,0.959484,0.959611


In [93]:
# interactive visual : enter the label name.
def W_Cloud():
    """
    Visualize the most common words contributing to the token.
    """
    threat_context = train[train['toxic'] == 1]
    threat_text = threat_context.comment_text
    neg_text = pd.Series(threat_text).str.cat(sep=' ')
    wordcloud = WordCloud(width=1600, height=800,
                          max_font_size=200).generate(neg_text)

    plt.figure(figsize=(15, 10))
    plt.imshow(wordcloud.recolor(colormap="Blues"), interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Most common words associated with toxic comment", size=20)
    plt.show()
token = 'toxic'
W_Cloud()

AttributeError: 'Series' object has no attribute 'lower'

In [91]:
threat_context = train[train['toxic'] == 1]
threat_text = threat_context.comment_text
neg_text = pd.Series(threat_text).str.cat(sep=' ')
neg_text

