In [1]:
#import required packages
#basic
import pandas as pd 
import numpy as np

#misc
import gc
import time
import warnings
#viz
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
import pyLDAvis.gensim
#nlp
import string
import re     #for regex
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary


#Modeling
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from scipy import sparse

#settings
start_time=time.time()
color = sns.color_palette()
sns.set_style("dark")

#constants
eng_stopwords = set(stopwords.words("english"))
#settings
warnings.filterwarnings("ignore")
lem = WordNetLemmatizer()
tokenizer=ToktokTokenizer()


%matplotlib inline

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
start_time=time.time()
#importing the dataset
train=pd.read_csv("/Users/karen/Desktop/datamining/cw1/jigsaw-toxic-comment-classification-challenge/train.csv")
test=pd.read_csv("/Users/karen/Desktop/datamining/cw1/jigsaw-toxic-comment-classification-challenge/test.csv")
end_import=time.time()
print("Time till import:",end_import-start_time,"s")

Time till import: 1.6451141834259033 s


In [3]:
#to seperate sentenses into words
def preprocess(comment):
    """
    Function to build tokenized texts from input comment
    """
    return gensim.utils.simple_preprocess(comment, deacc=True, min_len=3)

In [4]:
#tokenize the comments
train_text=train.comment_text.apply(lambda x: preprocess(x))
test_text=test.comment_text.apply(lambda x: preprocess(x))
all_text=train_text.append(test_text)
end_preprocess=time.time()
print("Time till pre-process:",end_preprocess-start_time,"s")

Time till pre-process: 57.17487096786499 s


In [5]:
#checks
#print("Total number of comments:",len(all_text))
#print("Before preprocessing:",train.comment_text.iloc[30])
print(all_text)

0         [explanation, why, the, edits, made, under, us...
1         [aww, matches, this, background, colour, seemi...
2         [hey, man, really, not, trying, edit, war, jus...
3         [more, can, make, any, real, suggestions, impr...
4         [you, sir, are, hero, any, chance, you, rememb...
5         [congratulations, from, well, use, the, tools,...
6             [cocksucker, before, you, piss, around, work]
7         [your, vandalism, the, matt, shirvington, arti...
8         [sorry, the, word, nonsense, was, offensive, y...
9         [alignment, this, subject, and, which, are, co...
10        [fair, use, rationale, for, image, wonju, jpg,...
11        [bbq, man, and, lets, discuss, maybe, over, th...
12        [hey, what, talk, what, exclusive, group, some...
13        [before, you, start, throwing, accusations, an...
14        [and, the, girl, above, started, her, argument...
15        [juelz, santanas, age, juelz, santana, was, ye...
16        [bye, don, look, come, think, 

In [6]:
#Phrases help us group together bigrams :  new + york --> new_york
bigram = gensim.models.Phrases(all_text)

In [7]:
#check bigram collation functionality 
bigram[all_text.iloc[30]]

['how',
 'could',
 'post',
 'before',
 'the',
 'block_expires',
 'the',
 'funny_thing',
 'you',
 'think',
 'being_uncivil']

In [8]:
def clean(word_list):
    """
    Function to clean the pre-processed word lists 
    
    Following transformations will be done
    1) Stop words removal from the nltk stopword list
    2) Bigram collation (Finding common bigrams and grouping them together using gensim.models.phrases)
    3) Lemmatization (Converting word to its root form : babies --> baby ; children --> child)
    """
    #remove stop words
    clean_words = [w for w in word_list if not w in eng_stopwords]
    #collect bigrams
    clean_words = bigram[clean_words]
    #Lemmatize
    clean_words=[lem.lemmatize(word, "v") for word in clean_words]
    return(clean_words)    

In [9]:
#check clean function
import nltk
nltk.download('wordnet')
print("Before clean:",all_text.iloc[1])
print("After clean:",clean(all_text.iloc[1]))
print(all_text)

[nltk_data] Downloading package wordnet to /Users/karen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Before clean: ['aww', 'matches', 'this', 'background', 'colour', 'seemingly', 'stuck', 'with', 'thanks', 'talk', 'january', 'utc']
After clean: ['aww', 'match', 'background', 'colour', 'seemingly', 'stick', 'thank', 'talk', 'january_utc']
0         [explanation, why, the, edits, made, under, us...
1         [aww, matches, this, background, colour, seemi...
2         [hey, man, really, not, trying, edit, war, jus...
3         [more, can, make, any, real, suggestions, impr...
4         [you, sir, are, hero, any, chance, you, rememb...
5         [congratulations, from, well, use, the, tools,...
6             [cocksucker, before, you, piss, around, work]
7         [your, vandalism, the, matt, shirvington, arti...
8         [sorry, the, word, nonsense, was, offensive, y...
9         [alignment, this, subject, and, which, are, co...
10        [fair, use, rationale, for, i

In [10]:
#scale it to all text
all_text=all_text.apply(lambda x:clean(x))
end_clean=time.time()
#print("Time till cleaning corpus:",end_clean-start_time,"s")
print(type(all_text))

<class 'pandas.core.series.Series'>


In [11]:
#create the dictionary
dictionary = Dictionary(all_text)
print(dictionary)
#print("There are",len(dictionary),"number of words in the final dictionary")

Dictionary(322843 unique tokens: ['closure', 'dolls', 'edit', 'explanation', 'fac']...)


In [12]:
#convert into lookup tuples within the dictionary using doc2bow
print(dictionary.doc2bow(all_text.iloc[1]))
print("Wordlist from the sentence:",all_text.iloc[1])
#to check
print("Wordlist from the dictionary lookup:", 
      dictionary[21],dictionary[22],dictionary[23],dictionary[24],dictionary[25],dictionary[26],dictionary[27])


[(21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]
Wordlist from the sentence: ['aww', 'match', 'background', 'colour', 'seemingly', 'stick', 'thank', 'talk', 'january_utc']
Wordlist from the dictionary lookup: aww background colour january_utc match seemingly stick


In [13]:
#scale it to all text
corpus = [dictionary.doc2bow(text) for text in all_text]
end_corpus=time.time()
print("Time till corpus creation:",end_clean-start_time,"s")

Time till corpus creation: 176.90189003944397 s


In [14]:
#create the LDA model
ldamodel = LdaModel(corpus=corpus, num_topics=15, id2word=dictionary)
end_lda=time.time()
print("Time till LDA model creation:",end_lda-start_time,"s")

Time till LDA model creation: 360.1368000507355 s


In [15]:
#pyLDAvis.enable_notebook()

In [16]:
#pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

In [17]:
#end_viz=time.time()
#print("Time till viz:",end_viz-start_time,"s")

In [18]:
#creating the topic probability matrix 
topic_probability_mat = ldamodel[corpus]

In [19]:
#split it to test and train
train_matrix=topic_probability_mat[:train.shape[0]]
test_matrix=topic_probability_mat[train.shape[0]:]

In [20]:
del(topic_probability_mat)
del(corpus)
del(all_text)
gc.collect()

0

In [21]:
#convert to sparse format (Csr matrix)
train_sparse=gensim.matutils.corpus2csc(train_matrix)
test_sparse=gensim.matutils.corpus2csc(test_matrix)
end_time=time.time()
print("total time till Sparse mat creation",end_time-start_time,"s")

total time till Sparse mat creation 463.2603120803833 s


In [26]:
#custom NB model
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self
    

model = NbSvmClassifier(C=2, dual=True, n_jobs=-1)

In [23]:
#set the target columns
target_x=train_sparse.transpose()
TARGET_COLS=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
target_y=train[TARGET_COLS]

In [24]:
del(train_sparse)
gc.collect()

0

In [25]:
model = NbSvmClassifier(C=4, dual=True, n_jobs=-1)
X_train, X_valid, y_train, y_valid = train_test_split(target_x, target_y, test_size=0.33, random_state=2018)
train_loss = []
valid_loss = []
preds_train = np.zeros((X_train.shape[0], y_train.shape[1]))
preds_valid = np.zeros((X_valid.shape[0], y_train.shape[1]))
for i, j in enumerate(TARGET_COLS):
    print('Class:= '+j)
    model.fit(X_train,y_train[j])
    preds_valid[:,i] = model.predict_proba(X_valid)[:,1]
    preds_train[:,i] = model.predict_proba(X_train)[:,1]
    train_loss_class=log_loss(y_train[j],preds_train[:,i])
    valid_loss_class=log_loss(y_valid[j],preds_valid[:,i])
    print('Trainloss=log loss:', train_loss_class)
    print('Validloss=log loss:', valid_loss_class)
    train_loss.append(train_loss_class)
    valid_loss.append(valid_loss_class)
print('mean column-wise log loss:Train dataset', np.mean(train_loss))
print('mean column-wise log loss:Validation dataset', np.mean(valid_loss))


end_time=time.time()
print("total time till NB base model creation",end_time-start_time)

Class:= toxic
Trainloss=log loss: 0.24875156155448505
Validloss=log loss: 0.24859783520421386
Class:= severe_toxic
Trainloss=log loss: 0.04213695052989292
Validloss=log loss: 0.04326405536329918
Class:= obscene
Trainloss=log loss: 0.1578500852145476
Validloss=log loss: 0.15775597008347342
Class:= threat
Trainloss=log loss: 0.016821525405965217
Validloss=log loss: 0.016563809523861245
Class:= insult
Trainloss=log loss: 0.15174119952283305
Validloss=log loss: 0.1533719794504035
Class:= identity_hate
Trainloss=log loss: 0.04004948560782748
Validloss=log loss: 0.04315805746136871
mean column-wise log loss:Train dataset 0.1095584679725919
mean column-wise log loss:Validation dataset 0.1104519511811033
total time till NB base model creation 479.46180415153503
