In [1]:
import gensim
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV, dump, load
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.metrics import classification_report

import time
import multiprocessing

from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.utils import tokenize, simple_preprocess

from tqdm import tqdm
tqdm.pandas(desc='progress-bar')

import multiprocessing

In [2]:
df = pd.read_csv('Library/cleaned_text_train_df.csv')
df.dropna(inplace=True)
df.head()

Unnamed: 0,clean_text,toxic_type
0,explanation edit make username hardcore metall...,0
1,aww match background colour seemingly stuck th...,0
2,hey man really not try edit war guy constantly...,0
3,make real suggestion improvement wonder sectio...,0
4,sir hero chance remember page,0


In [3]:
# Split the data in train and test

train, test = train_test_split(df, test_size=0.2, random_state=12)

In [4]:
# Tokenizing and tagging the train and test sets

train_tagged = train.apply(lambda r: TaggedDocument(words=list(tokenize(r['clean_text'])), 
                                                    tags=str(r['toxic_type'])), axis=1)

test_tagged = test.apply(lambda r: TaggedDocument(words=list(tokenize(r['clean_text'])), 
                                                    tags=str(r['toxic_type'])), axis=1)


In [5]:
train_tagged.values[8]

TaggedDocument(words=['miss', 'general', 'causal', 'vandal', 'part', 'consideration', 'e', 'always', 'case', 'since', 'appear', 'time', 'large', 'number', 'user', 'vandalise', 'page', 'classify', 'vandalism', 'casual', 'talk'], tags='0')

In [6]:
test_tagged.values[5]

TaggedDocument(words=['hello', 'see', 'create', 'article', 'wikipedia', 'talk', 'articles', 'creation', 'fps', 'jg', 'sg', 'k', 'full', 'automatic', 'full', 'metal', 'gearbox', 'aeg', 'rifle', 'not', 'submit', 'include', 'encyclopedia', 'intend'], tags='0')

In [7]:
# Setup Doc2Vec training model

## Building the vocabulary dictionary

cores = multiprocessing.cpu_count()

model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample=0, workers=cores, epochs=15)

model_dbow.build_vocab(train_tagged.values)

In [8]:
# Let's check how many times a word appeared in corpus
model_dbow.wv.get_vecattr('article', 'count')

58333

In [9]:
# Train the model

model_dbow.train(pd.Series(train_tagged).values, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)

In [10]:
# Function to infer the documents

def vectorize(model, corpus):
    regressors, tags = zip(*[(model.infer_vector(doc[0].split(), steps=20), doc[1]) for doc in corpus.values])
    return regressors, tags

In [11]:
# Text vectorization of train and test set

X_train, y_train = vectorize(model_dbow, train)
X_test, y_test = vectorize(model_dbow, test)

In [20]:
X_train


(array([-0.04541947, -0.05711823, -0.08620428,  0.06984594,  0.06245466,
         0.05271948, -0.15159163, -0.15030561, -0.09337347, -0.01554491,
         0.02805524, -0.02618225, -0.1247825 , -0.11216964, -0.07339843,
        -0.07111991,  0.09413322,  0.1506065 , -0.11012437,  0.01693248,
        -0.02157778,  0.0112526 , -0.04141685,  0.00585702,  0.08348784,
        -0.10102692, -0.12007303, -0.14318515,  0.09730779, -0.19077414,
         0.08795644,  0.11628111, -0.08909071, -0.11499856,  0.00798846,
         0.02229939, -0.04466223, -0.0484979 , -0.00413646, -0.0122757 ,
        -0.00934865, -0.06171915,  0.04017987, -0.06857849,  0.04272008,
        -0.0551642 ,  0.01889369, -0.03477311,  0.13718186, -0.21087272,
        -0.04472022, -0.0072092 , -0.10799128, -0.1320703 , -0.02850504,
         0.16667357,  0.01021395,  0.03813295, -0.06933233,  0.14705154,
         0.02548563,  0.08323767,  0.08577009, -0.05627497,  0.04452985,
        -0.12295159,  0.15149114,  0.05504617, -0.0

## Classification Algorithm

In [12]:
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     28658
           1       0.82      0.53      0.65      3246

    accuracy                           0.94     31904
   macro avg       0.88      0.76      0.81     31904
weighted avg       0.94      0.94      0.93     31904



## Hyperparameter Tuning

In [14]:
class Doc2VecModel(BaseEstimator):

    def __init__(self, dm=1, vector_size=100, window=1, epochs=10, max_vocab_size=12e7, min_count=1):
        #print('>>>>>>>>init() called.\n')
        self.d2v_model = None
        self.vector_size = vector_size
        self.window = window
        self.dm = dm
        self.epochs = epochs
        self.max_vocab_size = max_vocab_size
        self.min_count = min_count

    def fit(self, corpus, y=None):
        #print('>>>>>>>>fit() called.\n')
        ## Initialize model
        self.d2v_model = Doc2Vec(vector_size=self.vector_size, window=self.window, dm=self.dm, epochs=self.epochs,
                                 max_vocab_size=self.max_vocab_size, min_count=self.min_count,
                                 alpha=0.025, min_alpha=0.001, seed=21)
        ## Tag docs
        docs_tagged=[]
        for index, row in corpus.iteritems():
            docs_tagged.append(TaggedDocument(words=list(tokenize(row)), tags=str(index)))
        ## Build vocabulary
        self.d2v_model.build_vocab(docs_tagged)
        ## Train model
        self.d2v_model.train(pd.Series(docs_tagged).values, total_examples=self.d2v_model.corpus_count, epochs=self.d2v_model.epochs)
        return self

    def transform(self, corpus):
        #print('>>>>>>>>transform() called.\n')
        sents = corpus.values
        regressors = [(self.d2v_model.infer_vector(doc[0].split(), steps=20)) for doc in sents]
        regressors = pd.DataFrame(regressors, index=corpus.index)
        return regressors


    def fit_transform(self, corpus, y=None):
        self.fit(corpus)
        return self.transform(corpus)
    

In [15]:
## Model Specifications
# Create pipeline
model_pipe = Pipeline([('vect', Doc2VecModel()),
                      ('clf', LogisticRegression(solver='liblinear'))])

# Parameter grid
param_grid = {'vect__window': list(range(5)),
              'vect__dm': [0,1],
              'vect__vector_size': list(range(100,400)),
              'vect__min_count': list(range(100)),
              'vect__epochs': [10,15,20,25,30],
              'clf__dual': (True,False),
              'clf__max_iter': [100,110,120,130,140],
              'clf__C': (1e-5, 1e2, "log-uniform"),
}

# Number of iterations: Number of parameter settings that are sampled.
n_iter = 15

# Split the data to train and test
train, test = train_test_split(df, test_size=0.2, random_state=21)


In [40]:
## Model Specifications
# Create pipeline
model_pipe = Pipeline([('vect', Doc2VecModel()),
                      ('clf', LogisticRegression(solver='liblinear'))])

# Parameter grid
param_grid = {'vect__window': [5],
              'vect__dm': [0],
              'vect__vector_size': [500],
              'vect__min_count': [100],
              'vect__epochs': [10],
              'clf__dual': (True, False),
              'clf__max_iter': [100],
              'clf__C': (1e-5, "log-uniform")
}

# Number of iterations: Number of parameter settings that are sampled.
n_iter = 15

# Split the data to train and test
train, test = train_test_split(df, test_size=0.2, random_state=21)


In [41]:
grid_search = GridSearchCV(model_pipe, param_grid, cv=5, scoring='f1')

## Print search configurations
print("Performing grid search...")
print("pipeline:", [name for name, _ in model_pipe.steps])
print("parameters:")
for key, value in param_grid.items():
    print('{}: {}'.format(key, value))

## Run the search
grid_search.fit(train.clean_text, train.toxic_type.values)

## Print best parameters and results
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
vect__window: [5]
vect__dm: [0]
vect__vector_size: [500]
vect__min_count: [100]
vect__epochs: [10]
clf__dual: (True, False)
clf__max_iter: [100]
clf__C: (1e-05, 'log-uniform')


KeyboardInterrupt: 