# Neural Net

#### Import the libraries

In [None]:
import wordstovec as wtv
import preprocess_new as pre
import pandas as pd

#### Preprocess the data and transform comments into vectors

In [None]:
preProcessedTrainDF = pre.prepareTrainTestSet('train.csv','test.csv','word2vec',seperateLabelInfo=1)#,sampleNum=100)
sent2vec = []
sent2vec = wtv.WordsToVecFunction(preProcessedTrainDF)

In [None]:
# Save the data on disk

import pickle
with open("sent2vec.txt", "wb") as fp:   #Pickling
    pickle.dump(sent2vec, fp)
# with open("sent2vec.txt", "rb") as fp:   # Unpickling
#    sent2vec = pickle.load(fp)

In [None]:
data = pd.DataFrame(sent2vec)
data.head()

#### Parse the results to get the vectors and the labels

In [5]:
# Parse the sent2vec list to extract the vectors

vectorsList = []
for i in range(len(sent2vec)):
    vect_components = []
    for j in sent2vec[i][0]:
        vect_components.append(j)
    vectorsList.append(vect_components)
vectors = pd.DataFrame(vectorsList)
#data.shape

# Fill the NaN values with 0
vectors.fillna(0)

vectors.head()
vectors.shape

(159548, 100)

In [12]:
# Parse the sent2vec list to extract the labels

labelsList = []
for i in range(len(sent2vec)):
    labels_components = []
    for j in sent2vec[i][1]:
        labels_components.append(j)
    labelsList.append(labels_components)
labels = pd.DataFrame(labelsList)
labels.columns = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]

# Fill the NaN values with 0
labels['identity_hate'] = labels.identity_hate.fillna(0)
# Replace the type of the last columns to int64 like the others instead of float64 (check labels.dtypes)
labels['identity_hate'] = labels.identity_hate.astype('int64')

labels.head()
labels.shape

(159548, 6)

#### Train Test Split and Scaling

In [13]:
# Splitting the data

from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(vectors, labels, test_size = 0.30)  

In [14]:
# Scaling the data (not mandatory in this case since already scaled by words2vec)

from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
scaler.fit(X_train)

X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)  

#### Applying the neural network using multi-labels

In [15]:
# Fit the model on the data with the multi-labels

from sklearn.neural_network import MLPClassifier  
mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)  
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [18]:
# Make the prediction

predictions = mlp.predict(X_test)

#### Evaluating the Algorithm

In [19]:
# The metrics

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
#print(confusion_matrix(y_test,predictions))  
print(classification_report(y_test,predictions))
accuracy_score(y_test,predictions)

             precision    recall  f1-score   support

          0       0.77      0.28      0.41      4639
          1       0.49      0.04      0.07       493
          2       0.80      0.37      0.51      2570
          3       0.00      0.00      0.00       148
          4       0.68      0.33      0.45      2373
          5       0.00      0.00      0.00       452

avg / total       0.70      0.29      0.41     10675



  'precision', 'predicted', average, warn_for)


0.8991956544447927

#### Applying the neural network on each label

##### Label 1

In [20]:
# Fit the model on the data on label 1: toxic

mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)  
mlp.fit(X_train, y_train.toxic)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [21]:
# Make the prediction

predictions = mlp.predict(X_test)

In [22]:
# The metrics

print(confusion_matrix(y_test.toxic,predictions))  
print(classification_report(y_test.toxic,predictions))
accuracy_score(y_test.toxic,predictions)

[[42788   438]
 [ 3353  1286]]
             precision    recall  f1-score   support

          0       0.93      0.99      0.96     43226
          1       0.75      0.28      0.40      4639

avg / total       0.91      0.92      0.90     47865



0.9207980779275045

##### Label 2

In [23]:
# Fit the model on the data on label 2: severe_toxic

mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)  
mlp.fit(X_train, y_train.severe_toxic)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [24]:
# Make the prediction

predictions = mlp.predict(X_test)

In [25]:
# The metrics

print(confusion_matrix(y_test.severe_toxic,predictions))  
print(classification_report(y_test.severe_toxic,predictions))
accuracy_score(y_test.severe_toxic,predictions)

[[47027   345]
 [  295   198]]
             precision    recall  f1-score   support

          0       0.99      0.99      0.99     47372
          1       0.36      0.40      0.38       493

avg / total       0.99      0.99      0.99     47865



0.9866290609004492

##### Label 3

In [26]:
# Fit the model on the data on label 3: obscene

mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)  
mlp.fit(X_train, y_train.obscene)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [27]:
# Make the prediction

predictions = mlp.predict(X_test)

In [28]:
# The metrics

print(confusion_matrix(y_test.obscene,predictions))  
print(classification_report(y_test.obscene,predictions))
accuracy_score(y_test.obscene,predictions)

[[44941   354]
 [ 1555  1015]]
             precision    recall  f1-score   support

          0       0.97      0.99      0.98     45295
          1       0.74      0.39      0.52      2570

avg / total       0.95      0.96      0.95     47865



0.9601169957171211

##### Label 4

In [29]:
# Fit the model on the data on label 4: threat

mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)  
mlp.fit(X_train, y_train.threat)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [30]:
# Make the prediction

predictions = mlp.predict(X_test)

In [31]:
# The metrics
 
print(confusion_matrix(y_test.threat,predictions))  
print(classification_report(y_test.threat,predictions))
accuracy_score(y_test.threat,predictions)

[[47646    71]
 [  132    16]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     47717
          1       0.18      0.11      0.14       148

avg / total       0.99      1.00      1.00     47865



0.9957589052543612

##### Label 5

In [32]:
# Fit the model on the data on label 5: insult

mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)  
mlp.fit(X_train, y_train.insult)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [33]:
# Make the prediction

predictions = mlp.predict(X_test)

In [34]:
# The metrics

print(confusion_matrix(y_test.insult,predictions))  
print(classification_report(y_test.insult,predictions))
accuracy_score(y_test.insult,predictions)

[[45099   393]
 [ 1629   744]]
             precision    recall  f1-score   support

          0       0.97      0.99      0.98     45492
          1       0.65      0.31      0.42      2373

avg / total       0.95      0.96      0.95     47865



0.9577561892823566

##### Label 6

In [35]:
# Fit the model on the data on label 6: identity_hate

mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)  
mlp.fit(X_train, y_train.identity_hate)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [36]:
# Make the prediction

predictions = mlp.predict(X_test)

In [37]:
# The metrics

print(confusion_matrix(y_test.identity_hate,predictions))  
print(classification_report(y_test.identity_hate,predictions))
accuracy_score(y_test.identity_hate,predictions)

[[47349    64]
 [  406    46]]
             precision    recall  f1-score   support

          0       0.99      1.00      1.00     47413
          1       0.42      0.10      0.16       452

avg / total       0.99      0.99      0.99     47865



0.9901807165987674