## Multi Intent Classifier based on toxic comment dataset 

####  Reference: https://en.wikipedia.org/wiki/Multi-label_classification

The goal of this notebook is to build a model that predicts, for each comment, the probability of each type of toxicity.

•	Data Exploration

•	Text Preprocessing

•	ML pipelines

•	Evaluate Classifier 

•	Predictions on Test data



#### Import Libraries

In [None]:
import pandas as pd
import nltk
import plotly
import re
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer 
from plotly.offline import iplot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report 
from sklearn.metrics import roc_auc_score,roc_curve,auc
import numpy as np
# Fetch the NLTK resources used below: WordNet for lemmatization (plus
# 'omw-1.4', which some NLTK releases require to load WordNet), the Punkt
# tokenizer data for word_tokenize ('punkt_tab' is the resource name newer
# NLTK releases look up), and the stop-word lists.
nltk.download(['wordnet', 'omw-1.4', 'punkt', 'punkt_tab', 'stopwords'])

#### Load Dataset

In [None]:
# Load the training split from disk
toxic_comment_data = pd.read_csv(filepath_or_buffer="dataset/train.csv")
# Trailing expression: (rows, columns) of the training frame
toxic_comment_data.shape

In [None]:
# Preview the first five rows of the training frame
toxic_comment_data.head(n=5)

In [None]:
# Show the raw text of the comment stored under row label 1
toxic_comment_data.loc[1, "comment_text"]

In [None]:
# Print pandas' summary of the training frame (DataFrame.info)
toxic_comment_data.info()

In [None]:
# Load the test split from disk
toxic_comment_test = pd.read_csv(filepath_or_buffer="dataset/test.csv")
# Trailing expression: (rows, columns) of the test frame
toxic_comment_test.shape

In [None]:
# Preview the first five rows of the test frame
toxic_comment_test.head(n=5)

#### Data Exploration

In [None]:
# True if at least one cell anywhere in the training frame is missing
toxic_comment_data.isna().to_numpy().any()

In [None]:
#Distribution of length of comments
# Character count per comment, computed with the vectorised string accessor
# instead of calling a Python lambda once per row.
comment_lengths = toxic_comment_data["comment_text"].str.len()

fig = go.Figure(go.Histogram(x=comment_lengths, nbinsx=100))
fig.update_layout(
    title={
        'text': "Distribution of length of comments",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis_title="Length of comments",
    yaxis_title="Count",
)
fig.show()

In [None]:
# Number of comments carrying each label: column-wise sum over every column
# from position 2 onward (assumed to be the 0/1 label columns)
label_columns = toxic_comment_data.iloc[:, 2:]
label_columns.sum(axis=0).to_frame(name="count")

In [None]:
#Number of comments in each label
# Sum each column from position 2 onward once and reuse the result for both
# axes (the original recomputed the same sum for x and for y).
label_totals = toxic_comment_data.iloc[:, 2:].sum()

fig = go.Figure(go.Bar(x=label_totals.index, y=label_totals.values))
fig.update_layout(
    title={
        'text': "Count of comments in each label",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis_title="Labels",
    yaxis_title="Count",
)
fig.show()


In [None]:
# How many comments carry 0, 1, 2, ... labels at once: row-wise sum over the
# columns from position 2 onward, then a frequency table of those sums
labels_per_comment = toxic_comment_data.iloc[:, 2:].sum(axis=1)
labels_per_comment.value_counts().to_frame(name="count")

In [None]:
# Are there comments which have multiple labels?
# Count the labels set on each comment, then tabulate how often each count
# occurs. Computed once and reused for both axes (the original recomputed the
# same value_counts for x and for y).
label_count_freq = toxic_comment_data.iloc[:, 2:].sum(axis=1).value_counts()

fig = go.Figure(go.Bar(x=label_count_freq.index, y=label_count_freq.values))
fig.update_layout(
    title={
        'text': "Count of comments with multiple label",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis_title="Multiple Labels count",
    yaxis_title="Count",
)
fig.show()


In [None]:
#Most common words in the labels
# One word cloud per label column (columns from position 2 onward), laid out
# on a 3x3 grid. The figure size is set once, up front.
plt.figure(figsize=(20, 25))
for position, label in enumerate(toxic_comment_data.columns[2:], start=1):
    # Comments flagged with the current label
    label_filter = toxic_comment_data[toxic_comment_data[label] == 1]
    # Join with a space so the last word of one comment is not fused with the
    # first word of the next (an empty separator creates bogus merged tokens).
    label_text = ' '.join(label_filter["comment_text"].values)
    wordcloud = WordCloud(
        background_color='white',
        stopwords=STOPWORDS,
        collocations=False,
        width=1000,
        height=1000,
    ).generate(label_text)
    plt.subplot(3, 3, position)
    plt.title("Common words in " + label)
    plt.imshow(wordcloud)
    plt.axis('off')

#### Text Preprocessing

In [None]:
def text_preprocessing(text):
    """Turn a raw comment into a list of cleaned, lemmatized tokens.

    Steps: lower-case the text and replace every character that is not an
    ASCII letter or digit with a space, tokenize with NLTK's word_tokenize,
    drop English stop words, and lemmatize what remains with WordNet.

    Args:
        text: The comment as a string.

    Returns:
        A list of lemmatized token strings, in their original order.
    """
    # Normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Tokenize text
    words = word_tokenize(text)

    # A set gives O(1) membership tests; the original scanned the stop-word
    # list once per token.
    stop = set(stopwords.words("english"))

    # One lemmatizer per call instead of constructing a new one per token.
    lemmatizer = WordNetLemmatizer()

    # Remove stop words, then lemmatize the remaining tokens
    return [lemmatizer.lemmatize(w) for w in words if w not in stop]

#### ML pipelines

In [None]:
# Both pipelines share the same feature extraction: TfidfVectorizer with the
# custom tokenizer and 1- to 3-gram features. TfidfVectorizer already counts
# terms AND applies TF-IDF weighting, so no separate TfidfTransformer step
# follows it; chaining the two re-weights the already weighted matrix and
# applies IDF twice.

# Logistic Regression Algorithm
Pipeline_LR = Pipeline([
                ('vect', TfidfVectorizer(tokenizer=text_preprocessing, strip_accents='unicode', analyzer='word', ngram_range=(1, 3))),
                ('clf', OneVsRestClassifier(LogisticRegression(C=0.1), n_jobs=-1)),
            ])

# Naive Bayes Algorithm
# fit_prior must be a boolean. The original passed None, which older
# scikit-learn treated as falsy (uniform class prior) and newer releases
# reject during parameter validation. False keeps the uniform prior.
Pipeline_NBC = Pipeline([
                ('vect', TfidfVectorizer(tokenizer=text_preprocessing, strip_accents='unicode', analyzer='word', ngram_range=(1, 3))),
                ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=False))),
            ])


#### Split Train data into train and test data

In [None]:
# Features: the raw comment text
X = toxic_comment_data["comment_text"]
# Targets: every column from position 2 onward (one column per label)
Y = toxic_comment_data.iloc[:, 2:]
# Shuffle and hold out 30% of the rows for evaluation; fixed seed for
# reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
    shuffle=True,
)

#### Logistic Regression

In [None]:
# Evaluate the Logistic Regression pipeline one label at a time:
# fit on the training split, score the held-out split, plot the ROC curve.
Labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
roc_auc_list = []
# Figure size for every ROC plot drawn from here on
plt.rcParams["figure.figsize"] = (5,5)
for label in Labels:
    print("....",label,".....")
    Pipeline_LR.fit(X_train, y_train[label])
    # Column 1 of predict_proba is the score used as the positive-class probability
    preds = Pipeline_LR.predict_proba(X_test)[:, 1]
    fpr, tpr, threshold = roc_curve(y_test[label], preds)
    roc_auc = auc(fpr, tpr)
    roc_auc_list.append(roc_auc)
    print(f'AUC: {roc_auc:.5f}')
    # ROC curve (blue) against the chance diagonal (red, dashed)
    plt.title('Receiver Operating Characteristic')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.3f' % roc_auc)
    plt.plot([0, 1], [0, 1],'r--')
    plt.legend(loc = 'lower right')
    plt.show()

In [None]:
# Average the per-label AUC values collected for Logistic Regression
mean_auc_lr = np.mean(roc_auc_list)
print("Mean AUC Score for Logistic regression is ", mean_auc_lr)

#### Naive Bayes

In [None]:
# Evaluate the Naive Bayes pipeline one label at a time:
# fit on the training split, score the held-out split, plot the ROC curve.
roc_auc_list_NB = []
for label in Labels:
    print("....",label,".....")
    Pipeline_NBC.fit(X_train, y_train[label])
    # Column 1 of predict_proba is the score used as the positive-class probability
    preds = Pipeline_NBC.predict_proba(X_test)[:, 1]
    fpr, tpr, threshold = roc_curve(y_test[label], preds)
    roc_auc = auc(fpr, tpr)
    roc_auc_list_NB.append(roc_auc)
    print(f'AUC: {roc_auc:.5f}')
    # ROC curve (blue) against the chance diagonal (red, dashed)
    plt.title('Receiver Operating Characteristic')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.3f' % roc_auc)
    plt.plot([0, 1], [0, 1],'r--')
    plt.legend(loc = 'lower right')
    plt.show()


In [None]:
#Mean ROC_AUC for Naive Bayes
# The message names Naive Bayes: this averages roc_auc_list_NB (the original
# text said "Logistic regression" here).
print("Mean AUC Score for Naive Bayes is ", np.mean(roc_auc_list_NB))

#### The AUC scores above show that Logistic Regression performs better, so we will use Logistic Regression to predict the labels for the test data.

In [None]:
# Compute ROC curve and AUC for each class
Labels=list(toxic_comment_data.columns)[2:]
roc_auc_list=[]
for label in Labels:
    print("....",label,".....")
    Pipeline_LR.fit(X,Y[label])
    prediction = Pipeline_LR.predict_proba(toxic_comment_test.comment_text)
    preds = prediction[:, 1]
    exec("preds_%s = pd.Series(preds)" % label)

In [None]:
result_data=pd.DataFrame()
result_data["id"]=toxic_comment_test["id"]
result_data["toxic"]=preds_toxic
result_data["severe_toxic"]=preds_severe_toxic
result_data["obscene"]=preds_obscene
result_data["threat"]=preds_threat
result_data["insult"]=preds_insult
result_data["identity_hate"]=preds_identity_hate

In [None]:
# Preview the first five rows of the submission frame
result_data.head(n=5)

In [None]:
# Write the submission file. index=False keeps the DataFrame's row index out
# of the CSV, so the file contains only "id" and the label columns.
result_data.to_csv("submission.csv", index=False)

## Future work to improve the model

##### 1. Hyperparameter tuning for the Logistic Regression model
##### 2. Try different methods for feature extraction
##### 3. Try ensemble classification methods
##### 4. Use pretrained deep learning models
