In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Load packages


In [None]:
import base64
import numpy as np
import pandas as pd
import seaborn as sns
import re
import string

# Plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# Other imports
from collections import Counter
# from scipy.misc import imread
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost.sklearn import XGBClassifier

from wordcloud import WordCloud

#To ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Folder holding the hackathon CSV files; the trailing slash is kept so that
# plain string concatenation yields complete paths.
file = r'/kaggle/input/janatahack-independence-day-2020-ml-hackathon/'
# Read the train, test and sample-submission tables in one pass.
train_df, test_df, sub_df = (
    pd.read_csv(file + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission_UVKGLZE.csv')
)

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
sub_df.head()

In [None]:
print('size of train data',train_df.shape)
print('size of test data',test_df.shape)
print('size of sub data',sub_df.shape)

In [None]:
train_df.columns

In [None]:
train_df.sample(5)

In [None]:
cols_target = ['Computer Science', 'Physics', 'Mathematics',
       'Statistics', 'Quantitative Biology', 'Quantitative Finance']

In [None]:
# check missing values in numeric columns
train_df.describe()

In [None]:
# Rows for which none of the six subject columns in cols_target equals 1.
no_label_mask = (train_df[cols_target] != 1).all(axis=1)
unlabelled_in_all = train_df[no_label_mask]
print('Percentage of unlabelled comments is ', len(unlabelled_in_all)/len(train_df)*100)

In [None]:
# check for any 'null' comment
no_comment = train_df[train_df['TITLE'].isnull()]
len(no_comment)

In [None]:
# check for any 'null' comment
no_comment = train_df[train_df['ABSTRACT'].isnull()]
len(no_comment)

In [None]:
no_comment = test_df[test_df['ABSTRACT'].isnull()]
no_comment

In [None]:
no_comment = test_df[test_df['TITLE'].isnull()]
no_comment

In [None]:
# let's see the total rows in train, test data and the numbers for the various categories
print('Total rows in test is {}'.format(len(test_df)))
print('Total rows in train is {}'.format(len(train_df)))
print(train_df[cols_target].sum())

As mentioned earlier, majority of the comments in the training data are not labelled in one or more of these categories.

In [None]:
# Number of training samples carrying each subject label.
# The label columns are selected by name (cols_target) rather than by position:
# later cells append length and text columns to train_df, so a positional
# slice would pick those up when this cell is re-run.
x = train_df[cols_target].sum()
print('total number of comment:',len(train_df),'\n','samples belongs to each class','\n',x)

plt.figure(figsize=(15,5))
# seaborn >= 0.12 only accepts x / y as keyword arguments.
sns.barplot(x=x.index, y=x.values)
plt.xticks(rotation=90)
plt.title('class distribution')
plt.show()

Some samples belong to multiple classes and, as you can see in the plot above, the classes are not evenly spread, which means there is class imbalance as well. Let us check how each class is correlated with the others with the help of a heatmap.

In [None]:
# Pairwise correlation between the six subject label columns.
# Restricting the frame to cols_target keeps the text columns (TITLE, ABSTRACT)
# out of DataFrame.corr(), which raises on non-numeric data in pandas >= 2.0.
class_corr = train_df[cols_target].corr()
plt.figure(figsize=(8,8))
# center=0 puts the colormap midpoint at "no correlation"; the previous
# center=True was interpreted as the number 1.
sns.heatmap(class_corr, annot=True, center=0, square=True)
plt.title('heatmap showing correlation between classes')
plt.show()

In [None]:
# Store the character count of every training title and abstract; each value
# is converted with str() first, so non-string entries are measured by their
# string form.
train_df['TITLE_char_length'] = [len(str(title)) for title in train_df['TITLE']]
train_df['ABSTRACT_char_length'] = [len(str(abstract)) for abstract in train_df['ABSTRACT']]

In [None]:
# look at the histogram plot for text length
sns.set()
train_df['TITLE_char_length'].hist()
plt.show()

In [None]:
# look at the histogram plot for text length
sns.set()
train_df['ABSTRACT_char_length'].hist()
plt.show()

In [None]:
# Store the character count of every test title and abstract; each value is
# converted with str() first, so non-string entries are measured by their
# string form.
test_df['TITLE_char_length'] = [len(str(title)) for title in test_df['TITLE']]
test_df['ABSTRACT_char_length'] = [len(str(abstract)) for abstract in test_df['ABSTRACT']]

In [None]:
# Combine title and abstract into one text field per row.
# The explicit ' ' separator keeps the last word of the title from fusing with
# the first word of the abstract into a single bogus token.
train_df['comment_text'] = train_df['TITLE'] + ' ' + train_df['ABSTRACT']
test_df['comment_text'] = test_df['TITLE'] + ' ' + test_df['ABSTRACT']

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [None]:
word_counter = {}


def clean_text(text, stopwords=None):
    """Lower-case ``text``, turn punctuation into spaces and drop stop words.

    Parameters
    ----------
    text : str
        Raw text to normalise.
    stopwords : iterable of str, optional
        Words to remove after splitting on whitespace. When omitted, the
        notebook-level ``stop`` list is used.

    Returns
    -------
    str
        The remaining lower-cased words joined by single spaces.
    """
    if stopwords is None:
        stopwords = stop
    # A set gives constant-time membership tests instead of a list scan per word.
    excluded = set(stopwords)
    # string.punctuation must be escaped before it is placed inside [...]:
    # unescaped, its backslash followed by ']' is parsed as an escaped ']',
    # so backslashes themselves were never replaced.
    punctuation_class = '[{}]'.format(re.escape(string.punctuation))
    lowered = re.sub(punctuation_class, ' ', text.lower())
    return ' '.join(word for word in lowered.split() if word not in excluded)

# For every subject, count the words of the cleaned text of all training rows
# tagged with that subject and keep them as a table sorted by descending count.
for categ in cols_target:
    tally = Counter()
    for raw_text in train_df.loc[train_df[categ] == 1, 'comment_text']:
        tally.update(clean_text(raw_text).split())
    counts = pd.DataFrame.from_dict(tally, orient='index').rename(columns={0: 'count'})
    word_counter[categ] = counts.sort_values('count', ascending=False)
    
# Draw one word cloud per entry of word_counter, sized by word frequency.
for subject, freq_table in word_counter.items():
    cloud = WordCloud(
        background_color='black',
        max_words=200,
        max_font_size=100,
        random_state=4561,
    )
    # word -> count mapping taken from the 'count' column
    cloud.generate_from_frequencies(freq_table['count'].to_dict())

    plt.figure(figsize=(12, 8))
    plt.title(subject)
    plt.imshow(cloud)
    plt.axis('off')

    plt.show()

### Clean up the comment text

In [None]:
# Overwrite the combined text of both frames with its cleaned form.
train_df['comment_text'] = train_df['comment_text'].map(clean_text)
test_df['comment_text'] = test_df['comment_text'].map(clean_text)

### Define X from entire train & test data for use in tokenization by Vectorizer

In [None]:
# Text series handed to the vectorizer: cleaned text of the train and test rows.
X = train_df['comment_text']
test_X = test_df['comment_text']
print(X.shape, test_X.shape)

### Vectorize the data

In [None]:
# import and instantiate TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(max_features=5000,stop_words='english')
vect

In [None]:
# learn the vocabulary in the training data, then use it to create a document-term matrix
X_dtm = vect.fit_transform(X)
# examine the document-term matrix created from X_train
X_dtm

In [None]:
# transform the test data using the earlier fitted vocabulary, into a document-term matrix
test_X_dtm = vect.transform(test_X)
# examine the document-term matrix from X_test
test_X_dtm

### Solving a multi-label classification problem

One way to approach a multi-label classification problem is to transform the problem into separate single-class classifier problems. This is known as 'problem transformation'. There are three methods:

Binary Relevance. This is probably the simplest method: it treats each label as a separate single-label classification problem. The key assumption, though, is that there is no correlation among the various labels.

Classifier Chains. In this method, the first classifier is trained on the input X. Then the subsequent classifiers are trained on the input X and all previous classifiers' predictions in the chain. This method attempts to draw the signals from the correlation among preceding target variables.


Label Powerset. This method transforms the problem into a multi-class problem where the multi-class labels are essentially all the unique label combinations. In our case here, where there are six labels, Label Powerset would in effect turn this into a 2^6 or 64-class problem. {Thanks Joshua for pointing out.}

### Binary Relevance - build a multi-label classifier using Logistic Regression

In [None]:
# Binary relevance with logistic regression: one classifier is fitted per
# label on the TF-IDF matrix, independently of the other labels.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
logreg = LogisticRegression(C=12.0)

# predictions are collected in a copy of the sample submission frame
submission_binary = sub_df.copy()

for label in cols_target:
    print('... Processing {}'.format(label))
    target = train_df[label]
    # fit on the full training matrix for the current label
    logreg.fit(X_dtm, target)
    # accuracy measured on the rows the model was fitted on (not a validation score)
    train_accuracy = accuracy_score(target, logreg.predict(X_dtm))
    print('Training accuracy is {}'.format(train_accuracy))
    # second predict_proba column; presumably P(label == 1) given 0/1 labels
    submission_binary[label] = logreg.predict_proba(test_X_dtm)[:, 1]

In [None]:
submission_binary.head()

In [None]:
# Turn each probability column into hard labels: 1 at or above 0.4, else 0.
for col in cols_target:
    submission_binary[col] = submission_binary[col].ge(0.4).astype('int64')

In [None]:
submission_binary.head()

In [None]:
submission_binary.to_csv('log_reg_baseline.csv', index=False)

In [None]:
# Binary relevance with gradient-boosted trees: one XGBClassifier fit per label.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# fresh, unfitted logistic regression; it is not used by the loop in this cell
logreg = LogisticRegression(C=12.0)
xgb = XGBClassifier(max_depth=4, base_score=0.5, learning_rate=0.1, n_estimators=350)
# predictions are collected in a new copy of the sample submission frame,
# replacing whatever submission_binary held before
submission_binary = sub_df.copy()

for label in cols_target:
    print('... Processing {}'.format(label))
    target = train_df[label]
    # fit the boosted model on the full training matrix for the current label
    xgb.fit(X_dtm, target)
    # accuracy measured on the rows the model was fitted on (not a validation score)
    train_accuracy = accuracy_score(target, xgb.predict(X_dtm))
    print('Training accuracy is {}'.format(train_accuracy))
    # second predict_proba column; presumably P(label == 1) given 0/1 labels
    submission_binary[label] = xgb.predict_proba(test_X_dtm)[:, 1]

In [None]:
submission_binary.head()

In [None]:
# Turn each probability column into hard labels: 1 at or above 0.5, else 0.
for col in cols_target:
    submission_binary[col] = submission_binary[col].ge(0.5).astype('int64')

In [None]:
# Write the thresholded predictions to the working directory without the index.
# NOTE(review): submission_binary was last filled by the XGBClassifier loop,
# not by LightGBM as the file name suggests — confirm the intended file name.
submission_binary.to_csv('lgbm_baseline.csv', index=False)

### Classifier Chains - build a multi-label classifier using Logistic Regression

In [None]:
#  create submission file
submission_chains = sub_df.copy()

# create a function to add features
def add_feature(X, feature_to_add):
    """Append one or more feature columns to a sparse matrix.

    ``feature_to_add`` is converted to a sparse matrix and transposed, so a
    flat sequence becomes a single new column and a list of sequences becomes
    one new column per sequence. The result is returned in CSR format.
    """
    from scipy import sparse

    extra_columns = sparse.csr_matrix(feature_to_add).transpose()
    return sparse.hstack([X, extra_columns], format='csr')

In [None]:
# Classifier chain: labels are processed in cols_target order; after each fit
# the true training label and the predicted test label are appended as an
# extra feature column for the classifiers that follow.
#
# The growing matrices are bound to their own names so that X_dtm and
# test_X_dtm keep their original width. Rebinding the shared names made this
# cell non-re-runnable and caused any earlier modelling cell executed
# afterwards to train on matrices that already contained label columns.
X_chain = X_dtm
test_X_chain = test_X_dtm

for label in cols_target:
    print('... Processing {}'.format(label))
    y = train_df[label]
    # train the model on the current (possibly widened) training matrix
    logreg.fit(X_chain, y)
    # accuracy measured on the rows the model was fitted on
    y_pred_X = logreg.predict(X_chain)
    print('Training Accuracy is {}'.format(accuracy_score(y,y_pred_X)))
    # hard predictions feed the chain; probabilities go to the submission
    test_y = logreg.predict(test_X_chain)
    test_y_prob = logreg.predict_proba(test_X_chain)[:,1]
    submission_chains[label] = test_y_prob
    # chain the true label onto the training matrix
    X_chain = add_feature(X_chain, y)
    print('Shape of X_dtm is now {}'.format(X_chain.shape))
    # chain the predicted label onto the test matrix
    test_X_chain = add_feature(test_X_chain, test_y)
    print('Shape of test_X_dtm is now {}'.format(test_X_chain.shape))


In [None]:
submission_chains.head()

In [None]:
# Turn each probability column into hard labels: 1 at or above 0.5, else 0.
for col in cols_target:
    submission_chains[col] = submission_chains[col].ge(0.5).astype('int64')

In [None]:
submission_chains.head()

In [None]:
# Save the classifier-chain predictions. The previous code wrote
# submission_binary here, so the "chains" file held another model's output.
submission_chains.to_csv('log_reg_baseline_chains.csv', index=False)

### References/Credits:

As this hackathon is similar to the contest conducted in Kaggle - I referred to these kernels.

https://www.kaggle.com/clinma/eda-toxic-comment-classification-challenge

https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/discussion/100661

https://www.kaggle.com/rhodiumbeng/classifying-multi-label-comments-0-9741-lb