# MULTI-LABEL TEXT CLASSIFICATION FOR STACK OVERFLOW TAG PREDICTION

In [19]:
import pandas as pd
import numpy as np

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

In [21]:
# Stack Overflow questions with their tags, hosted as a CSV on GitHub.
url = 'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/stackoverflow.csv'
# The first CSV column has no header name, so it is used as the row index.
df = pd.read_csv(url, index_col=0)
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


# Pre-Processing

In [22]:
# TF-IDF (term frequency-inverse document frequency) will be used to turn the text into features.
# The Tags column is stored as strings, so each value must be converted to a list before one-hot encoding.
# To do that we evaluate each string as a Python literal,
# which requires importing the ast library.

In [23]:
import ast

In [24]:
# Parse the first row's Tags string into a Python list (a preview of the conversion; df is not modified).
ast.literal_eval(df['Tags'].iloc[0])

['sql', 'asp.net']

In [25]:
# Parse every row's Tags string (e.g. "['sql', 'asp.net']") into a Python list.
# Values that are not strings are passed through unchanged, so re-running this
# cell after the column has already been converted does not raise ValueError.
df['Tags'] = df['Tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [26]:
# Preview the frame after the Tags conversion.
df.head(5)

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"[sql, asp.net]"
4,adding scripting functionality to net applicat...,"[c#, .net]"
5,should i use nested classes in this case i am ...,[c++]
6,homegrown consumption of web services i have b...,[.net]
8,automatically update version number i would li...,[c#]


In [33]:
# Raw tag lists per question; they are one-hot encoded with MultiLabelBinarizer in the following cells.
y = df['Tags']
y

2          [sql, asp.net]
4              [c#, .net]
5                   [c++]
6                  [.net]
8                    [c#]
                ...      
1262668             [c++]
1262834             [c++]
1262915          [python]
1263065          [python]
1263454             [c++]
Name: Tags, Length: 48976, dtype: object

In [38]:
# MultiLabelBinarizer object used to one-hot encode the tag lists.
multilabel = MultiLabelBinarizer()

In [39]:
# Fit the binarizer on the tag lists and encode every row; y is rebound to the encoded array.
y = multilabel.fit_transform(df['Tags'])

In [40]:
# Tags after one-hot encoding (all zeros and ones).
y

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [42]:
# Tag names the binarizer learned; these are the classes the columns of y stand for.
multilabel.classes_

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

In [44]:
# View the encoded labels as a table with one column per tag name.
pd.DataFrame(y, columns = multilabel.classes_)

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48972,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
48974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [None]:
# Now we use TF-IDF to tokenize the text. With analyzer='word' it tokenizes word by word;
# with analyzer='char' it tokenizes character by character,
# e.g. "lets" -> l, e, t, s.
# max_features caps the vocabulary (dictionary) size at the given number of terms.

In [116]:
# Word-level TF-IDF over 1- to 3-grams with English stop words removed,
# keeping at most 10,000 features.
# NOTE(review): the vectorizer is fitted on every row of df['Text'] before the
# train/test split further down, so its vocabulary and IDF weights also see the
# rows that later become the test set.
tfidf = TfidfVectorizer(analyzer='word', max_features=10000, ngram_range=(1,3), stop_words='english')
X = tfidf.fit_transform(df['Text'])

In [117]:
# Show a summary of the TF-IDF feature matrix.
X

<48976x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 2002390 stored elements in Compressed Sparse Row format>

In [118]:
# tfidf.vocabulary_  (uncomment to inspect the terms the TF-IDF vectorizer put in its vocabulary)

In [119]:
# X has 10,000 TF-IDF feature columns built from the question text and y has 20 tag columns;
# both have 48976 rows.
X.shape, y.shape

((48976, 10000), (48976, 20))

In [120]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Build Model

In [121]:
# Baseline linear models; each one is wrapped in OneVsRestClassifier before fitting.
# SGDClassifier shuffles the training data and LinearSVC's dual solver uses a
# pseudo-random number generator, so both are seeded (same seed as the
# train/test split) to make the reported scores repeatable across runs.
sgd = SGDClassifier(random_state=0)
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC(random_state=0)

### Jaccard similarity, or the Jaccard index, is the size of the intersection of the predicted labels and the true labels divided by the size of their union. It ranges from 0 to 1, and 1 is the perfect score.

In [123]:
def j_score(y_true, y_pred, zero_division=0.0):
    """Return the mean per-sample Jaccard index as a percentage (0-100).

    y_true, y_pred: 2-D binary indicator arrays of identical shape
        (rows are samples, columns are labels).
    zero_division: score given to a row whose true and predicted label sets
        are both empty (union of size 0). Without this, such a row evaluates
        0/0 = NaN and turns the whole mean into NaN.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # For 0/1 indicators, element-wise minimum = intersection, maximum = union.
    intersection = np.minimum(y_true, y_pred).sum(axis = 1)
    union = np.maximum(y_true, y_pred).sum(axis = 1)
    # Pre-fill with the fallback value and divide only where the union is non-empty.
    jaccard = np.full(union.shape, zero_division, dtype=float)
    np.divide(intersection, union, out=jaccard, where=union != 0)
    return jaccard.mean()*100
    
def print_score(y_pred, clf):
    """Print the classifier's class name and its Jaccard score against y_test.

    y_pred: predicted label matrix, scored against the notebook-level y_test.
    clf: estimator whose class name is printed as the heading.
    """
    print("Clf: ", type(clf).__name__)
    score = j_score(y_test, y_pred)
    print(f'Jacard score: {score}')
    print('----')

In [124]:
# Tuned variant: L1-penalised LinearSVC (C=1.5, dual=False), fitted as one binary model per tag.
classifier = LinearSVC(C=1.5, penalty='l1', dual=False)
clf = OneVsRestClassifier(classifier).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print_score(y_pred, classifier)

Clf:  LinearSVC
Jacard score: 63.87096774193548
----


In [126]:
# Try each baseline estimator in turn, wrapped one-vs-rest (one binary model per tag).
for classifier in (sgd, lr, svc):
    clf = OneVsRestClassifier(classifier)
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print_score(y_pred, classifier)

Clf:  SGDClassifier
Jacard score: 52.61960664216687
----
Clf:  LogisticRegression
Jacard score: 51.1014699877501
----
Clf:  LinearSVC
Jacard score: 62.42105621342044
----


## Model Test with Real Data

In [127]:
# A single hand-written question to try the fitted model on.
x = [ 'how to write ml code in python and java i have data but do not know how to do it']

In [128]:
# Vectorize with the already-fitted TF-IDF (transform only, no refit); the result is a sparse matrix.
xt = tfidf.transform(x)

In [130]:
# Predict tag indicators for the sample question; clf is whichever model was fitted last.
clf.predict(xt)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [131]:
# Map the predicted indicator row back to tag names (the classes whose value is 1).
multilabel.inverse_transform(clf.predict(xt))

[('java', 'python')]