# ___Multilabel Text Classification - Stackoverflow___

## ___Data Loading___

In [1]:
#Importing required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [2]:
# Subset of the complete Stack Overflow data; the first CSV column is used as the index
DATA_URL = 'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/stackoverflow.csv'
df = pd.read_csv(DATA_URL, index_col=0)

In [3]:
# Preview the first rows to check the columns that were loaded
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


In [6]:
# Look at the Tags value of the first row exactly as it was read from the CSV
df.Tags.iloc[0]

"['sql', 'asp.net']"

In [5]:
# Check the Python type of that value (the tag list was read in as a plain string)
type(df.Tags.iloc[0])

str

In [7]:
# Each Tags entry is a string that merely looks like a list, so it has to be
# parsed into a real list of tag strings.

# ast.literal_eval(node_or_string)
# Safely evaluates an expression node or a string containing a Python literal or container display.
# The string or node provided may only consist of the following Python literal structures:
# strings, bytes, numbers, tuples, lists, dicts, sets, booleans, and None.

import ast

# Try the parse on the first row before applying it to the whole column
ast.literal_eval(df['Tags'].iloc[0])

['sql', 'asp.net']

In [8]:
def parse_tags(raw):
  """Return the tag list encoded in `raw`.

  `raw` is parsed with ast.literal_eval when it is a string. Any other value
  (in particular an already-parsed list) is returned unchanged, because
  literal_eval raises ValueError on non-string input. This keeps the cell
  safe to re-run after the column has been converted.
  """
  return ast.literal_eval(raw) if isinstance(raw, str) else raw

# Replace the stringified tag lists with real Python lists
df['Tags'] = df['Tags'].apply(parse_tags)

In [9]:
# Confirm that Tags now displays as lists rather than quoted strings
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"[sql, asp.net]"
4,adding scripting functionality to net applicat...,"[c#, .net]"
5,should i use nested classes in this case i am ...,[c++]
6,homegrown consumption of web services i have b...,[.net]
8,automatically update version number i would li...,[c#]


## ___Data Preprocessing___

In [10]:
# Convert the tag lists to a binary indicator matrix, the target format expected by a multilabel classifier
from sklearn.preprocessing import MultiLabelBinarizer

multilabel = MultiLabelBinarizer()
# Learn the set of distinct tags and encode every row in one pass
y = multilabel.fit_transform(df['Tags'])

In [11]:
# Binary indicator matrix: one row per question, one column per tag
y

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
multilabel.classes_ # We have 20 classes here for this exercise

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

In [14]:
# Label the indicator columns with their tag names for readability
pd.DataFrame(y, columns=multilabel.classes_)

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48972,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
48974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [15]:
# TF-IDF features for the question text

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    analyzer='word',       # tokenize word by word
    max_features=10000,    # cap on the dictionary (vocabulary) size
    stop_words='english',
)
X = tfidf.fit_transform(df['Text'])

In [None]:
# Optional: uncomment the next line to inspect the vocabulary selected by TF-IDF
# tfidf.vocabulary_

In [17]:
# (number of documents, vocabulary size)
X.shape

(48976, 10000)

## ___Modelling___

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [41]:
# Defining Models

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# SGD shuffles the training data between epochs, so its seed is pinned
# (same seed as the train/test split) to make the reported score repeatable.
sgd = SGDClassifier(random_state=0)
lr = LogisticRegression(solver='lbfgs')
# dual=False is needed here: the L1 penalty is not supported by the dual solver
svc = LinearSVC(penalty= 'l1', dual=False)

In [42]:
# Multilabel classification: one binary classifier per tag (one-vs-rest)

from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score # Jaccard index: size of the intersection of the true and predicted label sets divided by the size of their union

def print_score(y_test, y_pred, clf):
  """Print a short report: the estimator's class name and its Jaccard score.

  y_test and y_pred are handed to jaccard_score unchanged, with
  average='samples'. clf is only used for its class name.
  Nothing is returned.
  """
  model_name = type(clf).__name__
  score = jaccard_score(y_test, y_pred, average='samples')
  print("Clf: ", model_name)
  print(f'Jacard score: {score}')
  print('----')

# Fit one one-vs-rest wrapper per base estimator and keep every fitted model,
# keyed by the base estimator's class name, so later cells can pick a model
# explicitly instead of depending on a leftover loop variable.
fitted_models = {}
for classifier in [sgd, lr, svc]:
  ovr_model = OneVsRestClassifier(classifier)
  ovr_model.fit(X_train, y_train)
  y_pred = ovr_model.predict(X_test)
  print_score(y_test, y_pred, classifier)
  fitted_models[classifier.__class__.__name__] = ovr_model

# Later cells call clf.predict(...); bind clf explicitly to the LinearSVC-based
# model (the last one fitted above).
clf = fitted_models[svc.__class__.__name__]

Clf:  SGDClassifier
Jacard score: 0.5233973050224582
----
Clf:  LogisticRegression
Jacard score: 0.5073227167551382
----
Clf:  LinearSVC
Jacard score: 0.6321985164012522
----


In [43]:
# Model Testing: run one hand-written question through the fitted pipeline

sample_questions = ['not able to integrate the systems in java and jquery']
# Reuse the already-fitted vectorizer so the features match the training vocabulary
test_data = tfidf.transform(sample_questions)
clf.predict(test_data)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

In [44]:
# Map the predicted indicator row back to its tag names
multilabel.inverse_transform(clf.predict(test_data))

[('java', 'jquery')]