In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.lines as mlines

import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss
from sklearn.metrics import confusion_matrix

from scipy.sparse import hstack

In [2]:
questions = pd.read_csv("Questions.csv")
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...


In [3]:
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607282 entries, 0 to 607281
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Id            607282 non-null  int64  
 1   OwnerUserId   601070 non-null  float64
 2   CreationDate  607282 non-null  object 
 3   Score         607282 non-null  int64  
 4   Title         607282 non-null  object 
 5   Body          607282 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 27.8+ MB


In [4]:
tags = pd.read_csv("Tags.csv", dtype={'Tag':str})
tags.head()

Unnamed: 0,Id,Tag
0,469,python
1,469,osx
2,469,fonts
3,469,photoshop
4,502,python


In [5]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1885078 entries, 0 to 1885077
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Id      int64 
 1   Tag     object
dtypes: int64(1), object(1)
memory usage: 28.8+ MB


In [6]:
tags['Tag'] = tags['Tag'].astype(str)

In [7]:
grouped_tags = tags.groupby("Id")['Tag'].apply(lambda tags: ' '.join(tags))

In [8]:
grouped_tags.head()

Id
469                           python osx fonts photoshop
502                             python windows image pdf
535    python continuous-integration extreme-programming
594                 python sql database oracle cx-oracle
683                              python arrays iteration
Name: Tag, dtype: object

In [9]:
grouped_tags.reset_index()

Unnamed: 0,Id,Tag
0,469,python osx fonts photoshop
1,502,python windows image pdf
2,535,python continuous-integration extreme-programming
3,594,python sql database oracle cx-oracle
4,683,python arrays iteration
...,...,...
607278,40143190,python bash multiline
607279,40143228,python selenium-webdriver
607280,40143267,python django django-rest-framework
607281,40143338,python


In [10]:
grouped_tags_final = pd.DataFrame({'Id':grouped_tags.index, 'Tags':grouped_tags.values})

In [11]:
grouped_tags_final.head(5)

Unnamed: 0,Id,Tags
0,469,python osx fonts photoshop
1,502,python windows image pdf
2,535,python continuous-integration extreme-programming
3,594,python sql database oracle cx-oracle
4,683,python arrays iteration


In [12]:
questions.drop(columns=['OwnerUserId', 'CreationDate'], inplace=True)

In [13]:
questions = questions.merge(grouped_tags_final, on='Id')
questions.head()

Unnamed: 0,Id,Score,Title,Body,Tags
0,469,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,python osx fonts photoshop
1,502,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,python windows image pdf
2,535,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,python continuous-integration extreme-programming
3,594,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,python sql database oracle cx-oracle
4,683,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,python arrays iteration


In [14]:
filtered_questions = questions[questions['Score']>5]

In [15]:
filtered_questions.drop(columns = ['Id', 'Score'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [16]:
filtered_questions.head()

Unnamed: 0,Title,Body,Tags
0,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,python osx fonts photoshop
1,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,python windows image pdf
2,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,python continuous-integration extreme-programming
3,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,python sql database oracle cx-oracle
4,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,python arrays iteration


In [17]:
filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: x.split())

all_tags = [item for sublist in filtered_questions['Tags'].values for item in sublist]

print(len(all_tags))

my_set = set(all_tags)
unique_tags = list(my_set)
print(len(unique_tags))

134101
6384


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: x.split())


In [18]:
flat_list = [item for sublist in filtered_questions['Tags'].values for item in sublist]

keywords = nltk.FreqDist(flat_list)

keywords = nltk.FreqDist(keywords)

frequencies_words = keywords.most_common(100)
tags_features = [word[0] for word in frequencies_words]

In [19]:
tags_features

['python',
 'django',
 'numpy',
 'matplotlib',
 'pandas',
 'python-3.x',
 'python-2.7',
 'list',
 'string',
 'flask',
 'dictionary',
 'scipy',
 'regex',
 'performance',
 'google-app-engine',
 'sqlalchemy',
 'arrays',
 'pip',
 'windows',
 'algorithm',
 'unit-testing',
 'linux',
 'unicode',
 'multithreading',
 'django-models',
 'osx',
 'json',
 'datetime',
 'c++',
 'mysql',
 'virtualenv',
 'multiprocessing',
 'subprocess',
 'class',
 'java',
 'c',
 'ipython',
 'file',
 'csv',
 'logging',
 'exception',
 'opencv',
 'sorting',
 'selenium',
 'tkinter',
 'python-imaging-library',
 'javascript',
 'module',
 'celery',
 'function',
 'import',
 'parsing',
 'cython',
 'scikit-learn',
 'math',
 'xml',
 'dataframe',
 'html',
 'pycharm',
 'machine-learning',
 'beautifulsoup',
 'plot',
 'list-comprehension',
 'generator',
 'django-admin',
 'pyqt',
 'postgresql',
 'debugging',
 'nltk',
 'tuples',
 'file-io',
 'urllib2',
 'oop',
 'setuptools',
 'sockets',
 'decorator',
 'image-processing',
 'image',
 'r

In [20]:
def most_common(tags):
    tags_filtered = []
    for i in range(0, len(tags)):
        if tags[i] in tags_features:
            tags_filtered.append(tags[i])
    return tags_filtered

In [21]:
filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: most_common(x))
filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: x if len(x)>0 else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: most_common(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: x if len(x)>0 else None)


In [22]:
filtered_questions.head()

Unnamed: 0,Title,Body,Tags
0,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,"[python, osx]"
1,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,"[python, windows, image]"
2,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,[python]
3,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,"[python, sql, database]"
4,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,"[python, arrays]"


In [23]:
filtered_questions.shape

(42420, 3)

In [24]:
filtered_questions.dropna(subset=['Tags'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions.dropna(subset=['Tags'], inplace=True)


In [25]:
filtered_questions.shape

(42420, 3)

In [26]:
filtered_questions.head()

Unnamed: 0,Title,Body,Tags
0,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,"[python, osx]"
1,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,"[python, windows, image]"
2,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,[python]
3,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,"[python, sql, database]"
4,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,"[python, arrays]"


In [27]:
filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: BeautifulSoup(x).get_text()) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: BeautifulSoup(x).get_text())


In [28]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'\n", " ", text)
    text = re.sub(r"\'\xa0", " ", text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [29]:
filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: clean_text(x)) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: clean_text(x))


In [30]:
token=ToktokTokenizer()

In [31]:
punct = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'

In [32]:
def strip_list_noempty(mylist):
    newlist = (item.strip() if hasattr(item, 'strip') else item for item in mylist)
    return [item for item in newlist if item != '']

In [33]:
def clean_punct(text): 
    words=token.tokenize(text)
    punctuation_filtered = []
    regex = re.compile('[%s]' % re.escape(punct))
    remove_punctuation = str.maketrans(' ', ' ', punct)
    for w in words:
        if w in tags_features:
            punctuation_filtered.append(w)
        else:
            punctuation_filtered.append(regex.sub('', w))
  
    filtered_list = strip_list_noempty(punctuation_filtered)
        
    return ' '.join(map(str, filtered_list))

In [34]:
filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: clean_punct(x))
filtered_questions['Body'][2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: clean_punct(x))


'i am starting work on a hobby project with a python codebase and would like to set up some form of continuous integration ie running a battery of test-cases each time a check-in is made and sending nag e-mails to responsible persons when the tests fail similar to cruisecontrol or teamcity i realize i could do this with hooks in most vcses but that requires that the tests run on the same machine as the version control server which is not as elegant as i would like does anyone have any suggestions for a small user-friendly open-source continuous integration system suitable for a python codebase'

In [35]:
lemma=WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [36]:
def lemitizeWords(text):
    words=token.tokenize(text)
    listLemma=[]
    for w in words:
        x=lemma.lemmatize(w, pos="v")
        listLemma.append(x)
    return ' '.join(map(str, listLemma))

def stopWordsRemove(text):
    
    stop_words = set(stopwords.words("english"))
    
    words=token.tokenize(text)
    
    filtered = [w for w in words if not w in stop_words]
    
    return ' '.join(map(str, filtered))

In [37]:
filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: lemitizeWords(x)) 
filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: stopWordsRemove(x)) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: lemitizeWords(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: stopWordsRemove(x))


In [38]:
filtered_questions['Title'] = filtered_questions['Title'].apply(lambda x: str(x))
filtered_questions['Title'] = filtered_questions['Title'].apply(lambda x: clean_text(x)) 
filtered_questions['Title'] = filtered_questions['Title'].apply(lambda x: clean_punct(x)) 
filtered_questions['Title'] = filtered_questions['Title'].apply(lambda x: lemitizeWords(x)) 
filtered_questions['Title'] = filtered_questions['Title'].apply(lambda x: stopWordsRemove(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Title'] = filtered_questions['Title'].apply(lambda x: str(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Title'] = filtered_questions['Title'].apply(lambda x: clean_text(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Title'] = filt

In [39]:
filtered_questions.head(10)

Unnamed: 0,Title,Body,Tags
0,find full path font display name mac,use photoshop javascript api find fonts give p...,"[python, osx]"
1,get preview jpeg pdf windows,cross-platform python application need generat...,"[python, windows, image]"
2,continuous integration system python codebase,start work hobby project python codebase would...,[python]
3,cxoracle iterate result set,several ways iterate result set tradeoff,"[python, sql, database]"
4,use match attribute python object array,remember whether dream seem recall function al...,"[python, arrays]"
5,class view django,django view point function problem want change...,"[python, django, oop]"
6,python mysql,get python work postgresql cannot get work mys...,"[python, mysql, postgresql]"
7,use python itertoolsgroupby,able find understandable explanation actually ...,[python]
8,add method exist object instance,read possible add method exist object eg class...,"[python, oop]"
9,express binary literals python,express integer binary number python literals ...,[python]


In [40]:
X1 = filtered_questions['Body']
X2 = filtered_questions['Title']
y = filtered_questions['Tags']

In [41]:
vectorizer_X1 = TfidfVectorizer(analyzer = 'word',
                                       min_df=0.0,
                                       max_df = 1.0,
                                       strip_accents = None,
                                       encoding = 'utf-8', 
                                       preprocessor=None,
                                       token_pattern=r"(?u)\S\S+",
                                       max_features=1000)

vectorizer_X2 = TfidfVectorizer(analyzer = 'word',
                                       min_df=0.0,
                                       max_df = 1.0,
                                       strip_accents = None,
                                       encoding = 'utf-8', 
                                       preprocessor=None,
                                       token_pattern=r"(?u)\S\S+",
                                       max_features=1000)

In [42]:
X1_tfidf = vectorizer_X1.fit_transform(X1)
X2_tfidf = vectorizer_X2.fit_transform(X2)

In [43]:
X_tfidf = hstack([X1_tfidf,X2_tfidf])

In [44]:
multilabel_binarizer = MultiLabelBinarizer()
y_bin = multilabel_binarizer.fit_transform(y)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_bin, test_size = 0.2, random_state = 0)

In [46]:
def avg_jacard(y_true,y_pred):
    jacard = np.minimum(y_true,y_pred).sum(axis=1) / np.maximum(y_true,y_pred).sum(axis=1)
    
    return jacard.mean()*100

def print_score(y_pred, clf):
    print("Clf: ", clf.__class__.__name__)
    print("Jacard score: {}".format(avg_jacard(y_test, y_pred)))
    print("Hamming loss: {}".format(hamming_loss(y_pred, y_test)*100))
    print("---") 

In [47]:
sgd = SGDClassifier()
clf = OneVsRestClassifier(sgd)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print_score(y_pred, sgd)



Clf:  SGDClassifier
Jacard score: 77.42534967782491
Hamming loss: 0.593941537010844
---


In [48]:
for i in range(y_train.shape[1]):
    print(multilabel_binarizer.classes_[i])
    print(confusion_matrix(y_test[:,i], y_pred[:,i]))
    print("")

algorithm
[[8385    5]
 [  84   10]]

argparse
[[8455    0]
 [   3   26]]

arrays
[[8397    0]
 [  84    3]]

beautifulsoup
[[8429    1]
 [  23   31]]

c
[[8407    0]
 [  77    0]]

c++
[[8401    9]
 [  47   27]]

celery
[[8428    1]
 [   6   49]]

class
[[8419    0]
 [  65    0]]

csv
[[8417    9]
 [  14   44]]

cython
[[8437    1]
 [  12   34]]

database
[[8446    0]
 [  38    0]]

dataframe
[[8429    0]
 [  55    0]]

datetime
[[8392   10]
 [  53   29]]

debugging
[[8440    3]
 [  38    3]]

decorator
[[8428    8]
 [  13   35]]

dictionary
[[8284   43]
 [  54  103]]

django
[[7700   37]
 [ 143  604]]

django-admin
[[8419   13]
 [  19   33]]

django-models
[[8404    0]
 [  80    0]]

django-templates
[[8451    0]
 [  33    0]]

exception
[[8417    7]
 [  54    6]]

file
[[8427    0]
 [  57    0]]

file-io
[[8445    0]
 [  39    0]]

flask
[[8312    3]
 [  51  118]]

function
[[8437    0]
 [  47    0]]

generator
[[8438   10]
 [  20   16]]

google-app-engine
[[8359    9]
 [  43   73]]

In [68]:
answers = pd.read_csv("Answers.csv")
answers.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilit...
1,518,153.0,2008-08-02T17:42:28Z,469,2,<p>I haven't been able to find anything that d...
2,536,161.0,2008-08-02T18:49:07Z,502,9,<p>You can use ImageMagick's convert utility f...
3,538,156.0,2008-08-02T18:56:56Z,535,23,<p>One possibility is Hudson. It's written in...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B..."


In [56]:
grouped_tags_final.head(5)

Unnamed: 0,Id,Tags
0,469,python osx fonts photoshop
1,502,python windows image pdf
2,535,python continuous-integration extreme-programming
3,594,python sql database oracle cx-oracle
4,683,python arrays iteration


In [69]:
answers.drop(columns=['OwnerUserId', 'CreationDate', 'Id'], inplace=True)

In [70]:
answers.rename(columns = {'ParentId':'Id'},inplace=True)

In [71]:
answers = answers.merge(grouped_tags_final, on='Id')
answers.head()

Unnamed: 0,Id,Score,Body,Tags
0,469,4,<p>open up a terminal (Applications-&gt;Utilit...,python osx fonts photoshop
1,469,2,<p>I haven't been able to find anything that d...,python osx fonts photoshop
2,469,12,<p>Unfortunately the only API that isn't depre...,python osx fonts photoshop
3,469,1,<p>There must be a method in Cocoa to get a li...,python osx fonts photoshop
4,502,9,<p>You can use ImageMagick's convert utility f...,python windows image pdf


In [72]:
answers = answers[answers['Score']>5]
answers.head()

Unnamed: 0,Id,Score,Body,Tags
2,469,12,<p>Unfortunately the only API that isn't depre...,python osx fonts photoshop
4,502,9,<p>You can use ImageMagick's convert utility f...,python windows image pdf
6,502,25,<p>ImageMagick delegates the PDF->bitmap conve...,python windows image pdf
7,535,23,<p>One possibility is Hudson. It's written in...,python continuous-integration extreme-programming
8,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B...",python continuous-integration extreme-programming


In [73]:
answers['Tags'] = answers['Tags'].apply(lambda x: x.split())
answers.head()

Unnamed: 0,Id,Score,Body,Tags
2,469,12,<p>Unfortunately the only API that isn't depre...,"[python, osx, fonts, photoshop]"
4,502,9,<p>You can use ImageMagick's convert utility f...,"[python, windows, image, pdf]"
6,502,25,<p>ImageMagick delegates the PDF->bitmap conve...,"[python, windows, image, pdf]"
7,535,23,<p>One possibility is Hudson. It's written in...,"[python, continuous-integration, extreme-progr..."
8,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B...","[python, continuous-integration, extreme-progr..."


In [74]:
answers.shape

(93192, 4)

In [75]:
answers.drop(columns=['Id', 'Score'], inplace=True)
answers.head()

Unnamed: 0,Body,Tags
2,<p>Unfortunately the only API that isn't depre...,"[python, osx, fonts, photoshop]"
4,<p>You can use ImageMagick's convert utility f...,"[python, windows, image, pdf]"
6,<p>ImageMagick delegates the PDF->bitmap conve...,"[python, windows, image, pdf]"
7,<p>One possibility is Hudson. It's written in...,"[python, continuous-integration, extreme-progr..."
8,"<p>We run <a href=""http://buildbot.net/trac"">B...","[python, continuous-integration, extreme-progr..."


In [76]:
answers['Body'] = answers['Body'].apply(lambda x: BeautifulSoup(x).get_text()) 

In [77]:
answers['Body'] = answers['Body'].apply(lambda x: clean_text(x))

In [78]:
answers['Body'] = answers['Body'].apply(lambda x: clean_punct(x))
answers['Body'][2]

'unfortunately the only api that is not deprecated is located in the applicationservices framework which does not have a bridge support file and thus is not available in the bridge if you are wanting to use ctypes you can use atsfontgetfilereference after looking up the atsfontref cocoa does not have any native support at least as of 105 for getting the location of a font'

In [80]:
answers['Body'] = answers['Body'].apply(lambda x: lemitizeWords(x)) 
answers['Body'] = answers['Body'].apply(lambda x: stopWordsRemove(x)) 

answers.head()

Unnamed: 0,Body,Tags
2,unfortunately api deprecate locate application...,"[python, osx, fonts, photoshop]"
4,use imagemagick convert utility see examples h...,"[python, windows, image, pdf]"
6,imagemagick delegate pdf-bitmap conversion gho...,"[python, windows, image, pdf]"
7,one possibility hudson write java integration ...,"[python, continuous-integration, extreme-progr..."
8,run buildbot - trac work use much since code b...,"[python, continuous-integration, extreme-progr..."


In [81]:
def print_top10(feature_names, clf, class_labels):
    """Prints features with the highest coefficient values, per class"""
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("--------------------------------------------")
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top10)))
        print("--------------------------------------------")

In [82]:
feature_names = vectorizer_X1.get_feature_names() + vectorizer_X2.get_feature_names()

In [48]:
samq = '''In a multilabel classification problem, i use MultiLabelBinarizer to transform my 20 text labels into a binary list of zeros and ones.

After prediction I get my list of 20 binary values, and I would like to output the corresponding text labels.

I am just wondering whether MultiLabelBinarizer() provides a getting back transformation or I should do it manually.


'''

In [49]:
print(samq)

In a multilabel classification problem, i use MultiLabelBinarizer to transform my 20 text labels into a binary list of zeros and ones.

After prediction I get my list of 20 binary values, and I would like to output the corresponding text labels.

I am just wondering whether MultiLabelBinarizer() provides a getting back transformation or I should do it manually.





In [50]:
samq = BeautifulSoup(samq).get_text()

In [51]:
samq = clean_text(samq)
samq = clean_punct(samq)

In [52]:
samq = lemitizeWords(samq)
samq = stopWordsRemove(samq)

In [53]:
samq

'multilabel classification problem use multilabelbinarizer transform 20 text label binary list zero ones prediction get list 20 binary value would like output correspond text label wonder whether multilabelbinarizer provide get back transformation manually'

In [54]:
samq = vectorizer_X1.transform([samq])

In [55]:
samq

<1x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [61]:
multilabel_binarizer.inverse_transform(clf.predict(X_test))

[('python',),
 ('python',),
 ('linux', 'python'),
 ('numpy', 'python'),
 ('django', 'python'),
 ('list', 'python', 'tuples'),
 ('decorator', 'python'),
 ('python',),
 ('python',),
 ('python',),
 ('python',),
 ('pycharm', 'python'),
 ('python',),
 ('nltk', 'python'),
 ('python', 'tkinter'),
 ('python', 'scrapy'),
 ('python',),
 ('mongodb', 'pandas', 'python'),
 ('python',),
 ('python', 'scikit-learn'),
 ('python', 'sqlalchemy'),
 ('python',),
 ('python',),
 ('django', 'python'),
 ('numpy', 'python'),
 ('python',),
 ('python', 'windows'),
 ('python',),
 ('jinja2', 'python'),
 ('python',),
 ('python',),
 ('django', 'python'),
 ('matplotlib', 'python'),
 ('algorithm', 'python'),
 ('python', 'regex'),
 ('matplotlib', 'python'),
 ('python',),
 ('pip', 'python'),
 ('python',),
 ('python',),
 ('pip', 'python'),
 ('python',),
 ('python',),
 ('python',),
 ('python',),
 ('python',),
 ('python',),
 ('logging', 'python'),
 ('google-app-engine', 'python'),
 ('python',),
 ('multiprocessing', 'python'

In [56]:
samt = 'Scikit Learn Multilabel Classification, Getting back labels from MultiLabelBinarizer'
samt = clean_text(samt)
samt = clean_punct(samt)
samt = lemitizeWords(samt)
samt = stopWordsRemove(samt)
print(samt)

scikit learn multilabel classification get back label multilabelbinarizer


In [57]:
samt = vectorizer_X2.transform([samt])

In [58]:
samt

<1x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [59]:
q = hstack([samq, samt])

In [60]:
multilabel_binarizer.inverse_transform(clf.predict(q))

[('python', 'scikit-learn')]