In [5]:
import pandas as pd
from ast import literal_eval

In [6]:
def read_data(filename):
    data = pd.read_csv(filename, sep='\t')
    data['tags'] = data['tags'].apply(literal_eval)
    return data

train = read_data('data/train.tsv')
validation = read_data('data/validation.tsv')
test = pd.read_csv('data/test.tsv', sep='\t')

X_train, y_train = train['title'].values, train['tags'].values
X_val, y_val = validation['title'].values, validation['tags'].values
X_test = test['title'].values

In [7]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    text = text.lower()
    text = re.sub(REPLACE_BY_SPACE_RE, ' ', text)
    text = re.sub(BAD_SYMBOLS_RE, '', text)
    return ' '.join(word for word in word_tokenize(text) if word not in STOPWORDS)

X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]
X_test = [text_prepare(x) for x in X_test]

In [9]:
from collections import Counter

words_counts = Counter(word for sentence in X_train for word in sentence.split())
tags_counts = Counter(tag for tags in y_train for tag in tags)

In [10]:
import numpy as np
from scipy import sparse as sp_sparse

DICT_SIZE = 5000
most_common_words = [word for word, _ in words_counts.most_common(DICT_SIZE)]
WORDS_TO_INDEX = {word: i for i, word in enumerate(most_common_words)}
INDEX_TO_WORDS = {i: word for word, i in WORDS_TO_INDEX.items()}

def my_bag_of_words(text, words_to_index, dict_size):
    result_vector = np.zeros(dict_size)
    for word in text.split():
        if word in words_to_index:
            result_vector[words_to_index[word]] += 1
    return result_vector

X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_val, X_test):
    vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern=r'(\S+)')
    X_train = vectorizer.fit_transform(X_train)
    X_val = vectorizer.transform(X_val)
    X_test = vectorizer.transform(X_test)
    return X_train, X_val, X_test, vectorizer.vocabulary_

X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
tfidf_reversed_vocab = {i: word for word, i in tfidf_vocab.items()}

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_train = mlb.fit_transform(y_train)
y_val = mlb.transform(y_val)

In [13]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

def train_classifier(X_train, y_train, penalty='l2'):
    if penalty == 'l1':
        model = OneVsRestClassifier(LogisticRegression(penalty=penalty, solver='liblinear'))
    else:
        model = OneVsRestClassifier(LogisticRegression(penalty=penalty))
    return model.fit(X_train, y_train)

classifier_mybag_l1 = train_classifier(X_train_mybag, y_train, 'l1')
classifier_mybag_l2 = train_classifier(X_train_mybag, y_train, 'l2')
classifier_tfidf_l1 = train_classifier(X_train_tfidf, y_train, 'l1')
classifier_tfidf_l2 = train_classifier(X_train_tfidf, y_train, 'l2')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [14]:
from sklearn.metrics import accuracy_score, f1_score, average_precision_score

def evaluate(y_val, predicted):
    print("Accuracy:", accuracy_score(y_val, predicted))
    print("F1 Macro:", f1_score(y_val, predicted, average='macro'))
    print("F1 Micro:", f1_score(y_val, predicted, average='micro'))
    print("Precision Macro:", average_precision_score(y_val, predicted, average='macro'))

print('Bag-of-words L1')
evaluate(y_val, classifier_mybag_l1.predict(X_val_mybag))
print('Bag-of-words L2')
evaluate(y_val, classifier_mybag_l2.predict(X_val_mybag))
print('Tfidf L1')
evaluate(y_val, classifier_tfidf_l1.predict(X_val_tfidf))
print('Tfidf L2')
evaluate(y_val, classifier_tfidf_l2.predict(X_val_tfidf))

Bag-of-words L1
Accuracy: 0.36336666666666667
F1 Macro: 0.5184369982501239
F1 Micro: 0.6771429748321457
Precision Macro: 0.35548568534985414
Bag-of-words L2
Accuracy: 0.3566666666666667
F1 Macro: 0.5038295669070777
F1 Micro: 0.670000935813586
Precision Macro: 0.3435869170207129
Tfidf L1
Accuracy: 0.3653666666666667
F1 Macro: 0.5090399703429909
F1 Micro: 0.6750588774723328
Precision Macro: 0.3492932305071853
Tfidf L2
Accuracy: 0.33436666666666665
F1 Macro: 0.44489652742155206
F1 Micro: 0.6414867055061545
Precision Macro: 0.3014584947034525


In [15]:
y_test_predicted_labels_tfidf_l1 = classifier_tfidf_l1.predict(X_test_tfidf)
y_test_predicted = mlb.inverse_transform(y_test_predicted_labels_tfidf_l1)

for title, tags in zip(test['title'].values[:20], y_test_predicted[:20]):
    print(f"{title}\n{tags}\n")

('mysql', 'php')

get click coordinates from <input type='image'> via javascript
('javascript',)

How to implement cloud storage for media assets in ZF?
()

What is catcomplete in jQuery's autocomplete plugin?
('javascript', 'jquery')

Error building Android app with Cordova 3.1 CLI
('android', 'java')

How to Parse XML File in PHP
('php', 'xml')

Uploading files via JSON Post request to a Web Service provided by Teambox
('json',)

Adding rows to JTable in the right order.
('java', 'swing')

How to read input file in Python?
('python',)

PDF generation from an html containing images and text
('html',)

Trying to get sql query to be dynamic with jquery
('jquery',)

Fiting 2-parameters weibull distribution for tabulated data
('r',)

Add six months in php
('php',)

Where/How to code Constants in Rails 3 Application
('ruby-on-rails', 'ruby-on-rails-3')

Comparing list of items with one of the column in DataTable
('c#',)

python sort upper case and lower case
('python',)

How do I make a po

In [17]:
def print_words_for_tag(classifier, tag, index_to_words):
    print(f'Tag: {tag}')
    est = classifier.estimators_[mlb.classes.index(tag)]
    top_positive_words = [index_to_words[i] for i in est.coef_.argsort()[0][-5:]]
    top_negative_words = [index_to_words[i] for i in est.coef_.argsort()[0][:5]]
    print('Top positive words:', ', '.join(top_positive_words))
    print('Top negative words:', ', '.join(top_negative_words))

for tag in ['c', 'c++', 'linux', 'python', 'java', 'android', 'r', 'ios', 'windows']:
    print_words_for_tag(classifier_tfidf_l1, tag, tfidf_reversed_vocab)

Tag: c
Top positive words: fscanf, printf, malloc, scanf, c
Top negative words: c #, php, javascript, java, python
Tag: c++
Top positive words: stl, mfc, qt, boost, c++
Top negative words: php, java, javascript, python, jquery
Tag: linux
Top positive words: ubuntu, system call, signal, kernel space, linux
Top negative words: #, javascript, jquery, array, aspnet
Tag: python
Top positive words: tkinter, matplotlib, pandas, numpy, python
Top negative words: php, java, c, django python, jquery
Tag: java
Top positive words: jtable, jar, hibernate, spring, java
Top negative words: php, python, ruby, rails, django
Tag: android
Top positive words: intent, edittext, asynctask, retrofit, android
Top negative words: python, c, swift, iphone, phonegap android
Tag: r
Top positive words: rstudio, shiny, ggplot, ggplot2, r
Top negative words: android, php, python, java, c
Tag: ios
Top positive words: uilabel, afnetworking, uicollectionview, swift, ios
Top negative words: java, python, php, jquery, ra