In [1]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

In [2]:
from ast import literal_eval
import pandas as pd
import numpy as np

In [3]:
def read_data(filename):
    """Load a tab-separated dataset and parse its ``tags`` column.

    Args:
        filename: path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The loaded DataFrame, with every ``tags`` cell converted from its
        string form (e.g. ``"['php', 'mysql']"``) into the Python object
        produced by ``ast.literal_eval``.
    """
    frame = pd.read_csv(filename, sep='\t')
    # Tags are stored as the repr of a Python list; turn them back into lists.
    frame['tags'] = frame['tags'].map(literal_eval)
    return frame

In [4]:
# Labeled splits: read_data parses the `tags` column into Python lists.
train = read_data('train.tsv')
validation = read_data('validation.tsv')

# The test file is read with plain read_csv (no tag parsing is applied).
test = pd.read_csv('test.tsv', sep='\t')

# Raw copy of the training file, kept to compare unparsed vs parsed `tags`.
tmp = pd.read_csv('train.tsv', sep='\t')

In [5]:
# Preview the first rows of the training frame loaded through read_data.
train.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [6]:
# Preview the training file as read by plain pd.read_csv (tags not passed through literal_eval).
tmp.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,['r']
1,mysql select all records where a datetime fiel...,"['php', 'mysql']"
2,How to terminate windows phone 8.1 app,['c#']
3,get current time in a specific country via jquery,"['javascript', 'jquery']"
4,Configuring Tomcat to Use SSL,['java']


In [7]:
# Inputs are the `title` column values; targets are the `tags` column values.
X_train = train['title'].values
y_train = train['tags'].values

X_val = validation['title'].values
y_val = validation['tags'].values

# Only titles are taken from the test frame.
X_test = test['title'].values

In [8]:
# Tags of the second training example (index 1).
y_train[1]

['php', 'mysql']

In [9]:
import re

In [10]:
# Separator characters that get replaced by a space. Written as a raw string
# because \[ \] \| are regex escapes, not valid Python string escapes; a plain
# string literal triggers an invalid-escape-sequence warning.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than digits, lowercase a-z, space, '#', '+' and '_' is removed.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# English stop words from the NLTK corpus, held in a set for membership tests.
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """Normalize a question title before vectorization.

    Steps, in order: lowercase the text, replace separator characters with
    spaces, delete every remaining character outside the allowed set, and
    drop stop words.

    Args:
        text: the raw title string.

    Returns:
        The cleaned title: surviving tokens joined by single spaces.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # split() with no argument also collapses the runs of spaces created above.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

In [11]:
# prepared_questions = []
# for line in open('text_prepare_tests.tsv', encoding='utf-8'):
#     line = text_prepare(line.strip())
#     prepared_questions.append(line)
# text_prepare_results = '\n'.join(prepared_questions)

FileNotFoundError: [Errno 2] No such file or directory: 'text_prepare_tests.tsv'

In [12]:
# Run every title of each split through text_prepare; results are plain lists.
X_train = list(map(text_prepare, X_train))
X_val = list(map(text_prepare, X_val))
X_test = list(map(text_prepare, X_test))

In [13]:
# First three training titles after text_prepare.
X_train[:3]

['draw stacked dotplot r',
 'mysql select records datetime field less specified value',
 'terminate windows phone 81 app']

In [14]:
from collections import defaultdict

# Frequency tables built from the training split only.
tags_counts = defaultdict(int)
words_counts = defaultdict(int)

# One increment per tag occurrence across all training examples.
for tag in (label for labels in y_train for label in labels):
    tags_counts[tag] += 1

# One increment per whitespace-separated token of the prepared titles.
for word in (token for title in X_train for token in title.split()):
    words_counts[word] += 1

In [15]:
# Rank (item, count) pairs by count, highest first, and keep the leading entries:
# ten tags and three words.
most_common_tags = sorted(
    tags_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]
most_common_words = sorted(
    words_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:3]

In [16]:
# The ten most frequent training tags with their occurrence counts.
most_common_tags

[('javascript', 19078),
 ('c#', 19077),
 ('java', 18661),
 ('php', 13907),
 ('python', 8940),
 ('jquery', 7510),
 ('c++', 6469),
 ('html', 4668),
 ('objective-c', 4338),
 ('asp.net', 3939)]

In [17]:
from scipy import sparse as sp_sparse

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
??TfidVectorizer

Object `TfidVectorizer` not found.


In [20]:
# token_pattern is a raw string: \S is a regex class, not a Python string escape,
# and a plain literal triggers an invalid-escape-sequence warning.
# Tokens are maximal runs of non-whitespace characters.
tfidf_vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.9,
    ngram_range=(1, 2),
    token_pattern=r'(\S+)',
)
# Fit on the training titles only; validation and test reuse the fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
tfidf_vocab = tfidf_vectorizer.vocabulary_
# X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
# Inverse lookup: feature index -> term.
tfidf_reversed_vocab = {index: term for term, index in tfidf_vocab.items()}

In [21]:
# Look up the vocabulary entry for the token 'c++' (present because tokens are runs of non-whitespace).
tfidf_vocab['c++']

1976

In [22]:
from sklearn.preprocessing import MultiLabelBinarizer

In [23]:
# Fix the label set and its column order to the sorted tags seen in training.
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
# NOTE: y_train / y_val are rebound from tag lists to indicator matrices here,
# so this cell is meant to run once after the data-loading cells.
y_train = mlb.fit_transform(y_train)
# Validation labels are only transformed with the binarizer fitted above;
# they are never used to refit it.
y_val = mlb.transform(y_val)

In [24]:
# Binarized label row of the first training example.
y_train[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Train the classifiers for different data transformations: *bag-of-words* and *tf-idf*.

In [None]:
%%time
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
# Wrap RidgeClassifier in a one-vs-rest scheme and fit it on the TF-IDF features.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions this constructor call raises TypeError — confirm the installed version.
clf = OneVsRestClassifier(RidgeClassifier(normalize=True))
clf.fit(X_train_tfidf, y_train)
# classifier_tfidf = train_classifier(X_train_tfidf, y_train)

Now you can create predictions for the data. You will need two types of predictions: labels and scores.

In [27]:
%%time
# Predicted labels for the validation TF-IDF features; scores are not computed here.
y_val_predicted_labels_tfidf = clf.predict(X_val_tfidf)
# y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)

Wall time: 184 ms


Now take a look at how the classifier, which uses TF-IDF features, performs on a few examples:

In [28]:
# Hand-written sample questions (the first one is deliberately empty).
que = [
    '',
    'How to avoid Java code in JSP files? I am new to Java EE and I know that something like the following three lines',
    'Providing white space in a Swing GUI',
    'How to use Servlets and Ajax?',
]
# Apply the same text preparation and fitted vectorizer as for the training data.
que = tfidf_vectorizer.transform([text_prepare(question) for question in que])
yp = clf.predict(que)
# Map the predicted indicator rows back to tag names.
print(mlb.inverse_transform(yp))

[('java', 'jsp', 'servlets'), ('java', 'swing'), ('ajax', 'java', 'javascript', 'jquery', 'php', 'servlets')]


In [29]:
import pickle
import os

# Persist the fitted classifier, label binarizer and vectorizer.
# Each file is opened in a `with` block so the handle is closed (and the
# pickle fully flushed to disk) as soon as the dump finishes.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)
with open('tags.pkl', 'wb') as tags_file:
    pickle.dump(mlb, tags_file)
with open('tf.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)