In [89]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from ast import literal_eval
import pandas as pd
import numpy as np
import re
from scipy import sparse as sp_sparse
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def read_data(filename):
	data = pd.read_csv(filename, sep='\t')
	data['tags'] = data['tags'].apply(literal_eval)

	return data

In [3]:
train = read_data('data/train.tsv')
validation = read_data('data/validation.tsv')
test = pd.read_csv('data/test.tsv', sep='\t')

In [49]:
X_train, y_train = train['title'].values, train['tags'].values
X_validation, y_validation = validation['title'].values, validation['tags'].values
X_test = test['title'].values

In [50]:
X_train, y_train = X_train[:10], y_train[:10]
X_val, y_val = X_validation[:5], y_validation[:5]

In [121]:
X_train = ['How to draw a stacked dotplot in R?',
 'mysql select all records where a datetime field is less than a specified value mysql',
 'How to terminate windows phone 8.1 app',
 'get current time in a specific country via jquery',
 'Configuring Tomcat to Use SSL',
 'Awesome nested set plugin - how to add new children to the tree at various levels plugin',
 'How to create map from JSON response in Ruby on Rails 3.8?',
 'rspec test if method is called Ruby on Rails',
 'SpringBoot Catalina LifeCycle Exception',
 'How to import data from excel to mysql import database using import php']

In [75]:
print(len(X_train))
#print(X_train.shape)
#print(X_train, '\n', y_train)

10


In [76]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOP_WORDS = set(stopwords.words('english'))

In [122]:
def text_prepare(text):

	text = text.lower()
	text = REPLACE_BY_SPACE_RE.sub(' ', text)
	text = BAD_SYMBOLS_RE.sub('', text)

	text_tokens = word_tokenize(text)
	final_text = ""
	#text = [w for w in text_tokens if not w in STOP_WORDS]
	for token in text_tokens:
		if token not in STOP_WORDS:
			final_text += token + " "

	final_text = final_text.strip()
	#print(final_text)
	return final_text

In [62]:
def test_text_prepare():
	examples = ["SQL Server(} - any equivalent; of !Excel's CHOOSE@ ~function?", "How to free c++ memory vector<int> * arr?"]
	answers = ["sql server equivalent excels choose function", "free c++ memory vectorint arr"]
	for ex, ans in zip(examples, answers):
		if text_prepare(ex) != ans:
			return "Wrong answer for the case: '%s'" % ex
	return 'Basic tests are passed.'

In [63]:
print(test_text_prepare())

Basic tests are passed.


In [123]:
X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]

In [124]:
print(X_train)#, '\n', y_train)

['draw stacked dotplot r', 'mysql select records datetime field less specified value mysql', 'terminate windows phone 81 app', 'get current time specific country via jquery', 'configuring tomcat use ssl', 'awesome nested set plugin add new children tree various levels plugin', 'create map json response ruby rails 38', 'rspec test method called ruby rails', 'springboot catalina lifecycle exception', 'import data excel mysql import database using import php']


In [135]:
train_tokens = [word_tokenize(i) for i in X_train]
train_words_tokens=[]
for each in X_train:
	tokens = word_tokenize(each)
	for token in tokens:
		train_words_tokens.append(token)
        
train_tags_tokens=[]
for each in y_train:
	#tokens = word_tokenize(each)
	for token in each:
		train_tags_tokens.append(token)

fdist_ttags = nltk.FreqDist(train_tags_tokens)

tags_counts = dict((word, freq) for word, freq in fdist_ttags.items() if not word.isdigit())

In [126]:
print(len(train_words_tokens))
print(train_words_tokens)

66
['draw', 'stacked', 'dotplot', 'r', 'mysql', 'select', 'records', 'datetime', 'field', 'less', 'specified', 'value', 'mysql', 'terminate', 'windows', 'phone', '81', 'app', 'get', 'current', 'time', 'specific', 'country', 'via', 'jquery', 'configuring', 'tomcat', 'use', 'ssl', 'awesome', 'nested', 'set', 'plugin', 'add', 'new', 'children', 'tree', 'various', 'levels', 'plugin', 'create', 'map', 'json', 'response', 'ruby', 'rails', '38', 'rspec', 'test', 'method', 'called', 'ruby', 'rails', 'springboot', 'catalina', 'lifecycle', 'exception', 'import', 'data', 'excel', 'mysql', 'import', 'database', 'using', 'import', 'php']


In [127]:
fdist_twords = nltk.FreqDist(train_words_tokens)
words_counts = dict((word, freq) for word, freq in fdist_twords.items() if not word.isdigit())
most_common_wordss = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)#[:3]
most_common_words = [each[0] for each in most_common_wordss]

In [128]:
print(len(most_common_words))
print(most_common_words)

57
['mysql', 'import', 'plugin', 'ruby', 'rails', 'draw', 'stacked', 'dotplot', 'r', 'select', 'records', 'datetime', 'field', 'less', 'specified', 'value', 'terminate', 'windows', 'phone', 'app', 'get', 'current', 'time', 'specific', 'country', 'via', 'jquery', 'configuring', 'tomcat', 'use', 'ssl', 'awesome', 'nested', 'set', 'add', 'new', 'children', 'tree', 'various', 'levels', 'create', 'map', 'json', 'response', 'rspec', 'test', 'method', 'called', 'springboot', 'catalina', 'lifecycle', 'exception', 'data', 'excel', 'database', 'using', 'php']


In [129]:
#DICT_SIZE = 5000
DICT_SIZE = len(most_common_words)
WORDS_TO_INDEX = {}
INDEX_TO_WORDS = {}
index = 0

for words in most_common_words[:DICT_SIZE]:
	WORDS_TO_INDEX[words] = index
	INDEX_TO_WORDS[index] = words
	index += 1

ALL_WORDS = WORDS_TO_INDEX.keys()

In [130]:
def my_bag_of_words(text, words_to_index, dict_size):
	result_vector = np.zeros(dict_size)
	#all_words = words_to_index.keys()
	tokens = word_tokenize(text)
	for each in tokens:
		if each in ALL_WORDS:
			i = words_to_index[each]
			result_vector[i] += 1

	return result_vector

In [142]:
#from scipy import sparse as sp_sparse
#X_train_mybag = [my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE) for text in X_train]
#X_val, y_val = X_validation[:5], y_validation[:5]
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])

In [143]:
#print(len(X_train_mybag[2]))
print(X_train_mybag.shape)
print(X_val_mybag.shape)

(10, 57)
(5, 57)


In [150]:
#tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=5, max_df=0.90, decode_error='ignore', token_pattern='(\S+)')
print(most_common_words)
tfidf_vectorizer = TfidfVectorizer(input=most_common_words, max_features=50)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.fit_transform(X_val)

['mysql', 'import', 'plugin', 'ruby', 'rails', 'draw', 'stacked', 'dotplot', 'r', 'select', 'records', 'datetime', 'field', 'less', 'specified', 'value', 'terminate', 'windows', 'phone', 'app', 'get', 'current', 'time', 'specific', 'country', 'via', 'jquery', 'configuring', 'tomcat', 'use', 'ssl', 'awesome', 'nested', 'set', 'add', 'new', 'children', 'tree', 'various', 'levels', 'create', 'map', 'json', 'response', 'rspec', 'test', 'method', 'called', 'springboot', 'catalina', 'lifecycle', 'exception', 'data', 'excel', 'database', 'using', 'php']


In [151]:
#print(type(X_train_tfidf))
#print(X_train_tfidf.todense().shape)
#vocabs = tfidf_vectorizer.vocabulary_.keys()
#print(vocabs)
print(X_train_tfidf.todense().shape)
print(X_val_tfidf.todense().shape)

(10, 50)
(5, 34)


In [136]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_train = mlb.fit_transform(y_train)

In [138]:
print(y_train.shape)

(10, 14)


In [140]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

model = OneVsRestClassifier(LogisticRegression())
#model.fit(X_train_mybag, y_train)
model.fit(X_train_tfidf, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [149]:
model.predict(X_val_tfidf)

ValueError: X has 34 features per sample; expecting 58