In [14]:
#%pip install contractions

In [15]:
import pandas as pd 

In [16]:
# carregando os dados
questions_df = pd.read_csv('data/Questions.csv', encoding="ISO-8859-1", usecols =['Id','Score', 'Title', 'Body'])
questions_df = questions_df.sample(n=10000)

In [17]:
# Cleaning the data
import re
import string
import contractions
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

def normalize_text(s):
    s = s.lower()
    return s

def remove_html_tags(text):
    text = re.sub('<pre>.*?</pre>', '', text, flags=re.DOTALL)
    text = re.sub('<code>.*?</code>', '', text, flags=re.DOTALL)
    text = re.sub('<[^>]+>', '', text, flags=re.DOTALL)
    return text.replace("\n", "")

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)    

def remove_contractions(text):
    return contractions.fix(text)

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(w) for w in word_tokenize(text)])

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([w for w in word_tokenize(text) if not w in stop_words])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Danilo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
questions_df['Body'] = questions_df['Body'].apply(remove_html_tags)
questions_df['Body'] = questions_df['Body'].apply(remove_contractions)
questions_df['Body'] = questions_df['Body'].apply(normalize_text)
questions_df['Body'] = questions_df['Body'].apply(remove_stopwords)
questions_df['Body'] = questions_df['Body'].apply(lemmatize)
questions_df['Title'] = questions_df['Title'].apply(remove_html_tags)
questions_df['Title'] = questions_df['Title'].apply(remove_contractions)
questions_df['Title'] = questions_df['Title'].apply(normalize_text)
questions_df['Title'] = questions_df['Title'].apply(remove_stopwords)
questions_df['Title'] = questions_df['Title'].apply(lemmatize)

In [19]:
# Print the first rows of `questions_df`
print(questions_df.head())

               Id  Score                                              Title   
1101444  35869610      7            use access_coarse_location permission ?  \
484133   16906290      1               create custom filetype folder file ?   
444158   15594330      0                             rail flash.now partial   
570227   19707260      0    long running process new thread win form vb.net   
158154    6203020      4  starting subprocess via python multiprocessing...   

                                                      Body  
1101444  building android app track user 's geolocation...  
484133   question simple , bunch file ( image plain tex...  
444158   controller supposed display flash message . fl...  
570227   calling web service window form ( vb.net 4.5 )...  
158154   using pyaudio listen audio device `` stuff `` ...  


In [20]:

def join_strings(x):
    return " ".join(x)


# Load the tags dataset
tags_df = pd.read_csv("data/Tags.csv", encoding="ISO-8859-1", dtype={'Tag': str})
tags_df['Tag'] = tags_df['Tag'].astype(str)

tags_group = tags_df.groupby("Id")['Tag'].apply(join_strings)
tags = pd.DataFrame({'Id':tags_group.index, 'Tags':tags_group.values})


In [21]:
tags

Unnamed: 0,Id,Tags
0,80,flex actionscript-3 air
1,90,svn tortoisesvn branch branching-and-merging
2,120,sql asp.net sitemap
3,180,algorithm language-agnostic colors color-space
4,260,c# .net scripting compiler-construction
...,...,...
1264211,40143210,php .htaccess
1264212,40143300,google-bigquery
1264213,40143340,android android-studio
1264214,40143360,javascript vue.js


In [22]:
merged_df = pd.merge(questions_df, tags, on='Id')

In [23]:
# For y we need the order of tags to not affect the result, so we use an encoder to transform the tags into numbers
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(merged_df['Title'] + ' ' + merged_df['Body'])

mlb = MultiLabelBinarizer(sparse_output=True)

y = mlb.fit_transform(merged_df['Tags'].str.split(" "))

In [24]:
# Splitting the data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Training the model for a multilabel classification, using a OneVsRestClassifier with a LogisticRegression and knowing that the data is sparse
# We need the 4 most probable tags for each question, so we use predict_proba instead of predict

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(lr)

clf.fit(X_train, y_train)



In [25]:
y_pred = clf.predict_proba(X_test)

In [26]:
# Display the 4 most probable tags for each question
import numpy as np

for i in range(10):
    print("Question: " + merged_df['Title'].iloc[i])
    print("Real tags: " + str(np.array(mlb.inverse_transform(y_test)[i])))
    print("Predicted tags: " + str(np.array(mlb.classes_[y_pred[i].argsort()[-4:][::-1]])))
    print()

Question: use access_coarse_location permission ?
Real tags: ['css' 'frontend' 'html']
Predicted tags: ['java' 'c#' 'php' 'html']

Question: create custom filetype folder file ?
Real tags: ['c' 'extern']
Predicted tags: ['javascript' 'php' 'java' 'c#']

Question: rail flash.now partial
Real tags: ['gdb']
Predicted tags: ['c#' 'java' 'javascript' 'php']

Question: long running process new thread win form vb.net
Real tags: ['vb.net']
Predicted tags: ['c#' 'java' 'javascript' 'android']

Question: starting subprocess via python multiprocessing hang
Real tags: ['scala' 'spark-dataframe' 'apache-spark']
Predicted tags: ['c#' 'java' 'javascript' 'python']

Question: simulating android camera
Real tags: ['angularjs']
Predicted tags: ['ios' 'angularjs' 'c#' 'ruby-on-rails']

Question: method correct find second third fourth highest value column mysql
Real tags: ['odbc' 'matlab' 'histogram' 'wavelet' 'cbir']
Predicted tags: ['c#' 'java' 'android' 'php']

Question: selecting gradle build file di