**PART 1: Cleaning Data and Exploratory Data Analysis**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from string import punctuation
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import warnings, pickle, time, re, nltk
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer
from sklearn.metrics import hamming_loss
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import os 
print(os.listdir("../input"))

['Answers.csv', 'Tags.csv', 'Questions.csv']


In [3]:
df = pd.read_csv("../input/Questions.csv", encoding="ISO-8859-1")

In [4]:
print(df.shape)
df.head(5)

(1264216, 7)


Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [5]:
tags = pd.read_csv("../input/Tags.csv", encoding="ISO-8859-1", dtype={'Tag': str})

In [6]:
print(tags.shape)
tags.head(5)

(3750994, 2)


Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [7]:
df.isnull().sum()

Id                    0
OwnerUserId       14454
CreationDate          0
ClosedDate      1208257
Score                 0
Title                 0
Body                  0
dtype: int64

In [8]:
tags.isnull().sum()

Id        0
Tag    1113
dtype: int64

First, what I want to do is to merge both dataframes. In order to do that, I'll have to group tags by the id of the post since a post can have multiple tags. I'll just use the groupeby function and then merge the dataframes on the id. 

In [9]:
tags['Tag'] = tags['Tag'].astype(str)

In [10]:
grouped_tags = tags.groupby("Id")['Tag'].apply(lambda tags: ' '.join(tags)).reset_index()

In [11]:
grouped_tags.head(5)

Unnamed: 0,Id,Tag
0,80,flex actionscript-3 air
1,90,svn tortoisesvn branch branching-and-merging
2,120,sql asp.net sitemap
3,180,algorithm language-agnostic colors color-space
4,260,c# .net scripting compiler-construction


In [12]:
df.drop(columns=['OwnerUserId', 'CreationDate', 'ClosedDate'], inplace=True)

In [13]:
df = df.merge(grouped_tags, on='Id')

In [14]:
print(df.shape)
df.head(5)

(1264216, 5)


Unnamed: 0,Id,Score,Title,Body,Tag
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex actionscript-3 air
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn tortoisesvn branch branching-and-merging
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql asp.net sitemap
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm language-agnostic colors color-space
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c# .net scripting compiler-construction


In [15]:
df.isnull().sum()

Id       0
Score    0
Title    0
Body     0
Tag      0
dtype: int64

All the rows have some Tag associated with it.

Now, I'll take only quesions with score greater than 5 because posts will probably be with a better quality and will be better tagged since they have lots of upvotes. 


In [16]:
df = df[df['Score']>5].copy()

**1.2 Cleaning Data**

In [17]:
print('Duplicate entries: {}'.format(df.duplicated().sum()))
df.drop_duplicates(inplace = True)

Duplicate entries: 0


This is a very good dataset since there are no missing values or duplicate values. 

In [18]:
df.drop(columns=['Id', 'Score'], inplace=True)

### Tags

Let's do some cleaning on the tags' column. Furthermore, I decided to keep the 100 most popular tags because I'll be easier to predict the right tag from 100 words than from 14,000 and because we want to keep macro tags and not be too specific since it's only a recommendation for a post, the user can add more specific tags himself. 

In [19]:
from collections import Counter

In [20]:
df.head(5)

Unnamed: 0,Title,Body,Tag
0,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex actionscript-3 air
1,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn tortoisesvn branch branching-and-merging
2,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql asp.net sitemap
3,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm language-agnostic colors color-space
4,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c# .net scripting compiler-construction


In [21]:
df['Tag'] = df['Tag'].apply(lambda x: x.split())

In [22]:
all_tags = [item for sublist in df['Tag'].values for item in sublist]

In [23]:
len(all_tags)

224129

In [24]:
my_set = set(all_tags)
unique_tags = list(my_set)
len(unique_tags)

14883

In [25]:
counts = Counter(all_tags)

In [26]:
print(counts.most_common(25))

[('c#', 6907), ('java', 6862), ('javascript', 5560), ('android', 5052), ('python', 4442), ('c++', 4369), ('php', 3024), ('jquery', 2770), ('.net', 2705), ('ios', 2685), ('html', 2129), ('css', 1980), ('c', 1845), ('iphone', 1781), ('objective-c', 1768), ('ruby-on-rails', 1524), ('sql', 1389), ('asp.net', 1302), ('mysql', 1286), ('ruby', 1249), ('r', 1120), ('git', 1013), ('asp.net-mvc', 1004), ('linux', 935), ('sql-server', 907)]


In [27]:
s=0
for i in counts.most_common(25):
    s+=i[1]
print(s)

65608


In [28]:
frequencies_words = counts.most_common(25)
tags_features = [word[0] for word in frequencies_words]

In [29]:
print(tags_features)

['c#', 'java', 'javascript', 'android', 'python', 'c++', 'php', 'jquery', '.net', 'ios', 'html', 'css', 'c', 'iphone', 'objective-c', 'ruby-on-rails', 'sql', 'asp.net', 'mysql', 'ruby', 'r', 'git', 'asp.net-mvc', 'linux', 'sql-server']


In [30]:
def most_common(tags):
    tags_filtered = []
    for i in range(0, len(tags)):
        if tags[i] in tags_features:
            tags_filtered.append(tags[i])
    return tags_filtered

In [31]:
df['Tag'] = df['Tag'].apply(lambda x: most_common(x))
df['Tag'] = df['Tag'].apply(lambda x: x if len(x)>0 else None)

In [32]:
df.head()

Unnamed: 0,Title,Body,Tag
0,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,
1,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,
2,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,"[sql, asp.net]"
3,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,
4,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"[c#, .net]"


In [33]:
df.dropna(subset=['Tag'], inplace=True)

In [34]:
df.shape

(52418, 3)

**1.2.2 Body**

In the next two columns: Body and Title, I'll use lots of text processing:
* Removing html format 
* Lowering text
* Transforming abbreviations 
* Removing punctuation (but keeping words like c# since it's the most popular tag)
* Lemmatizing words
* Removing stop words

In [35]:
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-qy6zuiyh
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l- \ done
[?25h  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.0-py3-none-any.whl size=11713 sha256=3f762f494990aae906c8666b335f82459bf67d796118ed1a0176a2bcdc53f179
  Stored in directory: /tmp/pip-ephem-wheel-cache-dizmbfct/wheels/0d/b3/29/bfe3deffda68980088d17b81331be6667e837ffb4a071bae82
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptalkie
Successfully installed preprocess-kgptalkie-0.1.0


In [36]:
import preprocess_kgptalkie as ps

In [37]:
def get_clean(x):
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    x = ps.cont_exp(x)
    x = ps.remove_emails(x)
    x = ps.remove_urls(x)
    x = ps.remove_html_tags(x)
    x = ps.remove_accented_chars(x)
    x = ps.remove_special_chars(x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    return x

In [38]:
df['Body'] = df['Body'].apply(lambda x: get_clean(x))

In [39]:
df['Title'] = df['Title'].apply(lambda x: get_clean(x))

In [40]:
df['Text'] = df['Title'] + " " +  df['Body']

In [41]:
df.head()

Unnamed: 0,Title,Body,Tag,Text
2,aspnet site mapostscripoint,has anyone got experetweethat isnce creating s...,"[sql, asp.net]",aspnet site mapostscripoint has anyone got exp...
4,adding scripointing fyounctionality to net apa...,i have a little game written in c it youses a ...,"[c#, .net]",adding scripointing fyounctionality to net apa...
5,shoyould i youse nested classes in this case,i am working on a collection of classes yoused...,[c++],shoyould i youse nested classes in this case i...
6,homexamplerown consyoumpointion of web servi ses,i have been writing a few web servi ses for a ...,[.net],homexamplerown consyoumpointion of web servi s...
7,deploying sqyouarel server databases from test...,i wonder how yoyou gyouys manage deployment of...,[sql-server],deploying sqyouarel server databases from test...


**PART 2: Classical classifiers**

**2.1 Data preparation**

In [42]:
y = df['Tag']

In [43]:
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(y)

In [44]:
y.shape

(52418, 25)

In [45]:
tfidf = TfidfVectorizer(analyzer = 'word', max_features=1000)
X = tfidf.fit_transform(df['Text'])

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

**2.2 One vs Rest**

To evaluate our models, I'll use the jacard score since it's the best fitted for multi label classification. 

In [47]:
def avg_jacard(y_true,y_pred):
    
    jacard = np.minimum(y_true,y_pred).sum(axis=1) / np.maximum(y_true,y_pred).sum(axis=1)
    
    return jacard.mean()*100

def print_score(y_pred, clf):
    print("Clf: ", clf.__class__.__name__)
    print("Jacard score: {}".format(avg_jacard(y_test, y_pred)))
    print("Hamming loss: {}".format(hamming_loss(y_pred, y_test)*100))
    print("---")    

In [48]:
sgd = SGDClassifier()
lr = LogisticRegression()
svc = LinearSVC()
rf = RandomForestClassifier()

for classifier in [sgd, lr, svc]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print_score(y_pred, classifier)

Clf:  SGDClassifier
Jacard score: 42.20123786510661
Hamming loss: 3.246343634745008
---
Clf:  LogisticRegression
Jacard score: 44.32192971300183
Hamming loss: 3.220653694518632
---
Clf:  LinearSVC
Jacard score: 50.1352325236339
Hamming loss: 3.0281063207427192
---


**2.6 Confusion matrix**

In [49]:
for i in range(y_train.shape[1]):
    print(multilabel.classes_[i])
    print(confusion_matrix(y_test[:,i], y_pred[:,i]))
    print("")

.net
[[14861    52]
 [  685   128]]

android
[[14166    34]
 [  322  1204]]

asp.net
[[15325    34]
 [  250   117]]

asp.net-mvc
[[15392    46]
 [  139   149]]

c
[[15080    74]
 [  438   134]]

c#
[[13331   331]
 [ 1325   739]]

c++
[[14259   171]
 [  773   523]]

css
[[15030    60]
 [  234   402]]

git
[[15398    10]
 [   46   272]]

html
[[14945   132]
 [  428   221]]

ios
[[14808   102]
 [  401   415]]

iphone
[[15137    60]
 [  382   147]]

java
[[13496   151]
 [  887  1192]]

javascript
[[13875   212]
 [  778   861]]

jquery
[[14826    66]
 [  295   539]]

linux
[[15408    39]
 [  181    98]]

mysql
[[15307    27]
 [  143   249]]

objective-c
[[15146    63]
 [  440    77]]

php
[[14740    69]
 [  324   593]]

python
[[14309    66]
 [  400   951]]

r
[[15353    37]
 [  222   114]]

ruby
[[15311    30]
 [  218   167]]

ruby-on-rails
[[15246    25]
 [  167   288]]

sql
[[15231    75]
 [  270   150]]

sql-server
[[15430    56]
 [  135   105]]



#### Data Store

In [50]:
df[['Text', 'Tag']].to_csv('stackoverflow.csv')

## Deep Learning

In [51]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import hamming_loss

In [52]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Activation, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D 

In [53]:
from sklearn.model_selection import train_test_split
import ast

In [54]:
df = pd.read_csv('./stackoverflow.csv', index_col = 0)

In [55]:
df.head()

Unnamed: 0,Text,Tag
2,aspnet site mapostscripoint has anyone got exp...,"['sql', 'asp.net']"
4,adding scripointing fyounctionality to net apa...,"['c#', '.net']"
5,shoyould i youse nested classes in this case i...,['c++']
6,homexamplerown consyoumpointion of web servi s...,['.net']
7,deploying sqyouarel server databases from test...,['sql-server']


In [56]:
df['Tag'] = df['Tag'].apply(lambda x: ast.literal_eval(x))

In [57]:
df['Tag']

2          [sql, asp.net]
4              [c#, .net]
5                   [c++]
6                  [.net]
7            [sql-server]
                ...      
1262668             [c++]
1262834             [c++]
1262915          [python]
1263065          [python]
1263454             [c++]
Name: Tag, Length: 52418, dtype: object

In [58]:
df['Tag'].iloc[0]

['sql', 'asp.net']

In [59]:
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['Tag'])

In [60]:
multilabel.classes_

array(['.net', 'android', 'asp.net', 'asp.net-mvc', 'c', 'c#', 'c++',
       'css', 'git', 'html', 'ios', 'iphone', 'java', 'javascript',
       'jquery', 'linux', 'mysql', 'objective-c', 'php', 'python', 'r',
       'ruby', 'ruby-on-rails', 'sql', 'sql-server'], dtype=object)

In [61]:
text = df['Text'].tolist()

In [62]:
token = Tokenizer()
token.fit_on_texts(text)

In [63]:
y

array([[0, 0, 1, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [64]:
y.shape

(52418, 25)

In [65]:
len(token.word_counts)

572005

In [66]:
vocab_size = len(token.word_index) + 1 #https://keras.io/api/layers/core_layers/embedding/
vocab_size

572006

In [67]:
encoded_text = token.texts_to_sequences(text)

In [68]:
max_length = 100
X = pad_sequences(encoded_text, maxlen=max_length, padding='post')

In [69]:
X.shape, y.shape

((52418, 100), (52418, 25))

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.3)

## Model Building

In [71]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [72]:
vec_size = 50
def get_model():
    model = Sequential()
    model.add(Embedding(vocab_size, vec_size, input_length=max_length))

    model.add(Conv1D(32, 2, activation = 'relu'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.2))

    model.add(Conv1D(64, 3, activation = 'relu'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.3))


    model.add(Dense(128, activation='relu'))

    model.add(GlobalMaxPooling1D())

    model.add(Dense(y.shape[1], activation='softmax'))

    return model

In [73]:
from keras import backend as K
def avg_jacard(y_true,y_pred):
    jacard = K.sum(K.minimum(y_true,y_pred)) / K.sum(K.maximum(y_true,y_pred))
    return K.mean(jacard)

In [74]:
early = EarlyStopping(
    monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto',
    baseline=None, restore_best_weights=False
)

In [75]:
model = get_model()
model.compile(optimizer='adam', loss = 'categorical_crossentropy', metrics = [avg_jacard])
model.fit(X_train, y_train, epochs = 10, validation_data = (X_test, y_test), batch_size = 128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff9235e8090>

### Model Testing

In [76]:
x = "what is the best way to copy a database i always create a new empointy database afeatyouringer that backyoup and restore of the existing database into it byout is this really the best way as it seemiss very error prone and over compli seeated for me"

In [77]:
def get_clean(x):
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    x = ps.cont_exp(x)
    x = ps.remove_emails(x)
    x = ps.remove_urls(x)
    x = ps.remove_html_tags(x)
    x = ps.remove_accented_chars(x)
    x = ps.remove_special_chars(x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    return x

In [78]:
def get_encoded(x):
    x = token.texts_to_sequences([x])
    x = pad_sequences(x, maxlen=max_length, padding = 'post')
    return x

In [79]:
coded = get_encoded(x)

In [80]:
model.predict_classes(coded)

array([6])

In [81]:
multilabel.classes_[model.predict_classes(coded)]

array(['c++'], dtype=object)