In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import GRU, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model

In [2]:
import nltk
# Download the 'punkt' tokenizer data into the local nltk_data directory;
# word_tokenize (used for the length statistics) relies on it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

In [4]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [5]:
# Fetch the StackSample archive from Dropbox. wget names the local file after the
# URL's last segment, so it is saved as 'stacksample.zip?dl=0'.
!wget 'https://www.dropbox.com/s/5721wcs2guuykzl/stacksample.zip?dl=0'

--2023-07-25 05:22:57--  https://www.dropbox.com/s/5721wcs2guuykzl/stacksample.zip?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/5721wcs2guuykzl/stacksample.zip [following]
--2023-07-25 05:22:57--  https://www.dropbox.com/s/raw/5721wcs2guuykzl/stacksample.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc0223ffa9ece9b1b698d23c891b.dl.dropboxusercontent.com/cd/0/inline/CAjtYROfoyLGhpeeWS9oHMdyD6G3GjjZLrN2YYDn1M9gtketEMJWjVZA5j0_Zid7z-iRHEtiOJp4Rd_wxoBJQJqfqsBcICDkvxo13yVOQaNZCNSyqzfP5ivuBXZS2tUkacpY1OeXJvrOHOorvA_nGYM-4PiJ8Hy3m7AVIwto_LVB2A/file# [following]
--2023-07-25 05:22:57--  https://uc0223ffa9ece9b1b698d23c891b.dl.dropboxusercontent.com/cd/0/inline/CAjtYROfoyLGhpeeWS9oHMdyD6G3GjjZLrN2YYDn1M9gtketEMJWjVZA5j0_Zid7z-

In [6]:
# Extract the archive (Answers.csv, Questions.csv, Tags.csv) into the working directory.
!unzip stacksample.zip?dl=0

Archive:  stacksample.zip?dl=0
  inflating: Answers.csv             
  inflating: Questions.csv           
  inflating: Tags.csv                


In [7]:
# The CSVs are extracted, so the downloaded archive is no longer needed.
!rm stacksample.zip?dl=0

In [9]:
# Load the questions, keeping only the Id and the HTML body of each one.
questionList = pd.read_csv(
    '/content/Questions.csv',
    encoding="ISO-8859-1",
    usecols=["Id", "Body"],
)

In [10]:
# Load the (question Id, tag) pairs; a question with several tags spans several rows.
tagList = pd.read_csv(
    '/content/Tags.csv',
    encoding="ISO-8859-1",
    usecols=["Id", "Tag"],
)

In [12]:
def GetProperData(txt):
    """Extract the text of every <p> element from an HTML fragment.

    Parameters
    ----------
    txt : str
        HTML markup, e.g. the body of a question.

    Returns
    -------
    list of str
        Text of each <p> tag, in document order. An empty list is returned
        when `txt` is None or a float (pandas represents a missing CSV cell
        as float NaN), since there is no markup to parse.
    """
    # Missing values cannot be parsed by BeautifulSoup and would raise.
    if txt is None or isinstance(txt, float):
        return []
    soup = BeautifulSoup(txt, 'lxml')
    return [p.text for p in soup.find_all('p')]

In [13]:
# sort_values returns a new frame, so the result has to be assigned for the
# sort to take effect.
questionList = questionList.sort_values(by=['Id'])
# Reduce each HTML body to the text of its paragraphs, joined into one string.
questionList['Body'] = questionList['Body'].apply(
    lambda body: ' '.join(GetProperData(body))
)

In [14]:
# Missing values per column before cleaning
tagList.isna().sum()

Id      0
Tag    32
dtype: int64

In [15]:
# Assign the filled column back. Calling fillna(inplace=True) on a column
# selection is chained assignment: pandas warns about it, and under
# Copy-on-Write it no longer updates tagList at all.
tagList["Tag"] = tagList["Tag"].fillna("_null")

In [16]:
# Re-check: the Tag column should have no missing values left
tagList.isna().sum()

Id     0
Tag    0
dtype: int64

In [17]:
# Frequency of every tag, most common first
tag_counts = tagList['Tag'].value_counts()
print(tag_counts)
# Names of the 10 most frequent tags
topTag = tag_counts.head(10).index.tolist()
print(topTag)

c#                    3899
.net                  2280
java                  2042
asp.net               1815
php                   1506
                      ... 
webservicetemplate       1
piccolo                  1
freetype                 1
freetype2                1
mdns                     1
Name: Tag, Length: 7291, dtype: int64
['c#', '.net', 'java', 'asp.net', 'php', 'javascript', 'c++', 'python', 'sql', 'jquery']


In [18]:
# Keep only the (Id, Tag) rows whose tag is one of the most frequent tags
is_top_tag = tagList['Tag'].isin(topTag)
tagList_top = tagList.loc[is_top_tag]

In [19]:
# From this point on, tagList refers to the filtered frame in tagList_top.
tagList = tagList_top
print(tagList)

            Id         Tag
7          120         sql
8          120     asp.net
14         260          c#
15         260        .net
18         330         c++
...        ...         ...
82871  1499830     asp.net
82874  1499950         php
82875  1499950  javascript
82876  1499950      jquery
82878  1499970        java

[17478 rows x 2 columns]


In [20]:
# Drop questions whose Id no longer appears in tagList
has_kept_tag = questionList['Id'].isin(tagList['Id'])
questionList = questionList.loc[has_kept_tag]


In [21]:
def GetAllTags(Tag_id):
    """Return an array of every tag in tagList that shares the row's Id.

    Parameters
    ----------
    Tag_id : mapping with an 'Id' entry (e.g. one row of tagList)

    Returns
    -------
    numpy.ndarray
        The Tag values of all tagList rows with that Id, in frame order.
    """
    return tagList[tagList['Id'] == Tag_id['Id']].Tag.values

# Collect the tags of each question in a single groupby pass. Applying
# GetAllTags row by row re-filters the whole frame for every row (quadratic).
tags_by_id = {qid: tags.values for qid, tags in tagList.groupby('Id')['Tag']}
# One entry per tagList row (same index), holding the full tag array of its question.
temp = tagList['Id'].map(tags_by_id.get)
temp.name = None  # the row-wise apply produced an unnamed Series
print(type(temp))

<class 'pandas.core.series.Series'>


In [22]:
# Append the per-question tag arrays as a new 'Tags' column
tags_column = temp.to_frame('Tags')
tagList = pd.concat([tagList, tags_column], axis='columns')
tagList.head()

Unnamed: 0,Id,Tag,Tags
7,120,sql,"[sql, asp.net]"
8,120,asp.net,"[sql, asp.net]"
14,260,c#,"[c#, .net]"
15,260,.net,"[c#, .net]"
18,330,c++,[c++]


In [23]:
# The single-tag column is redundant now that 'Tags' holds the full set
tagList = tagList.drop(columns=['Tag'])

In [24]:
# Keep one row per question Id (the first occurrence)
tagList = tagList.drop_duplicates(subset=["Id"], keep="first")

In [25]:
# Attach the tag arrays to their questions by joining on Id
questionList = questionList.merge(tagList, on='Id')

In [26]:
# Missing values per column after the merge
questionList.isna().sum()

Id      0
Body    0
Tags    0
dtype: int64

In [27]:
# Number of word_tokenize tokens in each question body
sent_lens = [len(word_tokenize(body)) for body in questionList['Body']]

In [28]:
# Token count of the longest question body
max(sent_lens)

1223

In [29]:
# Body length at each quantile level, used to choose a sequence-length cutoff
quantile_levels = [0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89,
                   0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97]
quantile_lengths = np.quantile(sent_lens, quantile_levels)
for percentile, length in zip(quantile_levels, quantile_lengths):
    print('For percentile:', percentile, ":", length)


For percentile: 0.81 : 158.0
For percentile: 0.82 : 161.0
For percentile: 0.83 : 165.0
For percentile: 0.84 : 170.0
For percentile: 0.85 : 175.0
For percentile: 0.86 : 180.0
For percentile: 0.87 : 185.0
For percentile: 0.88 : 191.0
For percentile: 0.89 : 197.0
For percentile: 0.91 : 213.0
For percentile: 0.92 : 223.0
For percentile: 0.93 : 233.0
For percentile: 0.94 : 245.0
For percentile: 0.95 : 260.0
For percentile: 0.96 : 279.0
For percentile: 0.97 : 303.0


In [30]:
# Sequence length used for padding: the 0.85 quantile of the body token
# counts, truncated to an integer
length_cutoff = np.quantile(sent_lens, 0.85)
max_len = int(length_cutoff)
print(max_len)

175


In [31]:
# Word-level (not character-level) tokenizer that splits on spaces;
# its vocabulary is built from every question body in questionList.
tok = Tokenizer(split=' ', char_level=False)
tok.fit_on_texts(questionList['Body'])

In [32]:
# Binarizer that turns each question's tag collection into a 0/1 indicator row.
multilabel = MultiLabelBinarizer()
# Fit on all tag sets; the returned indicator matrix is only displayed here, not stored.
multilabel.fit_transform(questionList['Tags'])

array([[0, 1, 0, ..., 0, 0, 1],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [33]:
# Tag names learned by the binarizer (one per indicator column)
multilabel.classes_

array(['.net', 'asp.net', 'c#', 'c++', 'java', 'javascript', 'jquery',
       'php', 'python', 'sql'], dtype=object)

In [34]:
# Hold out 20% of the questions for testing; the seed is fixed for a repeatable split
Q_train, Q_test = train_test_split(questionList, random_state=2, test_size=0.2)
# Text inputs and tag targets for each partition
x_train, y_train = Q_train['Body'], Q_train['Tags']
x_test, y_test = Q_test['Body'], Q_test['Tags']

In [35]:
# Model input for training (Train_X = sequences_matrix_train):
# convert each training body to a list of word indices using the fitted tokenizer...
sequences_train = tok.texts_to_sequences(x_train)
# ...then pad or truncate every list to exactly max_len entries.
sequences_matrix_train = sequence.pad_sequences(sequences_train,maxlen=max_len)


In [36]:
# Model input for testing (Test_X = sequences_matrix_test):
# convert each test body to a list of word indices using the fitted tokenizer...
sequences_test = tok.texts_to_sequences(x_test)
# ...then pad or truncate every list to exactly max_len entries.
sequences_matrix_test = sequence.pad_sequences(sequences_test, maxlen=max_len)

In [37]:
# Number of distinct words the tokenizer has indexed
vocab_len = len(tok.index_word)
vocab_len

34483

In [38]:
# Training target (Train_Y = y_train_multiLevel): the training tag sets
# encoded as 0/1 indicator rows by the fitted binarizer.
y_train_multiLevel = multilabel.transform(y_train)

In [39]:
# Number of tag classes known to the binarizer; used as the size of the model's output layer
outputNode = len(multilabel.classes_)
# Display the padded sequence length together with the number of output units
max_len, outputNode

(175, 10)

In [40]:
# Test target (Test_Y = y_test_multiLevel): the test tag sets
# encoded as 0/1 indicator rows by the fitted binarizer.
y_test_multiLevel = multilabel.transform(y_test)

In [41]:
def RNN():
    """Build the (uncompiled) GRU model for multi-label tag prediction.

    Reads the notebook-level globals `max_len` (padded sequence length),
    `vocab_len` (number of words indexed by the tokenizer) and `outputNode`
    (number of tag classes).

    Returns
    -------
    Model
        inputs -> Embedding -> GRU -> FC1/relu -> FC2/relu -> Dropout
        -> out_layer/sigmoid, giving one independent score per tag.
    """
    inputs = Input(name='inputs',shape=[max_len])

    # vocab_len+1 rows: word indices run up to vocab_len and index 0 is the
    # padding value, which mask_zero=True makes the GRU ignore.
    layer = Embedding(vocab_len+1,1500,input_length=max_len,
                      mask_zero=True)(inputs)
    layer = GRU(400)(layer)
    layer = Dense(200,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dense(100,name='FC2')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(outputNode,name='out_layer')(layer)
    # Sigmoid, not softmax: a question can carry several tags at once, so each
    # output unit must be an independent probability. Softmax forces the scores
    # to sum to 1 and does not match the binary cross-entropy loss used on the
    # multi-hot targets.
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [42]:
# Instantiate the network and print its layer-by-layer summary
model = RNN()
model.summary()

# Per-tag binary cross-entropy optimised with Adam; accuracy is tracked as a metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 175)]             0         
                                                                 
 embedding (Embedding)       (None, 175, 1500)         51726000  
                                                                 
 gru (GRU)                   (None, 400)               2282400   
                                                                 
 FC1 (Dense)                 (None, 200)               80200     
                                                                 
 activation (Activation)     (None, 200)               0         
                                                                 
 FC2 (Dense)                 (None, 100)               20100     
                                                                 
 activation_1 (Activation)   (None, 100)               0     

In [None]:
# Train for 3 epochs in batches of 50, evaluating on the test matrices after each epoch
model.fit(
    sequences_matrix_train,
    y_train_multiLevel,
    validation_data=(sequences_matrix_test, y_test_multiLevel),
    epochs=3,
    batch_size=50,
)


Epoch 1/3

In [None]:


from sklearn.metrics import roc_auc_score

# Per-tag scores predicted for the test questions
predictions = model.predict(sequences_matrix_test)

# ROC AUC of the predicted scores against the 0/1 tag matrix
roc_auc_score(y_true=y_test_multiLevel, y_score=predictions)