In [None]:
!wget https://www.kaggle.com/stackoverflow/statsquestions/downloads/Answers.csv/1

In [None]:
!wget https://www.kaggle.com/stackoverflow/statsquestions/downloads/Questions.csv/1

In [None]:
!wget https://www.kaggle.com/stackoverflow/statsquestions/downloads/Tags.csv/1

In [None]:
!ls

In [None]:
!unzip Answers.csv.zip

In [None]:
!unzip Questions.csv.zip

In [None]:
!unzip Tags.csv.zip

In [None]:
!ls

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import keras

Using TensorFlow backend.


In [2]:
%matplotlib inline

In [3]:
df_questions = pd.read_csv('Questions.csv', encoding='iso-8859-1')
df_tags = pd.read_csv('Tags.csv', encoding='iso-8859-1')
df_questions.head(n=2)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,6,5.0,2010-07-19T19:14:44Z,272,The Two Cultures: statistics vs. machine learn...,"<p>Last year, I read a blog post from <a href=..."
1,21,59.0,2010-07-19T19:24:36Z,4,Forecasting demographic census,<p>What are some of the ways to forecast demog...


In [4]:
df_tags.head(n=10)

Unnamed: 0,Id,Tag
0,1,bayesian
1,1,prior
2,1,elicitation
3,2,distributions
4,2,normality
5,3,software
6,3,open-source
7,4,distributions
8,4,statistical-significance
9,6,machine-learning


In [5]:
grouped_tags = df_tags.groupby("Tag", sort='count').size().reset_index(name='count')
grouped_tags.Tag.describe()

count              1315
unique             1315
top       concept-drift
freq                  1
Name: Tag, dtype: object

In [6]:
num_classes = 100
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
most_common_tags = grouped_tags.nlargest(num_classes, columns="count")
df_tags.Tag = df_tags.Tag.apply(lambda tag : tag if tag in most_common_tags.Tag.values else None)
df_tags = df_tags.dropna()

In [7]:
import re 

def strip_html_tags(body):
    regex = re.compile('<.*?>')
    return re.sub(regex, '', body)

df_questions['Body'] = df_questions['Body'].apply(strip_html_tags)
df_questions['Text'] = df_questions['Title'] + ' ' + df_questions['Body']


In [8]:
# denormalize tables

def tags_for_question(question_id):
    return df_tags[df_tags['Id'] == question_id].Tag.values

def add_tags_column(row):
    row['Tags'] = tags_for_question(row['Id'])
    return row

df_questions = df_questions.apply(add_tags_column, axis=1)

In [9]:
pd.set_option('display.max_colwidth', 400)
df_questions[['Id', 'Text', 'Tags']].head()

Unnamed: 0,Id,Text,Tags
0,6,"The Two Cultures: statistics vs. machine learning? Last year, I read a blog post from Brendan O'Connor entitled ""Statistics vs. Machine Learning, fight!"" that discussed some of the differences between the two fields. Andrew Gelman responded favorably to this:\n\nSimon Blomberg: \n\n\n From R's fortunes\n package: To paraphrase provocatively,\n 'machine learning is statistics minus\n any c...",[machine-learning]
1,21,"Forecasting demographic census What are some of the ways to forecast demographic census with some validation and calibration techniques?\n\nSome of the concerns:\n\n\nCensus blocks vary in sizes as rural\nareas are a lot larger than condensed\nurban areas. Is there a need to account for the area size difference?\nif let's say I have census data\ndating back to 4 - 5 census periods,\nhow far ca...",[forecasting]
2,22,Bayesian and frequentist reasoning in plain English How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?\n,[bayesian]
3,31,"What is the meaning of p values and t values in statistical tests? After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests. It seems that students easily learn how to perform the calculations required by a given test but get hung up on interpreting the resul...","[hypothesis-testing, t-test, p-value, interpretation]"
4,36,"Examples for teaching: Correlation does not mean causation There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustrate this point:\n\n\nnumber of storks and birth rate in Denmark;\nnumber of priests in America and alcoholism;\nin the start of the 20th century it was noted that there was a strong correlation between 'N...",[correlation]


In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer

In [11]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_questions.Tags)
labels = multilabel_binarizer.classes_

In [12]:
maxlen = 180
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df_questions.Text)

In [13]:
def get_features(text_series):
    """
    transforms text data to feature_vectors that can be used in the ml model.
    tokenizer must be available.
    """
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)


def prediction_to_label(prediction):
    tag_prob = [(labels[i], prob) for i, prob in enumerate(prediction.tolist())]
    return dict(sorted(tag_prob, key=lambda kv: kv[1], reverse=True))

In [14]:
from sklearn.model_selection import train_test_split

x = get_features(df_questions.Text)
y = multilabel_binarizer.transform(df_questions.Tags)
print(x.shape)

(85085, 180)


In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000)

In [16]:
most_common_tags['class_weight'] = len(df_tags) / most_common_tags['count']
class_weight = {}
for index, label in enumerate(labels):
    class_weight[index] = most_common_tags[most_common_tags['Tag'] == label]['class_weight'].values[0]
    
most_common_tags.head()

Unnamed: 0,Tag,count,class_weight
986,r,13236,11.552811
1020,regression,10959,13.953189
669,machine-learning,6089,25.112991
1220,time-series,5559,27.507285
946,probability,4217,36.261086


In [17]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam

filter_length = 300

model = Sequential()
model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=4), 
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    class_weight=class_weight,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 180, 20)           100000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 180, 20)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               30100     
_________________________________________________________________
activation_1 (Activat

In [18]:


cnn_model = keras.models.load_model('model-conv1d.h5')
metrics = cnn_model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))



loss: 0.051430355773455755
categorical_accuracy: 0.3336663336873496
