In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

In [2]:
DATA_FOLDER = 'data/'
TRAIN_FILES = ['biology.csv', 'cooking.csv', 'crypto.csv', 'diy.csv', 'robotics.csv', 'travel.csv']
TEST_FILE = 'test.csv'

In [3]:
DOMAIN_COUNT = len(TRAIN_FILES)

In [4]:
df_trains = [pd.read_csv(DATA_FOLDER + filename) for filename in TRAIN_FILES]

In [5]:
for df in df_trains:
    print(df.shape)

(13196, 4)
(15404, 4)
(10432, 4)
(25918, 4)
(2771, 4)
(19279, 4)


In [6]:
sum([df.shape[0] for df in df_trains])

87000

In [7]:
df_trains[0].head()

Unnamed: 0,id,title,content,tags
0,1,What is the criticality of the ribosome bindin...,"<p>In prokaryotic translation, how critical fo...",ribosome binding-sites translation synthetic-b...
1,2,How is RNAse contamination in RNA based experi...,<p>Does anyone have any suggestions to prevent...,rna biochemistry
2,3,Are lymphocyte sizes clustered in two groups?,<p>Tortora writes in <em>Principles of Anatomy...,immunology cell-biology hematology
3,4,How long does antibiotic-dosed LB maintain goo...,<p>Various people in our lab will prepare a li...,cell-culture
4,5,Is exon order always preserved in splicing?,<p>Are there any cases in which the splicing m...,splicing mrna spliceosome introns exons


In [8]:
# # how many unique tags

# domain_unique_tags = [set() for i in range(DOMAIN_COUNT)]
# unique_tags = set()

# for i in range(DOMAIN_COUNT):
#     df = df_trains[i]
#     posts_tags = df['tags'].tolist()
#     for tags in posts_tags:
#         for tag in tags.split(' '):
#             domain_unique_tags[i].add(tag)
#             unique_tags.add(tag)

# print('domain_unique_tag counts:')
# s = 0
# for tag_set in domain_unique_tags:
#     print(len(tag_set))

# print('sum of domain_unique_tag counts:')
# print(sum([len(tag_set) for tag_set in domain_unique_tags]))
    
# print('all tag count:')
# print(len(unique_tags))


# Cleanse Data

In [13]:
import re
from bs4 import BeautifulSoup
# import nltk
# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

# remove [n]
# remove pure numbers, don't remove letter+number words, like CO2

# remove stopwords
# consider µL?
# remove formulas

def cleanse_html(content):
    return BeautifulSoup(content, "lxml").get_text()

def cleanse_lower_split(content):
    # remove html tags
    content = BeautifulSoup(content, "lxml").get_text()
    
    # convert to lowercase
    content = content.lower()
    
    words = content.split()
    
    return words
    

# only reserve words. don't split sentences
def cleanse(content):
    # remove html tags
    content = BeautifulSoup(content, "lxml").get_text()

    # remove urls
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
#     # split content to sentences
#     sentences = tokenizer.tokenize(content)
    
#     # replace punctuations with whitespaces
#     content = re.sub(r"[^a-zA-Z0-9]"," ", content)
    
    # replace punctuations and numbers with whitespaces
    content = re.sub(r"[^a-zA-Z]"," ", content)
    
#     # remove words that don't contain english (pure number, number+symbol). reserve something like "CO2"
#     content = re.sub(r"[^a-zA-Z0-9]"," ", content)
    
    # convert to lowercase
    content = content.lower()
    
    # split into words
    words = content.split()
    
    # remove stopwords
    words = [word for word in words if word not in stops]
    
    # concat words to a string
    content = ' '.join(words)
    
    return content

In [10]:
domains_contents = [df['content'].tolist() for df in df_trains]  # list of list of strings
print(len(domains_contents))
print(len(domains_contents[0]))

6
13196


In [14]:
cleansed_contents = [cleanse_lower_split(content) for domain_contents in domains_contents for content in domain_contents]

In [15]:
cleansed_contents[:5]

[['in',
  'prokaryotic',
  'translation,',
  'how',
  'critical',
  'for',
  'efficient',
  'translation',
  'is',
  'the',
  'location',
  'of',
  'the',
  'ribosome',
  'binding',
  'site,',
  'relative',
  'to',
  'the',
  'start',
  'codon?',
  'ideally,',
  'it',
  'is',
  'supposed',
  'to',
  'be',
  '-7b',
  'away',
  'from',
  'the',
  'start.',
  'how',
  'about',
  'if',
  'it',
  'is',
  '-9',
  'bases',
  'away',
  'or',
  'even',
  'more?',
  'will',
  'this',
  'have',
  'an',
  'observable',
  'effect',
  'on',
  'translation?'],
 ['does',
  'anyone',
  'have',
  'any',
  'suggestions',
  'to',
  'prevent',
  'rnase',
  'contamination',
  'when',
  'working',
  'with',
  'rna?',
  'i',
  'tend',
  'to',
  'have',
  'issues',
  'with',
  'degradation',
  'regardless',
  'of',
  'whether',
  'i',
  'use',
  'depc',
  'treated',
  '/',
  'rnase',
  'free',
  'water',
  'and',
  'filtered',
  'pipette',
  'tips.'],
 ['tortora',
  'writes',
  'in',
  'principles',
  'of',
  

# Extract Tags

In [35]:
domains_tags = [df['tags'].tolist() for df in df_trains]
post_tags = [tags for domain_tags in domains_tags for tags in domain_tags]
len(post_tags)

87000

In [36]:
tags_split = [ts.split() for ts in post_tags]
tags_split[:100]

[['ribosome', 'binding-sites', 'translation', 'synthetic-biology'],
 ['rna', 'biochemistry'],
 ['immunology', 'cell-biology', 'hematology'],
 ['cell-culture'],
 ['splicing', 'mrna', 'spliceosome', 'introns', 'exons'],
 ['dna', 'biochemistry', 'molecular-biology'],
 ['neuroscience', 'synapses'],
 ['plasmids'],
 ['molecular-genetics', 'gene-expression', 'experimental-design'],
 ['evolution', 'mitochondria', 'chloroplasts'],
 ['high-throughput', 'cell-based'],
 ['molecular-biology', 'synthetic-biology'],
 ['bioinformatics', 'homework'],
 ['neuroscience', 'immunology'],
 ['splicing', 'histone'],
 ['genomics', 'gene-annotation', 'exons'],
 ['microbiology', 'virology', 'influenza'],
 ['epigenetics'],
 ['molecular-biology', 'dna-isolation'],
 ['cell-membrane', 'adaptation', 'cell-biology'],
 ['transcription', 'chromatin'],
 ['biochemistry', 'neuroscience'],
 ['molecular-biology', 'transcription', 'rna-interference'],
 ['neuroscience', 'endocrinology', 'human-biology'],
 ['bioinformatics', 'ph

In [19]:
tag_set = set()

for tags in tags_split:
    for tag in tags:
        tag_set.add(tag)

len(tag_set)

4268

In [23]:
tag_set

{'sheetrock',
 'minors',
 'reflexes',
 'pet-door',
 'alcohol-content',
 'humidity',
 'bristol',
 'conservation-biology',
 'lawn',
 'otr',
 'work',
 'uht',
 'ciphertext-only-attack',
 'gear',
 'reduction',
 'xian',
 'screens',
 'melting-chocolate',
 'grapes',
 'retaining-wall',
 'baking',
 'yamagata',
 'product-review',
 'volunteering',
 'tiraspol',
 'pdx',
 'afghan-citizens',
 'format-preserving',
 'motion',
 'world-cup',
 'honolulu',
 'proof-of-work',
 'australian-cuisine',
 'crimea',
 'saliva',
 'universal-re-encryption',
 'salt-lake-city',
 'sodium',
 'grating',
 'gps',
 'flight-status',
 'shellfish',
 'saffron',
 'got',
 'boiling',
 'publishing',
 'alternative-energy',
 'upgrades',
 'indian-railways',
 'mutations',
 'vessel',
 'passports',
 'organelle',
 'routers',
 'star-alliance',
 'transplantation',
 'oceania',
 'instinct',
 'beer',
 'steamed-pudding',
 'zad',
 'hmac',
 'excavation',
 'lakes',
 'mozambican-citizen',
 'alternating-step',
 'proofing',
 'nav',
 'yerevan',
 'burgund

In [51]:
useless_tags = set(['can', 'water', 'work', 'got', 'well', \
'light', 'idea', 'information', 'flight', 'planning', 'wire', 'main', 'remove', 'data', \
'power', 'floor', 'current', 'heat', 'ground', 'kitchen', 'switch', 'food', 'security', \
'algorithm', 'ceiling', 'control', 'space', 'building', 'metal', 'level', 'temperature', \
'advice', 'fan', 'saw', 'nice', 'cover', 'human', 'plastic', 'cost', 'taste', 'breaker', \
'pipe', 'hole', 'cell', 'circuit', 'research', 'bits', 'input', 'electric', 'oil', 'family', \
'support', 'wood', 'green', 'search', 'gas', 'countries', 'produce', 'concrete', 'drain', 'service', \
'post', 'salt', 'c', 'range', 'pan', 'fresh', 'keys', 'basement', 'damage', 'sugar', 'species', \
'fridge', 'present', 'gene', 'sounds', 'neutral', 'frame', 'oven', 'life', 'paint', 'table', 'attack', \
'seal', 'walls', 'internet', 'stuck', 'heater', 'methods', 'steps', 'outlets', 'pump', 'meat', 'garage', \
'structure', 'color', 'gap', 'addition', 'bathroom', 'price'])

In [52]:
useful_tag_set = tag_set - useless_tags
len(useful_tag_set)

4168

In [None]:
# 'security', 'algorithm', 

In [53]:
contents_predict_tags = []

for content in cleansed_contents:
    predict_tags = set()
    for word in content:
        if word in useful_tag_set:
            predict_tags.add(word)
    contents_predict_tags.append(list(predict_tags))

len(contents_predict_tags)

87000

In [54]:
contents_predict_tags[:100]

[['translation', 'ribosome'],
 [],
 ['anatomy'],
 ['minipreps', 'stock'],
 ['splicing', 'mrna', 'exons'],
 ['dna'],
 ['resources', 'signalling', 'electrical'],
 [],
 ['experiment', 'fluorescent'],
 ['chloroplasts', 'mitochondria'],
 ['high-throughput'],
 ['dna', 'protocol'],
 ['statistics'],
 ['metabolism', 'science'],
 ['experimental', 'histone'],
 ['exons'],
 [],
 ['basics', 'literature', 'children', 'epigenetics'],
 ['minipreps', 'wash', 'protocol', 'glucose', 'plasmids'],
 ['exchange', 'structural'],
 ['chromatin'],
 ['electrical'],
 ['lead', 'transcription'],
 [],
 ['root'],
 ['introns'],
 ['polymerase', 'literature', 'efficiency', 'transcription'],
 ['community', 'kidney', 'stone', 'mechanism'],
 ['primer'],
 ['microbiology', 'salmonella'],
 ['dna'],
 ['pulses', 'brain', 'electrical'],
 ['mushrooms'],
 ['polymerase', 'recombinant', 'culture'],
 ['brain', 'ventricles', 'pool'],
 ['stain', 'mammals'],
 ['diet', 'insects', 'fish', 'poison'],
 ['staining'],
 ['snp', 'china'],
 ['radi

In [55]:
# from collections import Counter

# tags_flat = []

# for tags in tags_split:
#     tags_flat.extend(tags)
    
# tags_count = Counter(tags_flat)
# tags_count.most_common(100)

In [56]:
# predict_tags_flat = []

# for tags in contents_predict_tags:
#     predict_tags_flat.extend(tags)

# predict_tags_count = Counter(predict_tags_flat)

# predict_tags_count.most_common(100)

# Test

In [57]:
# common_tags = [t[0] for t in predict_tags_count.most_common(100)]

In [45]:
# print(common_tags)

['can', 'water', 'work', 'got', 'well', 'light', 'idea', 'information', 'flight', 'planning', 'wire', 'main', 'remove', 'data', 'power', 'floor', 'current', 'heat', 'ground', 'kitchen', 'switch', 'food', 'security', 'algorithm', 'ceiling', 'control', 'space', 'building', 'metal', 'level', 'temperature', 'advice', 'fan', 'saw', 'nice', 'cover', 'human', 'plastic', 'cost', 'taste', 'breaker', 'pipe', 'hole', 'cell', 'circuit', 'research', 'bits', 'input', 'electric', 'oil', 'family', 'support', 'wood', 'green', 'search', 'gas', 'countries', 'produce', 'concrete', 'drain', 'service', 'post', 'salt', 'c', 'range', 'pan', 'fresh', 'keys', 'basement', 'damage', 'sugar', 'species', 'fridge', 'present', 'gene', 'sounds', 'neutral', 'frame', 'oven', 'life', 'paint', 'table', 'attack', 'seal', 'walls', 'internet', 'stuck', 'heater', 'methods', 'steps', 'outlets', 'pump', 'meat', 'garage', 'structure', 'color', 'gap', 'addition', 'bathroom', 'price']


In [12]:
print(len(cleansed_contents))
print(len(cleansed_contents[0]))
cleansed_contents[0]

87000
182


'prokaryotic translation critical efficient translation location ribosome binding site relative start codon ideally supposed b away start bases away even observable effect translation'

In [13]:
# words = []
# for w in cleansed_contents_words:
#     words.extend(w)
# print(len(words))

# extract content features

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(use_idf=False, \
                             stop_words = "english", \
                             tokenizer = None,    \
                             preprocessor = None, \
                             max_features = 10000)

# vectorizer = CountVectorizer(analyzer = "word",   \
#                              tokenizer = None,    \
#                              preprocessor = None, \
#                              stop_words = "english",   \
#                              max_features = 10000) 

In [16]:
contents_feature = vectorizer.fit_transform(cleansed_contents)
contents_feature.shape

(87000, 10000)

In [17]:
print(contents_feature[0])

  (0, 2879)	0.182574185835
  (0, 700)	0.182574185835
  (0, 602)	0.36514837167
  (0, 8753)	0.182574185835
  (0, 4305)	0.182574185835
  (0, 1641)	0.182574185835
  (0, 8490)	0.36514837167
  (0, 7250)	0.182574185835
  (0, 8127)	0.182574185835
  (0, 829)	0.182574185835
  (0, 7475)	0.182574185835
  (0, 5161)	0.182574185835
  (0, 2887)	0.182574185835
  (0, 2107)	0.182574185835
  (0, 9238)	0.547722557505
  (0, 6821)	0.182574185835


In [17]:
# vectorizer.get_feature_names()

# extract title features

In [18]:
# domains_titles = [df['title'].tolist() for df in df_trains]
# titles = [titles for domain_titles in domains_titles for titles in domain_titles]
# len(titles)

In [19]:
# cleansed_titles = [cleanse(title) for title in titles]
# len(cleansed_titles)

In [20]:
# cleansed_titles[:10]

In [21]:
# titles_feature = vectorizer.transform(cleansed_titles)
# titles_feature.shape

In [22]:
# print(titles_feature[:10])

# extract tags features

In [23]:
# tag_vectorizer = CountVectorizer(analyzer = "word",   \
#                              tokenizer = None,    \
#                              preprocessor = None, \
#                              stop_words = None) 

In [18]:
domains_tags = [df['tags'].tolist() for df in df_trains]
tags = [tags for domain_tags in domains_tags for tags in domain_tags]
len(tags)

87000

In [19]:
# tags_feature = tag_vectorizer.fit_transform(tags)
# tags_feature.shape

In [20]:
tags_split = [ts.split() for ts in tags]

In [21]:
tags_split[:10]

[['ribosome', 'binding-sites', 'translation', 'synthetic-biology'],
 ['rna', 'biochemistry'],
 ['immunology', 'cell-biology', 'hematology'],
 ['cell-culture'],
 ['splicing', 'mrna', 'spliceosome', 'introns', 'exons'],
 ['dna', 'biochemistry', 'molecular-biology'],
 ['neuroscience', 'synapses'],
 ['plasmids'],
 ['molecular-genetics', 'gene-expression', 'experimental-design'],
 ['evolution', 'mitochondria', 'chloroplasts']]

In [22]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
tags_feature = mlb.fit_transform(tags_split)

In [23]:
tags_feature.shape

(87000, 4268)

In [32]:
labels = list(mlb.classes_)

# Train

In [24]:
print(contents_feature.shape)
print(tags_feature.shape)

(87000, 10000)
(87000, 4268)


In [25]:
# # sparse to dense
# contents_feature_dense = contents_feature.toarray()

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier

forest = RandomForestClassifier(n_estimators = 100)
# multi_label_forest = OneVsRestClassifier(RandomForestClassifier(n_estimators = 100), n_jobs = -1)
multi_label_forest = OneVsRestClassifier(RandomForestClassifier(n_estimators = 100), n_jobs = 4)

# multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)

In [28]:
multi_label_forest.fit(contents_feature, tags_feature)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
          n_jobs=-1)

In [29]:
# from sklearn.externals import joblib

# joblib.dump(multi_label_forest, 'multi_label_forest_100.pkl')

['multi_label_forest_100.pkl']

In [None]:
Y_predict = multi_label_forest.predict(contents_feature)

In [None]:
Y_predict.shape

In [35]:
Y_predict[1][1]

0

In [41]:
def print_labels(Y):
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            if Y[i, j] != 0:
                print(labels[j])
        print()

In [42]:
print_labels(Y_predict)

binding-sites
ribosome
synthetic-biology
translation

biochemistry
rna

cell-biology
hematology
immunology



In [44]:
print_labels(tags_feature[:3])

binding-sites
ribosome
synthetic-biology
translation

biochemistry
rna

cell-biology
hematology
immunology



# Test

In [46]:
df_test = pd.read_csv(DATA_FOLDER + TEST_FILE)
df_test

Unnamed: 0,id,title,content
0,1,What is spin as it relates to subatomic partic...,<p>I often hear about subatomic particles havi...
1,2,What is your simplest explanation of the strin...,<p>How would you explain string theory to non ...
2,3,"Lie theory, Representations and particle physics",<p>This is a question that has been posted at ...
3,7,Will Determinism be ever possible?,<p>What are the main problems that we need to ...
4,9,Hamilton's Principle,<p>Hamilton's principle states that a dynamic ...
5,13,What is sound and how is it produced?,"<p>I've been using the term ""sound"" all my lif..."
6,15,What experiment would disprove string theory?,<p>I know that there's big controversy between...
7,17,Why does the sky change color? Why the sky is ...,<p>Why does the sky change color? Why the sky ...
8,19,How's the energy of particle collisions calcul...,<p>Physicists often refer to the energy of col...
9,21,Monte Carlo use,<p>Where is the Monte Carlo method used in phy...


In [None]:
test_contents = 

In [None]:
# remove [n]
# remove pure numbers, don't remove letter+number words, like CO2
# remove urls

In [None]:
# tf-idf over all contents
# same-category contents concat as one doc

In [None]:
# for each tag, find words that mostly exist in title and contents of that tag
# use entropy to do the above thing
# if a test content contains words only exist in , then the content is very likely about that domain

In [None]:
# as classification problem
# features: bow (normalized)
