In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

In [2]:
DATA_FOLDER = 'data/'
TRAIN_FILES = ['biology.csv', 'cooking.csv', 'crypto.csv', 'diy.csv', 'robotics.csv', 'travel.csv']
TEST_FILE = 'test.csv'

In [3]:
DOMAIN_COUNT = len(TRAIN_FILES)

In [4]:
df_trains = [pd.read_csv(DATA_FOLDER + filename) for filename in TRAIN_FILES]

In [5]:
for df in df_trains:
    print(df.shape)

(13196, 4)
(15404, 4)
(10432, 4)
(25918, 4)
(2771, 4)
(19279, 4)


In [6]:
sum([df.shape[0] for df in df_trains])

87000

In [7]:
df_trains[0].head()

Unnamed: 0,id,title,content,tags
0,1,What is the criticality of the ribosome bindin...,"<p>In prokaryotic translation, how critical fo...",ribosome binding-sites translation synthetic-b...
1,2,How is RNAse contamination in RNA based experi...,<p>Does anyone have any suggestions to prevent...,rna biochemistry
2,3,Are lymphocyte sizes clustered in two groups?,<p>Tortora writes in <em>Principles of Anatomy...,immunology cell-biology hematology
3,4,How long does antibiotic-dosed LB maintain goo...,<p>Various people in our lab will prepare a li...,cell-culture
4,5,Is exon order always preserved in splicing?,<p>Are there any cases in which the splicing m...,splicing mrna spliceosome introns exons


In [8]:
# # how many unique tags

# domain_unique_tags = [set() for i in range(DOMAIN_COUNT)]
# unique_tags = set()

# for i in range(DOMAIN_COUNT):
#     df = df_trains[i]
#     posts_tags = df['tags'].tolist()
#     for tags in posts_tags:
#         for tag in tags.split(' '):
#             domain_unique_tags[i].add(tag)
#             unique_tags.add(tag)

# print('domain_unique_tag counts:')
# s = 0
# for tag_set in domain_unique_tags:
#     print(len(tag_set))

# print('sum of domain_unique_tag counts:')
# print(sum([len(tag_set) for tag_set in domain_unique_tags]))
    
# print('all tag count:')
# print(len(unique_tags))


# Cleanse Data

In [9]:
import re
from bs4 import BeautifulSoup
# import nltk
# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

# remove [n]
# remove pure numbers, don't remove letter+number words, like CO2

# remove stopwords
# consider µL?
# remove formulas

def cleanse_html(content):
    return BeautifulSoup(content, "lxml").get_text()

# only reserve words. don't split sentences
def cleanse(content):
    # remove html tags
    content = BeautifulSoup(content, "lxml").get_text()

    # remove urls
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
#     # split content to sentences
#     sentences = tokenizer.tokenize(content)
    
#     # replace punctuations with whitespaces
#     content = re.sub(r"[^a-zA-Z0-9]"," ", content)
    
    # replace punctuations and numbers with whitespaces
    content = re.sub(r"[^a-zA-Z]"," ", content)
    
#     # remove words that don't contain english (pure number, number+symbol). reserve something like "CO2"
#     content = re.sub(r"[^a-zA-Z0-9]"," ", content)
    
    # convert to lowercase
    content = content.lower()
    
    # split into words
    words = content.split()
    
    # remove stopwords
    words = [word for word in words if word not in stops]
    
    # concat words to a string
    content = ' '.join(words)
    
    return content

In [10]:
domains_contents = [df['content'].tolist() for df in df_trains]  # list of list of strings
print(len(domains_contents))
print(len(domains_contents[0]))

6
13196


In [11]:
cleansed_contents = [cleanse(content) for domain_contents in domains_contents for content in domain_contents]

In [12]:
print(len(cleansed_contents))
print(len(cleansed_contents[0]))
cleansed_contents[0]

87000
182


'prokaryotic translation critical efficient translation location ribosome binding site relative start codon ideally supposed b away start bases away even observable effect translation'

In [13]:
# words = []
# for w in cleansed_contents_words:
#     words.extend(w)
# print(len(words))

# extract content features

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(use_idf=False, \
                             stop_words = "english", \
                             tokenizer = None,    \
                             preprocessor = None, \
                             max_features = 10000)

# vectorizer = CountVectorizer(analyzer = "word",   \
#                              tokenizer = None,    \
#                              preprocessor = None, \
#                              stop_words = "english",   \
#                              max_features = 10000) 

In [15]:
contents_feature = vectorizer.fit_transform(cleansed_contents)
contents_feature.shape

(87000, 10000)

In [16]:
print(contents_feature[0])

  (0, 2879)	0.182574185835
  (0, 700)	0.182574185835
  (0, 602)	0.36514837167
  (0, 8753)	0.182574185835
  (0, 4305)	0.182574185835
  (0, 1641)	0.182574185835
  (0, 8490)	0.36514837167
  (0, 7250)	0.182574185835
  (0, 8127)	0.182574185835
  (0, 829)	0.182574185835
  (0, 7475)	0.182574185835
  (0, 5161)	0.182574185835
  (0, 2887)	0.182574185835
  (0, 2107)	0.182574185835
  (0, 9238)	0.547722557505
  (0, 6821)	0.182574185835


In [17]:
# vectorizer.get_feature_names()

# extract title features

In [18]:
# domains_titles = [df['title'].tolist() for df in df_trains]
# titles = [titles for domain_titles in domains_titles for titles in domain_titles]
# len(titles)

In [19]:
# cleansed_titles = [cleanse(title) for title in titles]
# len(cleansed_titles)

In [20]:
# cleansed_titles[:10]

In [21]:
# titles_feature = vectorizer.transform(cleansed_titles)
# titles_feature.shape

In [22]:
# print(titles_feature[:10])

# extract tags features

In [23]:
# tag_vectorizer = CountVectorizer(analyzer = "word",   \
#                              tokenizer = None,    \
#                              preprocessor = None, \
#                              stop_words = None)

# tag_vectorizer.fit_transform(['apple-juice'])

In [24]:
# tag_vectorizer.get_feature_names()

In [25]:
domains_tags = [df['tags'].tolist() for df in df_trains]
train_tags = [tags for domain_tags in domains_tags for tags in domain_tags]
len(train_tags)

87000

In [26]:
# tags_feature = tag_vectorizer.fit_transform(train_tags)
# tags_feature.shape

In [27]:
tags_split = [ts.split() for ts in train_tags]

In [28]:
tags_split[:10]

[['ribosome', 'binding-sites', 'translation', 'synthetic-biology'],
 ['rna', 'biochemistry'],
 ['immunology', 'cell-biology', 'hematology'],
 ['cell-culture'],
 ['splicing', 'mrna', 'spliceosome', 'introns', 'exons'],
 ['dna', 'biochemistry', 'molecular-biology'],
 ['neuroscience', 'synapses'],
 ['plasmids'],
 ['molecular-genetics', 'gene-expression', 'experimental-design'],
 ['evolution', 'mitochondria', 'chloroplasts']]

In [29]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
tags_feature = mlb.fit_transform(tags_split)

In [30]:
tags_feature.shape

(87000, 4268)

In [31]:
labels = list(mlb.classes_)

# Train

In [32]:
print(contents_feature.shape)
print(tags_feature.shape)

(87000, 10000)
(87000, 4268)


In [33]:
# # sparse to dense
# contents_feature_dense = contents_feature.toarray()

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier

tree_count = 10

# forest = RandomForestClassifier(n_estimators = 10)
# multi_label_forest = OneVsRestClassifier(RandomForestClassifier(n_estimators = 100), n_jobs = -1)
multi_label_forest = OneVsRestClassifier(RandomForestClassifier(n_estimators = tree_count), n_jobs = 4)

# multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)

In [35]:
multi_label_forest.fit(contents_feature, tags_feature)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
          n_jobs=4)

In [36]:
from sklearn.externals import joblib

joblib.dump(multi_label_forest, 'multi_label_forest_10.pkl')

['multi_label_forest_10.pkl']

In [41]:
# from sklearn.externals import joblib

# multi_label_forest = joblib.load('multi_label_forest_100.pkl') 

In [37]:
Y_train_predict = multi_label_forest.predict(contents_feature)

In [38]:
def vec_to_tags(Y):
    tags = []
     
    for i in range(Y.shape[0]):
        post_tags = []
        for j in range(Y.shape[1]):
            if Y[i, j] != 0:
                post_tags.append(labels[j])
        tags.append(post_tags)
    
    return tags

In [41]:
# def print_labels(Y):
#     for i in range(Y.shape[0]):
#         for j in range(Y.shape[1]):
#             if Y[i, j] != 0:
#                 print(labels[j])
#         print()

In [39]:
post_tags_predict = vec_to_tags(Y_train_predict)

In [40]:
post_tags_predict[:10]

[['binding-sites', 'synthetic-biology', 'translation'],
 ['biochemistry', 'rna'],
 ['cell-biology', 'hematology'],
 ['cell-culture'],
 ['introns', 'spliceosome', 'splicing'],
 ['biochemistry', 'dna', 'molecular-biology'],
 ['neuroscience'],
 [],
 ['molecular-genetics'],
 ['chloroplasts']]

In [42]:
tags_split[:10]

[['ribosome', 'binding-sites', 'translation', 'synthetic-biology'],
 ['rna', 'biochemistry'],
 ['immunology', 'cell-biology', 'hematology'],
 ['cell-culture'],
 ['splicing', 'mrna', 'spliceosome', 'introns', 'exons'],
 ['dna', 'biochemistry', 'molecular-biology'],
 ['neuroscience', 'synapses'],
 ['plasmids'],
 ['molecular-genetics', 'gene-expression', 'experimental-design'],
 ['evolution', 'mitochondria', 'chloroplasts']]

# Test

In [43]:
df_test = pd.read_csv(DATA_FOLDER + TEST_FILE)
df_test.shape

(81926, 3)

In [44]:
cleansed_test_contents = [cleanse(content) for content in df_test['content'].tolist()]

In [45]:
cleansed_test_contents[:10]

['often hear subatomic particles property called spin also actually relate spinning axis like would think particles spin spin mean actual spinning motion',
 'would explain string theory non physicists specially interested plausible needed successfully prove',
 'question posted many different forums thought maybe someone would better conceptual answer seen physicists care representations lie groups think representation means sort group acting vector space vector space lie group acting certain things invariant group action maybe dumb question thought might good start clarify specifically thinking symmetry groups people think relation standard model care might certain group see group acting acting etc',
 'main problems need solve prove laplace determinism correct overcome uncertainty principle',
 'hamilton principle states dynamic system always follows path action integral stationary maximum minimum action integral stationary basis hamilton state principle',
 'using term sound life really

In [46]:
test_contents_features = vectorizer.transform(cleansed_test_contents)
test_contents_features.shape

(81926, 10000)

In [48]:
test_tags_features_predict = multi_label_forest.predict(test_contents_features)

In [49]:
def vec_to_tags_string(Y):
    tags = []
     
    for i in range(Y.shape[0]):
        post_tags = []
        for j in range(Y.shape[1]):
            if Y[i, j] != 0:
                post_tags.append(labels[j])
        tags.append(' '.join(post_tags))
    
    return tags

In [50]:
test_tags_strings = vec_to_tags_string(test_tags_features_predict)

In [52]:
test_tags_strings[:100]

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'cancer',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'biochemistry',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [None]:
# remove [n]
# remove pure numbers, don't remove letter+number words, like CO2
# remove urls

In [None]:
# tf-idf over all contents
# same-category contents concat as one doc

In [None]:
# for each tag, find words that mostly exist in title and contents of that tag
# use entropy to do the above thing
# if a test content contains words only exist in , then the content is very likely about that domain

In [None]:
# as classification problem
# features: bow (normalized)
