In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import FreqDist
from nltk.corpus import brown
from gensim.models import word2vec
import time
from gensim.models import phrases


tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
nltk_stopwords = set(stopwords.words("english"))
# frequency_list = FreqDist(i.lower() for i in brown.words())

In [3]:
len(nltk_stopwords)

153

In [4]:
DATA_FOLDER = 'data/'
MODEL_FOLDER = 'model/'
OUTPUT_FOLDER = 'output/'
TRAIN_FILES = ['biology.csv', 'cooking.csv', 'crypto.csv', 'diy.csv', 'robotics.csv', 'travel.csv']
TEST_FILE = 'test.csv'

In [5]:
DOMAIN_COUNT = len(TRAIN_FILES)

# Load Data

In [6]:
df_trains = [pd.read_csv(DATA_FOLDER + filename) for filename in TRAIN_FILES]

In [7]:
for df in df_trains:
    print(df.shape)

(13196, 4)
(15404, 4)
(10432, 4)
(25918, 4)
(2771, 4)
(19279, 4)


In [8]:
sum([df.shape[0] for df in df_trains])

87000

In [9]:
df_trains[0].head(10)

Unnamed: 0,id,title,content,tags
0,1,What is the criticality of the ribosome bindin...,"<p>In prokaryotic translation, how critical fo...",ribosome binding-sites translation synthetic-b...
1,2,How is RNAse contamination in RNA based experi...,<p>Does anyone have any suggestions to prevent...,rna biochemistry
2,3,Are lymphocyte sizes clustered in two groups?,<p>Tortora writes in <em>Principles of Anatomy...,immunology cell-biology hematology
3,4,How long does antibiotic-dosed LB maintain goo...,<p>Various people in our lab will prepare a li...,cell-culture
4,5,Is exon order always preserved in splicing?,<p>Are there any cases in which the splicing m...,splicing mrna spliceosome introns exons
5,6,How can I avoid digesting protein-bound DNA?,<p>I'm interested in sequencing and analyzing ...,dna biochemistry molecular-biology
6,8,Under what conditions do dendritic spines form?,<p>I'm looking for resources or any informatio...,neuroscience synapses
7,9,How should I ship plasmids?,<p>I shipped 10 µL of my vector miniprep to a ...,plasmids
8,10,What is the reason behind choosing the reporte...,<p>I noticed within example experiments in cla...,molecular-genetics gene-expression experimenta...
9,11,How many times did endosymbiosis occur?,"<p>According to the endosymbiont theory, mitoc...",evolution mitochondria chloroplasts


In [10]:
train_0_tags = df_trains[0]['tags'].tolist()
train_0_tags[:20]

['ribosome binding-sites translation synthetic-biology',
 'rna biochemistry',
 'immunology cell-biology hematology',
 'cell-culture',
 'splicing mrna spliceosome introns exons',
 'dna biochemistry molecular-biology',
 'neuroscience synapses',
 'plasmids',
 'molecular-genetics gene-expression experimental-design',
 'evolution mitochondria chloroplasts',
 'high-throughput cell-based',
 'molecular-biology synthetic-biology',
 'bioinformatics homework',
 'neuroscience immunology',
 'splicing histone',
 'genomics gene-annotation exons',
 'microbiology virology influenza',
 'epigenetics',
 'molecular-biology dna-isolation',
 'cell-membrane adaptation cell-biology']

In [11]:
train_0_tag_set = set()
for tags in train_0_tags:
    for tag in tags.split():
        train_0_tag_set.add(tag)
print(len(train_0_tag_set))

train_0_unique_tags = list(train_0_tag_set)

print(train_0_unique_tags[:10])

678
['reflexes', 'chloroplasts', 'operons', 'dnapolymerase', 'cytogenetics', 'cloning', 'inflammation', 'dna-replication', 'mutations', 'translation']


In [12]:
df_test = pd.read_csv(DATA_FOLDER + TEST_FILE)
print(df_test.shape)
df_test.head(10)

(81926, 3)


Unnamed: 0,id,title,content
0,1,What is spin as it relates to subatomic partic...,<p>I often hear about subatomic particles havi...
1,2,What is your simplest explanation of the strin...,<p>How would you explain string theory to non ...
2,3,"Lie theory, Representations and particle physics",<p>This is a question that has been posted at ...
3,7,Will Determinism be ever possible?,<p>What are the main problems that we need to ...
4,9,Hamilton's Principle,<p>Hamilton's principle states that a dynamic ...
5,13,What is sound and how is it produced?,"<p>I've been using the term ""sound"" all my lif..."
6,15,What experiment would disprove string theory?,<p>I know that there's big controversy between...
7,17,Why does the sky change color? Why the sky is ...,<p>Why does the sky change color? Why the sky ...
8,19,How's the energy of particle collisions calcul...,<p>Physicists often refer to the energy of col...
9,21,Monte Carlo use,<p>Where is the Monte Carlo method used in phy...


# Prepare data

In [13]:
test_contents = df_test['content'].tolist()
test_titles = df_test['title'].tolist()
test_titles_contents = (df_test['title'] + ' ' + df_test['content']).tolist()

# Cleanse Data

In [14]:
# remove [n]
# remove pure numbers, don't remove letter+number words, like CO2

# remove stopwords
# consider µL?
# remove formulas


numbers = set('0123456789-')

def is_useful_word(word):
    return (word not in useless_words) and (len(word) > 2) and (re.search('[a-z]', word)) and (word[0] not in numbers)
    

def cleanse_html(content):
    content = BeautifulSoup(content, "lxml").get_text()

    # remove urls
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    return content
    

def cleanse_lower_split(content):
    # remove html tags
    content = BeautifulSoup(content, "lxml").get_text()
    
    # remove urls
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    # replace punctuations and numbers with whitespaces
    content = re.sub(r"[^a-zA-Z0-9\-]"," ", content)
        
    # convert to lowercase
    content = content.lower()
    
    words = content.split()
    
    # remove useless words
    # stopwords, pure numbers, short words
    words = [word for word in words if is_useful_word(word)]
    
    return words

# only reserve words.
def cleanse(content):
    # remove html tags
    content = BeautifulSoup(content, "lxml").get_text()
    
    # remove urls
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    # replace punctuations and numbers with whitespaces
    content = re.sub(r"[^a-zA-Z0-9\-]"," ", content)
        
    # convert to lowercase
    content = content.lower()
    
    words = content.split()
    
    # remove useless words
    # stopwords, pure numbers, short words
    words = [word for word in words if is_useful_word(word)]
    
    # concat words to a string
    content = ' '.join(words)
    
    return content


In [33]:
def is_word(s):
    # if s start with english letter 
    return (re.search('[a-z]', s[0]))

def cleanse_and_split_sentence(s):
    # input a sentence, output a list of words
    s = re.sub(r"[^a-zA-Z0-9]"," ", s)
    word_seq = s.lower().split()
    word_seq = [word for word in word_seq if is_word(word)]
        
    return word_seq
    
# cleanse and cut sentences
def cleanse_content_for_w2v(content):
    # output format: [['i', 'don', 't', 'have', 'an', 'apple-pen'], ['there', 's', 'co2', 'in']]
    content = BeautifulSoup(content, "lxml").get_text()
    
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    sentences = tokenizer.tokenize(content)
    
    word_seqs = []
    for s in sentences:
        word_seq = cleanse_and_split_sentence(s)
        # if len(word_seq) >= 3:
        word_seqs.append(word_seq)
    
    return word_seqs

# not removing stopwords here
def cleanse_content_for_tfidf(content):
    # output format: [['i', 'don', 't', 'have', 'an', 'apple-pen'], ['there', 's', 'co2', 'in']]
    content = BeautifulSoup(content, "lxml").get_text()
    
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    word_seq = cleanse_and_split_sentence(content)
    
    return word_seq

In [34]:
content = "I don't 'have' an apple-pen. There's 256 CO2 in 10km^3!"
cleanse_content_for_w2v(content)

[['i', 'don', 't', 'have', 'an', 'apple', 'pen'], ['there', 's', 'co2', 'in']]

In [35]:
for content in test_contents[:3]:
    print(cleanse_content_for_w2v(content))
    print('----')

[['i', 'often', 'hear', 'about', 'subatomic', 'particles', 'having', 'a', 'property', 'called', 'spin', 'but', 'also', 'that', 'it', 'doesn', 't', 'actually', 'relate', 'to', 'spinning', 'about', 'an', 'axis', 'like', 'you', 'would', 'think'], ['which', 'particles', 'have', 'spin'], ['what', 'does', 'spin', 'mean', 'if', 'not', 'an', 'actual', 'spinning', 'motion']]
----
[['how', 'would', 'you', 'explain', 'string', 'theory', 'to', 'non', 'physicists', 'such', 'as', 'myself'], ['i', 'm', 'specially', 'interested', 'on', 'how', 'plausible', 'is', 'it', 'and', 'what', 'is', 'needed', 'to', 'successfully', 'prove', 'it']]
----
[['this', 'is', 'a', 'question', 'that', 'has', 'been', 'posted', 'at', 'many', 'different', 'forums', 'i', 'thought', 'maybe', 'someone', 'here', 'would', 'have', 'a', 'better', 'or', 'more', 'conceptual', 'answer', 'than', 'i', 'have', 'seen', 'before', 'why', 'do', 'physicists', 'care', 'about', 'representations', 'of', 'lie', 'groups'], ['for', 'myself', 'when',

# Prepare w2v training data

In [54]:
def print_line(lst):
    for s in lst:
        print(s)

In [36]:
cleansed_train_domains_titles = []
for df in df_trains:
    word_seqs = []
    for title in df['title'].tolist():
        word_seqs.extend(cleanse_content_for_w2v(title))
    cleansed_train_domains_titles.append(word_seqs)

cleansed_train_domains_contents = []
for df in df_trains:
    word_seqs = []
    for content in df['content'].tolist():
        word_seqs.extend(cleanse_content_for_w2v(content))
    cleansed_train_domains_contents.append(word_seqs)

In [37]:
for cleansed_train_titles in cleansed_train_domains_titles:
    print(len(cleansed_train_titles))
print()
for cleansed_train_contents in cleansed_train_domains_contents:
    print(len(cleansed_train_contents))
print(cleansed_train_domains_titles[5][:3])
print(cleansed_train_domains_contents[5][:3])

13518
15786
10614
26574
2824
20001

69511
82281
69281
180431
19551
104301
[['what', 'are', 'some', 'caribbean', 'cruises', 'for', 'october'], ['how', 'can', 'i', 'find', 'a', 'guide', 'that', 'will', 'take', 'me', 'safely', 'through', 'the', 'amazon', 'jungle'], ['does', 'singapore', 'airlines', 'offer', 'any', 'reward', 'seats', 'on', 'their', 'ewr', 'sin', 'route']]
[['my', 'fianc', 'e', 'and', 'i', 'are', 'looking', 'for', 'a', 'good', 'caribbean', 'cruise', 'in', 'october', 'and', 'were', 'wondering', 'which', 'islands', 'are', 'best', 'to', 'see', 'and', 'which', 'cruise', 'line', 'to', 'take'], ['it', 'seems', 'like', 'a', 'lot', 'of', 'the', 'cruises', 'don', 't', 'run', 'in', 'this', 'month', 'due', 'to', 'hurricane', 'season', 'so', 'i', 'm', 'looking', 'for', 'other', 'good', 'options'], ['edit', 'we', 'll', 'be', 'travelling', 'in']]


In [38]:
df = df_test

cleansed_test_titles = []
for title in df['title'].tolist():
    cleansed_test_titles.extend(cleanse_content_for_w2v(title))

cleansed_test_contents = []
for content in df['content'].tolist():
    cleansed_test_contents.extend(cleanse_content_for_w2v(content))

In [39]:
print(len(cleansed_test_titles))
print(len(cleansed_test_contents))
print(cleansed_test_titles[:3])
print(cleansed_test_contents[:3])

83757
494470
[['what', 'is', 'spin', 'as', 'it', 'relates', 'to', 'subatomic', 'particles'], ['what', 'is', 'your', 'simplest', 'explanation', 'of', 'the', 'string', 'theory'], ['lie', 'theory', 'representations', 'and', 'particle', 'physics']]
[['i', 'often', 'hear', 'about', 'subatomic', 'particles', 'having', 'a', 'property', 'called', 'spin', 'but', 'also', 'that', 'it', 'doesn', 't', 'actually', 'relate', 'to', 'spinning', 'about', 'an', 'axis', 'like', 'you', 'would', 'think'], ['which', 'particles', 'have', 'spin'], ['what', 'does', 'spin', 'mean', 'if', 'not', 'an', 'actual', 'spinning', 'motion']]


In [43]:
cleansed_all_w2v = []
cleansed_train_titles_contents_w2v = []
cleansed_test_titles_contents_w2v = []

for cleansed_train_titles in cleansed_train_domains_titles:
    cleansed_train_titles_contents_w2v.extend(cleansed_train_titles)
for cleansed_train_contents in cleansed_train_domains_contents:
    cleansed_train_titles_contents_w2v.extend(cleansed_train_contents)
    
cleansed_test_titles_contents_w2v.extend(cleansed_test_titles)
cleansed_test_titles_contents_w2v.extend(cleansed_test_contents)

cleansed_all_w2v.extend(cleansed_train_titles_contents_w2v)
cleansed_all_w2v.extend(cleansed_test_titles_contents_w2v)

In [49]:
train_sentence_count = len(cleansed_train_titles_contents_w2v)
test_sentence_count = len(cleansed_test_titles_contents_w2v)
all_sentence_count = len(cleansed_all_w2v)
print(train_sentence_count)
print(test_sentence_count)
print(all_sentence_count)

614673
578227
1192900


### connect phrases by bigram

In [154]:
def connect_phrases_bigram(word_seqs):
    bigram_transformer = phrases.Phrases(word_seqs, min_count=20, threshold=1)
    word_seqs_phrased_bigram = []
    for word_seq in word_seqs:
        word_seq_phrased = bigram_transformer[word_seq]
        word_seq_phrased = [re.sub(r'_', '-', word) for word in word_seq_phrased]
        word_seqs_phrased_bigram.append(word_seq_phrased)
    
    return bigram_transformer, word_seqs_phrased_bigram

In [138]:
test_bigram, test_phrased_bigram = connect_phrases_bigram(cleansed_test_titles_contents_w2v)

In [146]:
print_line(test_phrased_bigram[100:110])

['is', 'electricity', 'instantaneous']
['how-can', 'i', 'determine', 'transmission', 'reflection', 'coefficients', 'for', 'light']
['the', 'principle-behind', 'door', 'peepholes']
['physics', 'and', 'computer-science']
['why-don', 't', 'electric', 'fish', 'shock', 'themselves']
['why', 'were', 'the', 'si', 'base', 'quantities', 'chosen', 'as', 'such']
['how-can', 'i', 'measure', 'the', 'mass', 'of', 'the', 'earth', 'at-home']
['home', 'experiments', 'to-derive', 'the', 'speed-of', 'light']
['symmetrical', 'twin-paradox']
['getting', 'started', 'self-studying', 'general-relativity']


In [141]:
test_bigram.save('test_bigram_count20_threshold1')

In [142]:
test_bigram_vocab_dict = dict(test_bigram.vocab)

In [143]:
import operator
sorted_test_bigram_vocab_dict = sorted(test_bigram_vocab_dict.items(), key=operator.itemgetter(1), reverse=True)

In [147]:
sorted_test_bigram_vocab_dict[100:110]

[(b'no', 15814),
 (b'system', 15786),
 (b'could', 15610),
 (b'd', 15509),
 (b'particle', 15340),
 (b'c', 15248),
 (b'get', 15199),
 (b'for_the', 15194),
 (b'its', 15107),
 (b'what_is', 14863)]

## Remove stopwords then phrase

In [148]:
with open('model/stopwords.txt') as f:
    long_stopwords = f.read().splitlines()

stopwords = set(long_stopwords) | nltk_stopwords

for word in list(stopwords):
    for subword in re.sub(r"[^a-z]", ' ', word).split():
        stopwords.add(subword)
    stopwords.add(re.sub(r"'", '-', word))

In [149]:
len(stopwords)

722

In [150]:
def remove_stopwords(word_seqs):
    removed = []
    for word_seq in word_seqs:
        removed.append([word for word in word_seq if word not in stopwords])
    return removed

In [151]:
cleansed_test_titles_contents_no_stopwords = remove_stopwords(cleansed_test_titles_contents_w2v)

In [153]:
cleansed_test_titles_contents_no_stopwords[:5]

[['spin', 'relates', 'subatomic', 'particles'],
 ['simplest', 'explanation', 'string', 'theory'],
 ['lie', 'theory', 'representations', 'particle', 'physics'],
 ['determinism'],
 ['hamilton', 'principle']]

In [155]:
test_bigram_nostop, test_phrased_bigram_nostop = connect_phrases_bigram(cleansed_test_titles_contents_no_stopwords)

In [156]:
test_phrased_bigram_nostop[:30]

[['spin', 'relates', 'subatomic-particles'],
 ['simplest', 'explanation', 'string-theory'],
 ['lie', 'theory', 'representations', 'particle-physics'],
 ['determinism'],
 ['hamilton-principle'],
 ['sound-produced'],
 ['experiment', 'disprove', 'string-theory'],
 ['sky', 'change-color'],
 ['sky-blue', 'day', 'red', 'sunrise', 'set', 'black', 'night'],
 ['energy', 'particle', 'collisions', 'calculated'],
 ['monte-carlo'],
 ['leaning', 'banking', 'help', 'turning', 'bicycle'],
 ['velocity-object', 'electromagnetic-field'],
 ['difference', 'measurement', 'interaction', 'quantum-mechanics'],
 ['calculate-average', 'speed'],
 ['lay', 'explanation', 'special-theory', 'relativity'],
 ['coriolis', 'irrelevant', 'whirl', 'vortex', 'sink', 'bathtub'],
 ['magnets', 'energy', 'repel'],
 ['check', 'einstein-equations', 'correspondence', 'real'],
 ['impressions', 'topological', 'field-theories', 'mathematics'],
 ['capacitive', 'screen', 'sensing'],
 ['magnets', 'spin', 'positioned', 'precisely'],
 ['l

In [178]:
test_titles_phrased_bigram_nostop = test_phrased_bigram_nostop[:len(cleansed_test_titles)]

In [179]:
len(test_titles_phrased_bigram_nostop)

83757

In [180]:
test_titles_phrased_bigram_nostop[-1]

['gravity', 'manipulation']

In [157]:
test_bigram_nostop.save('test_bigram_nostop_count20_threshold1')

In [None]:
all_bigram, all_phrased_bigram = connect_phrases_bigram(cleansed_all_w2v)

In [46]:
def connect_phrases(word_seqs):
    bigram_transformer = phrases.Phrases(word_seqs)
    word_seqs_phrased_bigram = []
    for word_seq in word_seqs:
        word_seq_phrased = bigram_transformer[word_seq]
        word_seq_phrased = [re.sub(r'_', '-', word) for word in word_seq_phrased]
        word_seqs_phrased_bigram.append(word_seq_phrased)
    
    trigram_transformer = phrases.Phrases(word_seqs_phrased_bigram)
    word_seqs_phrased_trigram = []
    for word_seq in word_seqs_phrased_bigram:
        word_seq_phrased = trigram_transformer[word_seq]
        word_seq_phrased = [re.sub(r'_', '-', word) for word in word_seq_phrased]
        word_seqs_phrased_trigram.append(word_seq_phrased)
    return bigram_transformer, trigram_transformer, word_seqs_phrased_bigram, word_seqs_phrased_trigram

In [47]:
all_bigram, all_trigram_transformer, all_phrased_bigram, all_phrased_trigram = connect_phrases(cleansed_all_w2v)

In [57]:
print_line(all_phrased_trigram[train_sentence_count:train_sentence_count+10])

['what', 'is', 'spin', 'as', 'it', 'relates', 'to', 'subatomic-particles']
['what', 'is', 'your', 'simplest', 'explanation', 'of', 'the', 'string-theory']
['lie', 'theory', 'representations', 'and', 'particle', 'physics']
['will', 'determinism', 'be', 'ever', 'possible']
['hamilton-s-principle']
['what', 'is', 'sound', 'and', 'how', 'is', 'it', 'produced']
['what', 'experiment', 'would', 'disprove-string-theory']
['why', 'does', 'the', 'sky', 'change', 'color']
['why', 'the', 'sky', 'is', 'blue', 'during', 'the', 'day', 'red', 'during', 'sunrise', 'set', 'and', 'black', 'during', 'the', 'night']
['how', 's', 'the', 'energy', 'of', 'particle', 'collisions', 'calculated']


In [53]:
test_bigram, test_trigram, test_phrased_bigram, test_phrased_trigram = connect_phrases(cleansed_test_titles_contents_w2v)

In [176]:
# print_line(test_phrased_bigram[:20])

In [177]:
# print_line(test_phrased_trigram[:20])

In [59]:
test_bigram_vocab = dict(test_bigram.vocab)

In [66]:
test_bigram_vocab_words = list(test_bigram_vocab.keys())

In [67]:
len(test_bigram_vocab_words)

1410768

In [71]:
test_bigram_vocab_words[100000:100000+10]

[b'photograph_was',
 b'visual_aspect',
 b't_torus',
 b'metric_correspond',
 b'fresnel_it',
 b'desperate_hand',
 b'made_thick',
 b'rearranging_s',
 b'generate_pulse',
 b'they_know']

In [226]:
cleansed_test_titles_for_tfidf = [cleanse_content_for_tfidf(title) for title in df_test['title'].tolist()]

In [227]:
cleansed_test_titles_for_tfidf[:5]

[['what', 'is', 'spin', 'as', 'it', 'relates', 'to', 'subatomic', 'particles'],
 ['what',
  'is',
  'your',
  'simplest',
  'explanation',
  'of',
  'the',
  'string',
  'theory'],
 ['lie', 'theory', 'representations', 'and', 'particle', 'physics'],
 ['will', 'determinism', 'be', 'ever', 'possible'],
 ['hamilton', 's', 'principle']]

In [304]:
cleansed_test_titles_bigram = connect_phrases_bigram(cleansed_test_titles_for_tfidf)
cleansed_test_titles_trigram = connect_phrases_trigram(cleansed_test_titles_bigram)

In [305]:
for ws in cleansed_test_titles_trigram[:20]:
    print(ws)

['what', 'is', 'spin', 'as', 'it', 'relates', 'to', 'subatomic-particles']
['what', 'is', 'your', 'simplest', 'explanation', 'of', 'the', 'string-theory']
['lie', 'theory', 'representations', 'and', 'particle-physics']
['will', 'determinism', 'be', 'ever', 'possible']
['hamilton-s-principle']
['what', 'is', 'sound', 'and', 'how', 'is', 'it', 'produced']
['what', 'experiment', 'would', 'disprove-string-theory']
['why', 'does', 'the', 'sky', 'change', 'color', 'why', 'the', 'sky', 'is', 'blue', 'during', 'the', 'day', 'red', 'during', 'sunrise', 'set', 'and', 'black', 'during', 'the', 'night']
['how', 's', 'the', 'energy', 'of', 'particle', 'collisions', 'calculated']
['monte-carlo', 'use']
['does', 'leaning', 'banking', 'help', 'cause', 'turning', 'on', 'a', 'bicycle']
['velocity', 'of', 'object', 'from', 'electromagnetic-field']
['what', 'is', 'the', 'difference-between', 'a', 'measurement', 'and', 'any', 'other', 'interaction', 'in', 'quantum-mechanics']
['how', 'to', 'calculate', 'av

In [229]:
cleansed_test_contents_bigram = connect_phrases_bigram(cleansed_test_contents)
cleansed_test_contents_trigram = connect_phrases_trigram(cleansed_test_contents_bigram)

# Train w2v

In [50]:
# Set values for various parameters
num_features = 500    # Word vector dimensionality                      
min_word_count = 10   # Minimum word count                        
num_workers = 3       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words
num_iter = 1

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
import time

start = time.time()

print("Training model...")
model = word2vec.Word2Vec(w2v_training_data, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, iter=num_iter)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = 'w2v_{}feature_{}minwords_{}iter'.format(num_features, min_word_count, num_iter)
model.save(MODEL_FOLDER + model_name)


end = time.time()
print('elapsed: {} seconds'.format(end - start))

Training model...
elapsed: 90.07816433906555 seconds


In [66]:
model.syn0.shape

(29161, 500)

In [70]:
model.most_similar("molecular-biology")

KeyError: "word 'molecular-biology' not in vocabulary"

In [69]:
len(train_0_unique_tags)

678

In [87]:
train_0_unique_tags

['mass-spectrometry',
 'biological-networks',
 'cancer',
 'exercise',
 'kidney',
 'protein-expression',
 'molecular-evolution',
 'healing',
 'vessel',
 'chronic',
 'gene-annotation',
 'temperature',
 'parasitism',
 'mycology',
 'synthetic-biology',
 'growth-media',
 'sex-ratio',
 'macroevolution',
 'sociobiology',
 'development',
 'vitamins',
 'autoreceptor',
 'host-pathogen-interaction',
 'reptile',
 'gel-electrophoresis',
 'snp',
 'peripheral-nervous-system',
 'alcohol',
 'small-rnaseq',
 'sensation',
 'gastroenterology',
 '3d-structure',
 'plasmids',
 'mouse',
 'muscles',
 'microarray',
 'population-biology',
 'units',
 'reverse-transcription',
 'artificial-selection',
 'dna-methylation',
 'proteins',
 'homeostasis',
 'anthropology',
 'anaerobic-respiration',
 'dna',
 'bioinformatics',
 'ebola',
 'sexuality',
 'gustation',
 'electrical-stimulation',
 'pedigree',
 'biodiversity',
 'hallucinogens',
 'reproductive-biology',
 'cell-based',
 'altruism',
 'amino-acids',
 'blood-transfusio

In [64]:
train_0_unique_tags_with_hyphen = [tag for tag in train_0_unique_tags if '-' in tag]

In [65]:
len(train_0_unique_tags_with_hyphen)

204

In [67]:
train_0_unique_tags_with_hyphen_occur_over_10 = [tag for tag in train_0_unique_tags_with_hyphen if tag in model.vocab]

In [68]:
len(train_0_unique_tags_with_hyphen_occur_over_10)

5

In [71]:
keyword = 'molecular-biology'
counter_title = 0

for word_seq in cleansed_train_domains_titles[0]:
    for word in word_seq:
        if word == keyword:
            counter_title += 1
print(counter_title)

0


In [72]:
keyword = 'molecular-biology'
counter_content = 0

for word_seq in cleansed_train_domains_contents[0]:
    for word in word_seq:
        if word == keyword:
            counter_content += 1
print(counter_content)

1


In [74]:
keywords = ['molecular', 'biology']
counter = 0

for word_seq in cleansed_train_domains_titles[0]:
    for i in range(len(word_seq)-len(keywords)+1):
        if word_seq[i] == keywords[0] and word_seq[i+1] == keywords[1]:
            counter += 1
print(counter)

9


# Phrase then w2v

In [77]:

bigram_transformer = phrases.Phrases(w2v_training_data)

In [90]:
w2v_training_data_phrased = []
for word_seq in w2v_training_data:
    word_seq_phrased = bigram_transformer[word_seq]
    word_seq_phrased = [re.sub(r'_', '-', word) for word in word_seq_phrased]
    w2v_training_data_phrased.append(word_seq_phrased)

In [91]:
w2v_training_data_phrased[:20]

[['what',
  'is',
  'the',
  'criticality',
  'of',
  'the',
  'ribosome-binding',
  'site',
  'relative',
  'to',
  'the',
  'start-codon',
  'in',
  'prokaryotic',
  'translation'],
 ['how',
  'is',
  'rnase-contamination',
  'in',
  'rna',
  'based',
  'experiments',
  'prevented'],
 ['are', 'lymphocyte', 'sizes', 'clustered', 'in', 'two', 'groups'],
 ['how',
  'long',
  'does',
  'antibiotic-dosed',
  'lb',
  'maintain',
  'good',
  'selection'],
 ['is', 'exon', 'order', 'always', 'preserved', 'in', 'splicing'],
 ['how', 'can', 'i', 'avoid', 'digesting', 'protein-bound', 'dna'],
 ['under', 'what', 'conditions', 'do', 'dendritic', 'spines', 'form'],
 ['how', 'should', 'i', 'ship', 'plasmids'],
 ['what',
  'is',
  'the',
  'reason-behind',
  'choosing',
  'the',
  'reporter-gene',
  'when',
  'experimenting',
  'on',
  'your',
  'gene',
  'of',
  'interest'],
 ['how', 'many-times', 'did', 'endosymbiosis', 'occur'],
 ['how',
  'to',
  'reduce',
  'edge',
  'effects',
  'in',
  'cell-b

In [103]:
# Set values for various parameters
num_features = 500    # Word vector dimensionality                      
min_word_count = 5   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words
num_iter = 5

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
import time

start = time.time()

print("Training model...")
model = word2vec.Word2Vec(w2v_training_data_phrased, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, iter=num_iter)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = 'w2v_phrase_{}feature_{}minwords_{}iter'.format(num_features, min_word_count, num_iter)
model.save(MODEL_FOLDER + model_name)

end = time.time()
print('elapsed: {} seconds'.format(end - start))

Training model...
elapsed: 320.4764587879181 seconds


In [None]:
train_0_unique_tags_with_hyphen_in_model = [tag for tag in train_0_unique_tags_with_hyphen if tag in model.vocab]
len(train_0_unique_tags_with_hyphen_in_model)

In [104]:
trigram_transformer = phrases.Phrases(w2v_training_data_phrased)

In [334]:
len(w2v_training_data_phrased_trigram)

1163379

In [107]:
w2v_training_data_phrased_trigram = []
for word_seq in w2v_training_data_phrased:
    word_seq_phrased = trigram_transformer[word_seq]
    word_seq_phrased = [re.sub(r'_', '-', word) for word in word_seq_phrased]
    w2v_training_data_phrased_trigram.append(word_seq_phrased)

In [335]:
# Set values for various parameters
num_features = 200    # Word vector dimensionality                      
min_word_count = 5   # Minimum word count                        
num_iter = 5
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)

start = time.time()

print("Training model...")
model = word2vec.Word2Vec(w2v_training_data_phrased_trigram, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, iter=num_iter)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = 'w2v_phrase_trigram_{}feature_{}minwords_{}iter'.format(num_features, min_word_count, num_iter)
model.save(MODEL_FOLDER + model_name)

end = time.time()
print('elapsed: {} seconds'.format(end - start))

Training model...
elapsed: 155.5711395740509 seconds


In [121]:
train_0_unique_tags_with_hyphen_in_model = [tag for tag in train_0_unique_tags_with_hyphen if tag in model.vocab]
len(train_0_unique_tags_with_hyphen_in_model)

109

In [123]:
model.most_similar('spin')

[('spins', 0.7283067107200623),
 ('orbital-angular-momentum', 0.6553120613098145),
 ('total-spin', 0.6381109952926636),
 ('magnetic-moment', 0.6370653510093689),
 ('helicity', 0.6061174869537354),
 ('spin-angular-momentum', 0.6009663343429565),
 ('angular-momentum', 0.5679385662078857),
 ('total-angular-momentum', 0.564512312412262),
 ('parity', 0.5601943135261536),
 ('spin-up', 0.550516664981842)]

In [124]:
w2v_model = model

# Train Tag Classifier

In [129]:
train_domains_tag_set = []

tag_set = set()

for df in df_trains:
    tag_set = set()
    for tags in df['tags'].tolist():
        for tag in tags.split():
            tag_set.add(tag)
    print(len(tag_set))
    train_domains_tag_set.append(tag_set)

678
736
392
734
231
1645


In [131]:
tag_count = 0
for tag_set in train_domains_tag_set:
    tag_count += len(tag_set)
tag_count

4416

In [130]:
train_tag_set = set()
for tag_set in train_domains_tag_set:
    train_tag_set |= tag_set
print(len(train_tag_set))

4268


In [None]:
train_sentence_count

In [136]:
test_sentence_count = len(cleansed_test_contents) + len(cleansed_test_titles)
print(test_sentence_count)
train_sentence_count = len(w2v_training_data_phrased_trigram) - test_sentence_count
print(train_sentence_count)
print(len(w2v_training_data))

563221
600158
1163379


In [151]:
train_not_tag_set = set()

for sentence in w2v_training_data_phrased_trigram[:train_sentence_count]:
    for word in sentence:
        if word not in train_tag_set:
            train_not_tag_set.add(word)

print(len(train_not_tag_set))

133804


In [143]:
test_possible_tag_set = set()

for sentence in w2v_training_data_phrased_trigram[train_sentence_count:]:
    for word in sentence:
        if (word not in train_not_tag_set) and (word in model.vocab):
            test_possible_tag_set.add(word)

print(len(test_possible_tag_set))

16926


In [144]:
test_possible_tag_list = list(test_possible_tag_set)

In [147]:
test_possible_tag_list[:10]

['lie-group-g',
 'shifman',
 'mass-spectrometry',
 'spin-spin-interaction',
 'euclidean-ads',
 'classical-electrodynamics',
 'b-l',
 'bragg-scattering',
 'integrating-factor',
 'freeman-dyson']

In [154]:
'quantum-mechanics' in test_possible_tag_set

False

In [155]:
'quantum-mechanics' in train_not_tag_set

True

# phrase then tfidf

In [None]:
# cleansed_test_titles_trigram
# cleansed_test_contents_trigram

In [None]:
with open('model/stopwords.txt') as f:
    long_stopwords = f.read().splitlines()

stopwords = set(long_stopwords) | nltk_stopwords | set('question problem change point number size help definition equation proof physics state function derivation'.split())

for word in list(stopwords):
    for subword in re.sub(r"[^a-z]", ' ', word).split():
        stopwords.add(subword)
    stopwords.add(re.sub(r"'", '-', word))

In [None]:
# test_titles_phrased_bigram_nostop

In [None]:
test_title_for_tfidf = []
for word_seq in test_titles_phrased_bigram_nostop:
    test_title_for_tfidf.append( ' '.join([word for word in word_seq if word not in stopwords]) )

In [None]:
test_title_for_tfidf

In [186]:
def predict_tags_from_cleansed_contents(cleansed_contents, max_features):
    vectorizer = TfidfVectorizer(use_idf=False, stop_words = "english", \
                                 tokenizer = str.split,    \
                                 preprocessor = None, \
                                 max_features = max_features)
    features = vectorizer.fit_transform(cleansed_contents)
    feature_words = vectorizer.get_feature_names()
    
    predicted_tags = []

    for feature in features:
        values = feature.data
        _, word_idxs = feature.nonzero()

        index_sorted = np.argsort(values)[::-1]
        keywords = [feature_words[wid] for wid in word_idxs[index_sorted]]
        predicted_tags.append(' '.join(keywords))
    
    return predicted_tags

In [187]:
test_title_predicted_tags = predict_tags_from_cleansed_contents(test_title_for_tfidf, 5000)

AttributeError: 'list' object has no attribute 'lower'

In [None]:
test_predicted_tags_noun = []

for tags in test_title_predicted_tags:
    pos_tagged = nltk.pos_tag(tags.split())
    noun_tags = [i[0] for i in pos_tagged if i[1][0] in "N"]
    noun_tags_str = ' '.join(noun_tags)
    test_predicted_tags_noun.append(noun_tags_str)

In [169]:
test_predicted_tags_noun[:20]

['spin subatomic-particles',
 'explanation string-theory',
 'lie representations particle-physics',
 '',
 '',
 '',
 'experiment',
 'sky',
 'day night',
 'energy particle collisions',
 'monte-carlo',
 'help bicycle',
 'electromagnetic-field',
 'difference interaction quantum-mechanics',
 'speed',
 'explanation relativity',
 'coriolis vortex sink',
 'energy magnets',
 'einstein-equations correspondence',
 'field-theories mathematics']

In [174]:
from collections import Counter

all_tags = []
for tags in test_predicted_tags_noun:
    all_tags.extend(tags.split())

counter = Counter(all_tags)

In [175]:
counter.most_common(100)

[('question', 19422),
 ('time', 15321),
 ('energy', 14626),
 ('equation', 10696),
 ('mass', 10409),
 ('case', 9593),
 ('example', 9291),
 ('problem', 9245),
 ('field', 9059),
 ('point', 8701),
 ('particle', 7381),
 ('space', 7306),
 ('work', 7303),
 ('force', 7150),
 ('physics', 7055),
 ('velocity', 6575),
 ('state', 6119),
 ('particles', 5931),
 ('gravity', 5846),
 ('theory', 5466),
 ('distance', 5337),
 ('water', 5280),
 ('change', 5261),
 ('understand', 5080),
 ('speed', 5071),
 ('terms', 4827),
 ('difference', 4781),
 ('temperature', 4717),
 ('function', 4666),
 ('direction', 4615),
 ('form', 4575),
 ('surface', 4543),
 ('electrons', 4452),
 ('calculate', 4362),
 ('charge', 4361),
 ('equations', 4331),
 ('states', 4205),
 ('matter', 4202),
 ('answer', 4190),
 ('fact', 4010),
 ('motion', 3978),
 ('model', 3958),
 ('book', 3957),
 ('order', 3945),
 ('momentum', 3911),
 ('earth', 3902),
 ('result', 3868),
 ('acceleration', 3775),
 ('help', 3744),
 ('sense', 3704),
 ('solution', 3583),

# Output

In [330]:
output_file_name = 'output/w2v_trigram_title_feature5000_noun_correct_trigram.csv'

df_output = df_test[['id']]
df_output['tags'] = test_predicted_tags_noun

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [331]:
df_output.head(20)

Unnamed: 0,id,tags
0,1,spin subatomic-particles
1,2,explanation string-theory
2,3,lie representations particle-physics
3,7,determinism
4,9,hamilton-s-principle
5,13,sound
6,15,experiment
7,17,sky color day sunrise night
8,19,energy particle collisions
9,21,monte-carlo


In [332]:
import csv

df_output.to_csv(output_file_name, index=False, quoting=csv.QUOTE_NONNUMERIC)