In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import FreqDist
from nltk.corpus import brown
from gensim.models import word2vec
import time
from gensim.models import phrases


tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
nltk_stopwords = set(stopwords.words("english"))
# frequency_list = FreqDist(i.lower() for i in brown.words())

In [2]:
len(nltk_stopwords)

153

In [3]:
DATA_FOLDER = 'data/'
MODEL_FOLDER = 'model/'
OUTPUT_FOLDER = 'output/'
TRAIN_FILES = ['biology.csv', 'cooking.csv', 'crypto.csv', 'diy.csv', 'robotics.csv', 'travel.csv']
TEST_FILE = 'test.csv'

In [4]:
DOMAIN_COUNT = len(TRAIN_FILES)

# Load Data

In [5]:
df_trains = [pd.read_csv(DATA_FOLDER + filename) for filename in TRAIN_FILES]

In [6]:
for df in df_trains:
    print(df.shape)

(13196, 4)
(15404, 4)
(10432, 4)
(25918, 4)
(2771, 4)
(19279, 4)


In [7]:
sum([df.shape[0] for df in df_trains])

87000

In [8]:
df_trains[0].head(5)

Unnamed: 0,id,title,content,tags
0,1,What is the criticality of the ribosome bindin...,"<p>In prokaryotic translation, how critical fo...",ribosome binding-sites translation synthetic-b...
1,2,How is RNAse contamination in RNA based experi...,<p>Does anyone have any suggestions to prevent...,rna biochemistry
2,3,Are lymphocyte sizes clustered in two groups?,<p>Tortora writes in <em>Principles of Anatomy...,immunology cell-biology hematology
3,4,How long does antibiotic-dosed LB maintain goo...,<p>Various people in our lab will prepare a li...,cell-culture
4,5,Is exon order always preserved in splicing?,<p>Are there any cases in which the splicing m...,splicing mrna spliceosome introns exons


In [9]:
train_0_tags = df_trains[0]['tags'].tolist()
train_0_tags[:5]

['ribosome binding-sites translation synthetic-biology',
 'rna biochemistry',
 'immunology cell-biology hematology',
 'cell-culture',
 'splicing mrna spliceosome introns exons']

In [10]:
train_0_tag_set = set()
for tags in train_0_tags:
    for tag in tags.split():
        train_0_tag_set.add(tag)
print(len(train_0_tag_set))

train_0_unique_tags = list(train_0_tag_set)

print(train_0_unique_tags[:10])

678
['sex', 'translation', 'exons', 'dopamine', 'cellular-respiration', 'proteins', 'joints', 'mass-spec', 'hiv', 'menstrual-cycle']


In [25]:
train_domains_tags = []

for df in df_trains:
    tags = df['tags'].tolist()
    split_tags = [tag_string.split() for tag_string in tags]
    train_domains_tags.append(split_tags)

In [26]:
train_domains_tags[0][:10]

[['ribosome', 'binding-sites', 'translation', 'synthetic-biology'],
 ['rna', 'biochemistry'],
 ['immunology', 'cell-biology', 'hematology'],
 ['cell-culture'],
 ['splicing', 'mrna', 'spliceosome', 'introns', 'exons'],
 ['dna', 'biochemistry', 'molecular-biology'],
 ['neuroscience', 'synapses'],
 ['plasmids'],
 ['molecular-genetics', 'gene-expression', 'experimental-design'],
 ['evolution', 'mitochondria', 'chloroplasts']]

In [30]:
tag_count_list = []
for domain_tags in train_domains_tags:
    print(np.mean([len(tags) for tags in domain_tags]))
    tag_count_list.extend([len(tags) for tags in domain_tags])

2.510533495
2.30732277331
2.44286809816
2.28138745274
2.35294117647
3.38886871726


In [32]:
len(tag_count_list)

87000

In [31]:
np.mean(tag_count_list)

2.5877931034482757

In [34]:
domains_tag_sets = [set() for i in range(len(df_trains))]

for i in range(len(df_trains)):
    domain_tags = train_domains_tags[i]
    for tags in domain_tags:
        for tag in tags:
            domains_tag_sets[i].add(tag)

In [35]:
domains_tag_sets[0] - domains_tag_sets[1]

{'3d-structure',
 'abiogenesis',
 'action-potential',
 'adaptation',
 'addiction',
 'agriculture',
 'aids',
 'allele',
 'allelopathy',
 'allergies',
 'allometry',
 'altruism',
 'amino-acids',
 'anaerobic-respiration',
 'analgesia',
 'anatomy',
 'anecdotal-evidence',
 'animal-husbandry',
 'animal-models',
 'ant',
 'anthropology',
 'antibiotic-resistance',
 'antibiotics',
 'antibody',
 'antigen',
 'antihistamines',
 'antipredator-adaptation',
 'apoptosis',
 'aquaculture',
 'arachnology',
 'archaea',
 'artificial-life',
 'artificial-selection',
 'asexual-reproduction',
 'aspirin',
 'assay-development',
 'astrobiology',
 'autoimmune',
 'autonomic-nervous-system',
 'autophagy',
 'autoreceptor',
 'auxology',
 'bacterial-toxins',
 'bacteriology',
 'balance',
 'behavior',
 'behaviour',
 'benzodiazepine',
 'bile',
 'binding-sites',
 'bio-mechanics',
 'biochemistry',
 'biodiversity',
 'bioenergetics',
 'biofeedback',
 'biofilms',
 'biogeography',
 'bioinformatics',
 'bioinorganic-chemistry',
 'b

In [11]:
df_test = pd.read_csv(DATA_FOLDER + TEST_FILE)
print(df_test.shape)
df_test.head(10)

(81926, 3)


Unnamed: 0,id,title,content
0,1,What is spin as it relates to subatomic partic...,<p>I often hear about subatomic particles havi...
1,2,What is your simplest explanation of the strin...,<p>How would you explain string theory to non ...
2,3,"Lie theory, Representations and particle physics",<p>This is a question that has been posted at ...
3,7,Will Determinism be ever possible?,<p>What are the main problems that we need to ...
4,9,Hamilton's Principle,<p>Hamilton's principle states that a dynamic ...
5,13,What is sound and how is it produced?,"<p>I've been using the term ""sound"" all my lif..."
6,15,What experiment would disprove string theory?,<p>I know that there's big controversy between...
7,17,Why does the sky change color? Why the sky is ...,<p>Why does the sky change color? Why the sky ...
8,19,How's the energy of particle collisions calcul...,<p>Physicists often refer to the energy of col...
9,21,Monte Carlo use,<p>Where is the Monte Carlo method used in phy...


# Prepare data

In [12]:
test_contents = df_test['content'].tolist()
test_titles = df_test['title'].tolist()
test_titles_contents = (df_test['title'] + ' ' + df_test['content']).tolist()

# Cleanse Data

In [13]:
# remove [n]
# remove pure numbers, don't remove letter+number words, like CO2

# remove stopwords
# consider µL?
# remove formulas


numbers = set('0123456789-')

def is_useful_word(word):
    return (word not in useless_words) and (len(word) > 2) and (re.search('[a-z]', word)) and (word[0] not in numbers)
    

def cleanse_html(content):
    content = BeautifulSoup(content, "lxml").get_text()

    # remove urls
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    return content
    

def cleanse_lower_split(content):
    # remove html tags
    content = BeautifulSoup(content, "lxml").get_text()
    
    # remove urls
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    # replace punctuations and numbers with whitespaces
    content = re.sub(r"[^a-zA-Z0-9\-]"," ", content)
        
    # convert to lowercase
    content = content.lower()
    
    words = content.split()
    
    # remove useless words
    # stopwords, pure numbers, short words
    words = [word for word in words if is_useful_word(word)]
    
    return words

# only reserve words.
def cleanse(content):
    # remove html tags
    content = BeautifulSoup(content, "lxml").get_text()
    
    # remove urls
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    # replace punctuations and numbers with whitespaces
    content = re.sub(r"[^a-zA-Z0-9\-]"," ", content)
        
    # convert to lowercase
    content = content.lower()
    
    words = content.split()
    
    # remove useless words
    # stopwords, pure numbers, short words
    words = [word for word in words if is_useful_word(word)]
    
    # concat words to a string
    content = ' '.join(words)
    
    return content


In [14]:
def is_word(s):
    # if s start with english letter 
    return (re.search('[a-z]', s[0]))

def cleanse_and_split_sentence(s):
    # input a sentence, output a list of words
    s = re.sub(r"[^a-zA-Z0-9]"," ", s)
    word_seq = s.lower().split()
    word_seq = [word for word in word_seq if is_word(word)]
        
    return word_seq
    
# cleanse and cut sentences
def cleanse_content_for_w2v(content):
    # output format: [['i', 'don', 't', 'have', 'an', 'apple-pen'], ['there', 's', 'co2', 'in']]
    content = BeautifulSoup(content, "lxml").get_text()
    
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    sentences = tokenizer.tokenize(content)
    
    word_seqs = []
    for s in sentences:
        word_seq = cleanse_and_split_sentence(s)
        # if len(word_seq) >= 3:
        word_seqs.append(word_seq)
    
    return word_seqs

# not removing stopwords here
def cleanse_content_for_tfidf(content):
    # output format: [['i', 'don', 't', 'have', 'an', 'apple-pen'], ['there', 's', 'co2', 'in']]
    content = BeautifulSoup(content, "lxml").get_text()
    
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    word_seq = cleanse_and_split_sentence(content)
    
    return word_seq

In [15]:
content = "I don't 'have' an apple-pen. There's 256 CO2 in 10km^3!"
cleanse_content_for_w2v(content)

[['i', 'don', 't', 'have', 'an', 'apple', 'pen'], ['there', 's', 'co2', 'in']]

In [16]:
for content in test_contents[:3]:
    print(cleanse_content_for_w2v(content))
    print('----')

[['i', 'often', 'hear', 'about', 'subatomic', 'particles', 'having', 'a', 'property', 'called', 'spin', 'but', 'also', 'that', 'it', 'doesn', 't', 'actually', 'relate', 'to', 'spinning', 'about', 'an', 'axis', 'like', 'you', 'would', 'think'], ['which', 'particles', 'have', 'spin'], ['what', 'does', 'spin', 'mean', 'if', 'not', 'an', 'actual', 'spinning', 'motion']]
----
[['how', 'would', 'you', 'explain', 'string', 'theory', 'to', 'non', 'physicists', 'such', 'as', 'myself'], ['i', 'm', 'specially', 'interested', 'on', 'how', 'plausible', 'is', 'it', 'and', 'what', 'is', 'needed', 'to', 'successfully', 'prove', 'it']]
----
[['this', 'is', 'a', 'question', 'that', 'has', 'been', 'posted', 'at', 'many', 'different', 'forums', 'i', 'thought', 'maybe', 'someone', 'here', 'would', 'have', 'a', 'better', 'or', 'more', 'conceptual', 'answer', 'than', 'i', 'have', 'seen', 'before', 'why', 'do', 'physicists', 'care', 'about', 'representations', 'of', 'lie', 'groups'], ['for', 'myself', 'when',

# Prepare w2v training data

In [17]:
def print_line(lst):
    for s in lst:
        print(s)

In [18]:
cleansed_train_domains_titles = []
for df in df_trains:
    word_seqs = []
    for title in df['title'].tolist():
        word_seqs.extend(cleanse_content_for_w2v(title))
    cleansed_train_domains_titles.append(word_seqs)

cleansed_train_domains_contents = []
for df in df_trains:
    word_seqs = []
    for content in df['content'].tolist():
        word_seqs.extend(cleanse_content_for_w2v(content))
    cleansed_train_domains_contents.append(word_seqs)

In [19]:
for cleansed_train_titles in cleansed_train_domains_titles:
    print(len(cleansed_train_titles))
print()
for cleansed_train_contents in cleansed_train_domains_contents:
    print(len(cleansed_train_contents))
print(cleansed_train_domains_titles[5][:3])
print(cleansed_train_domains_contents[5][:3])

13518
15786
10614
26574
2824
20001

69511
82281
69281
180431
19551
104301
[['what', 'are', 'some', 'caribbean', 'cruises', 'for', 'october'], ['how', 'can', 'i', 'find', 'a', 'guide', 'that', 'will', 'take', 'me', 'safely', 'through', 'the', 'amazon', 'jungle'], ['does', 'singapore', 'airlines', 'offer', 'any', 'reward', 'seats', 'on', 'their', 'ewr', 'sin', 'route']]
[['my', 'fianc', 'e', 'and', 'i', 'are', 'looking', 'for', 'a', 'good', 'caribbean', 'cruise', 'in', 'october', 'and', 'were', 'wondering', 'which', 'islands', 'are', 'best', 'to', 'see', 'and', 'which', 'cruise', 'line', 'to', 'take'], ['it', 'seems', 'like', 'a', 'lot', 'of', 'the', 'cruises', 'don', 't', 'run', 'in', 'this', 'month', 'due', 'to', 'hurricane', 'season', 'so', 'i', 'm', 'looking', 'for', 'other', 'good', 'options'], ['edit', 'we', 'll', 'be', 'travelling', 'in']]


In [20]:
df = df_test

cleansed_test_titles = []
for title in df['title'].tolist():
    cleansed_test_titles.extend(cleanse_content_for_w2v(title))

cleansed_test_contents = []
for content in df['content'].tolist():
    cleansed_test_contents.extend(cleanse_content_for_w2v(content))

In [21]:
print(len(cleansed_test_titles))
print(len(cleansed_test_contents))
print(cleansed_test_titles[:3])
print(cleansed_test_contents[:3])

83757
494470
[['what', 'is', 'spin', 'as', 'it', 'relates', 'to', 'subatomic', 'particles'], ['what', 'is', 'your', 'simplest', 'explanation', 'of', 'the', 'string', 'theory'], ['lie', 'theory', 'representations', 'and', 'particle', 'physics']]
[['i', 'often', 'hear', 'about', 'subatomic', 'particles', 'having', 'a', 'property', 'called', 'spin', 'but', 'also', 'that', 'it', 'doesn', 't', 'actually', 'relate', 'to', 'spinning', 'about', 'an', 'axis', 'like', 'you', 'would', 'think'], ['which', 'particles', 'have', 'spin'], ['what', 'does', 'spin', 'mean', 'if', 'not', 'an', 'actual', 'spinning', 'motion']]


In [22]:
cleansed_all_w2v = []
cleansed_train_titles_contents_w2v = []
cleansed_test_titles_contents_w2v = []

for cleansed_train_titles in cleansed_train_domains_titles:
    cleansed_train_titles_contents_w2v.extend(cleansed_train_titles)
for cleansed_train_contents in cleansed_train_domains_contents:
    cleansed_train_titles_contents_w2v.extend(cleansed_train_contents)
    
cleansed_test_titles_contents_w2v.extend(cleansed_test_titles)
cleansed_test_titles_contents_w2v.extend(cleansed_test_contents)

cleansed_all_w2v.extend(cleansed_train_titles_contents_w2v)
cleansed_all_w2v.extend(cleansed_test_titles_contents_w2v)

In [23]:
train_sentence_count = len(cleansed_train_titles_contents_w2v)
test_sentence_count = len(cleansed_test_titles_contents_w2v)
all_sentence_count = len(cleansed_all_w2v)
print(train_sentence_count)
print(test_sentence_count)
print(all_sentence_count)

614673
578227
1192900


In [24]:
df = df_test

cleansed_test_titles_tfidf = []
for title in df['title'].tolist():
    cleansed_test_titles_tfidf.append(cleanse_content_for_tfidf(title))

print(len(cleansed_test_titles_tfidf))
cleansed_test_titles_tfidf[:10]

81926


[['what', 'is', 'spin', 'as', 'it', 'relates', 'to', 'subatomic', 'particles'],
 ['what',
  'is',
  'your',
  'simplest',
  'explanation',
  'of',
  'the',
  'string',
  'theory'],
 ['lie', 'theory', 'representations', 'and', 'particle', 'physics'],
 ['will', 'determinism', 'be', 'ever', 'possible'],
 ['hamilton', 's', 'principle'],
 ['what', 'is', 'sound', 'and', 'how', 'is', 'it', 'produced'],
 ['what', 'experiment', 'would', 'disprove', 'string', 'theory'],
 ['why',
  'does',
  'the',
  'sky',
  'change',
  'color',
  'why',
  'the',
  'sky',
  'is',
  'blue',
  'during',
  'the',
  'day',
  'red',
  'during',
  'sunrise',
  'set',
  'and',
  'black',
  'during',
  'the',
  'night'],
 ['how', 's', 'the', 'energy', 'of', 'particle', 'collisions', 'calculated'],
 ['monte', 'carlo', 'use']]

### connect phrases by bigram

In [24]:
def connect_phrases_bigram(word_seqs):
    bigram_transformer = phrases.Phrases(word_seqs, min_count=20, threshold=1)
    word_seqs_phrased_bigram = []
    for word_seq in word_seqs:
        word_seq_phrased = bigram_transformer[word_seq]
        word_seq_phrased = [re.sub(r'_', '-', word) for word in word_seq_phrased]
        word_seqs_phrased_bigram.append(word_seq_phrased)
    
    return bigram_transformer, word_seqs_phrased_bigram

In [55]:
def connect_phrases_with_bigram(word_seqs, bigram_transformer):
    word_seqs_phrased_bigram = []
    for word_seq in word_seqs:
        word_seq_phrased = bigram_transformer[word_seq]
        word_seq_phrased = [re.sub(r'_', '-', word) for word in word_seq_phrased]
        word_seqs_phrased_bigram.append(word_seq_phrased)
    return word_seqs_phrased_bigram

In [138]:
# test_bigram, test_phrased_bigram = connect_phrases_bigram(cleansed_test_titles_contents_w2v)

In [146]:
# print_line(test_phrased_bigram[100:110])

['is', 'electricity', 'instantaneous']
['how-can', 'i', 'determine', 'transmission', 'reflection', 'coefficients', 'for', 'light']
['the', 'principle-behind', 'door', 'peepholes']
['physics', 'and', 'computer-science']
['why-don', 't', 'electric', 'fish', 'shock', 'themselves']
['why', 'were', 'the', 'si', 'base', 'quantities', 'chosen', 'as', 'such']
['how-can', 'i', 'measure', 'the', 'mass', 'of', 'the', 'earth', 'at-home']
['home', 'experiments', 'to-derive', 'the', 'speed-of', 'light']
['symmetrical', 'twin-paradox']
['getting', 'started', 'self-studying', 'general-relativity']


In [141]:
# test_bigram.save('test_bigram_count20_threshold1')

In [142]:
# test_bigram_vocab_dict = dict(test_bigram.vocab)

In [143]:
# import operator
# sorted_test_bigram_vocab_dict = sorted(test_bigram_vocab_dict.items(), key=operator.itemgetter(1), reverse=True)

In [147]:
# sorted_test_bigram_vocab_dict[100:110]

[(b'no', 15814),
 (b'system', 15786),
 (b'could', 15610),
 (b'd', 15509),
 (b'particle', 15340),
 (b'c', 15248),
 (b'get', 15199),
 (b'for_the', 15194),
 (b'its', 15107),
 (b'what_is', 14863)]

## Remove stopwords then phrase

In [67]:
with open('model/stopwords.txt') as f:
    long_stopwords = f.read().splitlines()

stopwords = set(long_stopwords) | nltk_stopwords

for word in list(stopwords):
    for subword in re.sub(r"[^a-z]", ' ', word).split():
        stopwords.add(subword)
    stopwords.add(re.sub(r"'", '-', word))

In [68]:
len(stopwords)

722

In [69]:
def remove_stopwords(word_seqs):
    removed = []
    for word_seq in word_seqs:
        removed.append([word for word in word_seq if word not in stopwords])
    return removed

In [30]:
cleansed_test_titles_contents_no_stopwords = remove_stopwords(cleansed_test_titles_contents_w2v)

In [31]:
cleansed_test_titles_contents_no_stopwords[:5]

[['spin', 'relates', 'subatomic', 'particles'],
 ['simplest', 'explanation', 'string', 'theory'],
 ['lie', 'theory', 'representations', 'particle', 'physics'],
 ['determinism'],
 ['hamilton', 'principle']]

In [32]:
test_bigram_nostop, test_phrased_bigram_nostop = connect_phrases_bigram(cleansed_test_titles_contents_no_stopwords)

In [63]:
test_phrased_bigram_nostop[:30]

[['spin', 'relates', 'subatomic-particles'],
 ['simplest', 'explanation', 'string-theory'],
 ['lie', 'theory', 'representations', 'particle-physics'],
 ['determinism'],
 ['hamilton-principle'],
 ['sound-produced'],
 ['experiment', 'disprove', 'string-theory'],
 ['sky', 'change-color'],
 ['sky-blue', 'day', 'red', 'sunrise', 'set', 'black', 'night'],
 ['energy', 'particle', 'collisions', 'calculated'],
 ['monte-carlo'],
 ['leaning', 'banking', 'help', 'turning', 'bicycle'],
 ['velocity-object', 'electromagnetic-field'],
 ['difference', 'measurement', 'interaction', 'quantum-mechanics'],
 ['calculate-average', 'speed'],
 ['lay', 'explanation', 'special-theory', 'relativity'],
 ['coriolis', 'irrelevant', 'whirl', 'vortex', 'sink', 'bathtub'],
 ['magnets', 'energy', 'repel'],
 ['check', 'einstein-equations', 'correspondence', 'real'],
 ['impressions', 'topological', 'field-theories', 'mathematics'],
 ['capacitive', 'screen', 'sensing'],
 ['magnets', 'spin', 'positioned', 'precisely'],
 ['l

In [66]:
cleansed_test_titles_tfidf[:10]

[['what', 'is', 'spin', 'as', 'it', 'relates', 'to', 'subatomic', 'particles'],
 ['what',
  'is',
  'your',
  'simplest',
  'explanation',
  'of',
  'the',
  'string',
  'theory'],
 ['lie', 'theory', 'representations', 'and', 'particle', 'physics'],
 ['will', 'determinism', 'be', 'ever', 'possible'],
 ['hamilton', 's', 'principle'],
 ['what', 'is', 'sound', 'and', 'how', 'is', 'it', 'produced'],
 ['what', 'experiment', 'would', 'disprove', 'string', 'theory'],
 ['why',
  'does',
  'the',
  'sky',
  'change',
  'color',
  'why',
  'the',
  'sky',
  'is',
  'blue',
  'during',
  'the',
  'day',
  'red',
  'during',
  'sunrise',
  'set',
  'and',
  'black',
  'during',
  'the',
  'night'],
 ['how', 's', 'the', 'energy', 'of', 'particle', 'collisions', 'calculated'],
 ['monte', 'carlo', 'use']]

In [70]:
cleansed_test_titles_tfidf_nostop = remove_stopwords(cleansed_test_titles_tfidf)

In [76]:
cleansed_test_titles_tfidf_nostop[:10]

[['spin', 'relates', 'subatomic', 'particles'],
 ['simplest', 'explanation', 'string', 'theory'],
 ['lie', 'theory', 'representations', 'particle', 'physics'],
 ['determinism'],
 ['hamilton', 'principle'],
 ['sound', 'produced'],
 ['experiment', 'disprove', 'string', 'theory'],
 ['sky',
  'change',
  'color',
  'sky',
  'blue',
  'day',
  'red',
  'sunrise',
  'set',
  'black',
  'night'],
 ['energy', 'particle', 'collisions', 'calculated'],
 ['monte', 'carlo']]

In [77]:
# cleansed_test_titles_tfidf

cleansed_test_titles_tfidf_nostop_phrased = connect_phrases_with_bigram(cleansed_test_titles_tfidf_nostop, test_bigram_nostop)

In [78]:
len(cleansed_test_titles_tfidf_nostop_phrased)

81926

In [79]:
cleansed_test_titles_tfidf_nostop_phrased[:10]

[['spin', 'relates', 'subatomic-particles'],
 ['simplest', 'explanation', 'string-theory'],
 ['lie', 'theory', 'representations', 'particle-physics'],
 ['determinism'],
 ['hamilton-principle'],
 ['sound-produced'],
 ['experiment', 'disprove', 'string-theory'],
 ['sky',
  'change-color',
  'sky-blue',
  'day',
  'red',
  'sunrise',
  'set',
  'black',
  'night'],
 ['energy', 'particle', 'collisions', 'calculated'],
 ['monte-carlo']]

# Trigram

In [46]:
def connect_phrases(word_seqs):
    bigram_transformer = phrases.Phrases(word_seqs)
    word_seqs_phrased_bigram = []
    for word_seq in word_seqs:
        word_seq_phrased = bigram_transformer[word_seq]
        word_seq_phrased = [re.sub(r'_', '-', word) for word in word_seq_phrased]
        word_seqs_phrased_bigram.append(word_seq_phrased)
    
    trigram_transformer = phrases.Phrases(word_seqs_phrased_bigram)
    word_seqs_phrased_trigram = []
    for word_seq in word_seqs_phrased_bigram:
        word_seq_phrased = trigram_transformer[word_seq]
        word_seq_phrased = [re.sub(r'_', '-', word) for word in word_seq_phrased]
        word_seqs_phrased_trigram.append(word_seq_phrased)
    return bigram_transformer, trigram_transformer, word_seqs_phrased_bigram, word_seqs_phrased_trigram

In [47]:
all_bigram, all_trigram_transformer, all_phrased_bigram, all_phrased_trigram = connect_phrases(cleansed_all_w2v)

In [57]:
print_line(all_phrased_trigram[train_sentence_count:train_sentence_count+10])

['what', 'is', 'spin', 'as', 'it', 'relates', 'to', 'subatomic-particles']
['what', 'is', 'your', 'simplest', 'explanation', 'of', 'the', 'string-theory']
['lie', 'theory', 'representations', 'and', 'particle', 'physics']
['will', 'determinism', 'be', 'ever', 'possible']
['hamilton-s-principle']
['what', 'is', 'sound', 'and', 'how', 'is', 'it', 'produced']
['what', 'experiment', 'would', 'disprove-string-theory']
['why', 'does', 'the', 'sky', 'change', 'color']
['why', 'the', 'sky', 'is', 'blue', 'during', 'the', 'day', 'red', 'during', 'sunrise', 'set', 'and', 'black', 'during', 'the', 'night']
['how', 's', 'the', 'energy', 'of', 'particle', 'collisions', 'calculated']


In [53]:
test_bigram, test_trigram, test_phrased_bigram, test_phrased_trigram = connect_phrases(cleansed_test_titles_contents_w2v)

In [176]:
# print_line(test_phrased_bigram[:20])

In [177]:
# print_line(test_phrased_trigram[:20])

In [59]:
test_bigram_vocab = dict(test_bigram.vocab)

In [66]:
test_bigram_vocab_words = list(test_bigram_vocab.keys())

In [67]:
len(test_bigram_vocab_words)

1410768

In [71]:
test_bigram_vocab_words[100000:100000+10]

[b'photograph_was',
 b'visual_aspect',
 b't_torus',
 b'metric_correspond',
 b'fresnel_it',
 b'desperate_hand',
 b'made_thick',
 b'rearranging_s',
 b'generate_pulse',
 b'they_know']

In [226]:
cleansed_test_titles_for_tfidf = [cleanse_content_for_tfidf(title) for title in df_test['title'].tolist()]

In [227]:
cleansed_test_titles_for_tfidf[:5]

[['what', 'is', 'spin', 'as', 'it', 'relates', 'to', 'subatomic', 'particles'],
 ['what',
  'is',
  'your',
  'simplest',
  'explanation',
  'of',
  'the',
  'string',
  'theory'],
 ['lie', 'theory', 'representations', 'and', 'particle', 'physics'],
 ['will', 'determinism', 'be', 'ever', 'possible'],
 ['hamilton', 's', 'principle']]

In [304]:
cleansed_test_titles_bigram = connect_phrases_bigram(cleansed_test_titles_for_tfidf)
cleansed_test_titles_trigram = connect_phrases_trigram(cleansed_test_titles_bigram)

In [305]:
for ws in cleansed_test_titles_trigram[:20]:
    print(ws)

['what', 'is', 'spin', 'as', 'it', 'relates', 'to', 'subatomic-particles']
['what', 'is', 'your', 'simplest', 'explanation', 'of', 'the', 'string-theory']
['lie', 'theory', 'representations', 'and', 'particle-physics']
['will', 'determinism', 'be', 'ever', 'possible']
['hamilton-s-principle']
['what', 'is', 'sound', 'and', 'how', 'is', 'it', 'produced']
['what', 'experiment', 'would', 'disprove-string-theory']
['why', 'does', 'the', 'sky', 'change', 'color', 'why', 'the', 'sky', 'is', 'blue', 'during', 'the', 'day', 'red', 'during', 'sunrise', 'set', 'and', 'black', 'during', 'the', 'night']
['how', 's', 'the', 'energy', 'of', 'particle', 'collisions', 'calculated']
['monte-carlo', 'use']
['does', 'leaning', 'banking', 'help', 'cause', 'turning', 'on', 'a', 'bicycle']
['velocity', 'of', 'object', 'from', 'electromagnetic-field']
['what', 'is', 'the', 'difference-between', 'a', 'measurement', 'and', 'any', 'other', 'interaction', 'in', 'quantum-mechanics']
['how', 'to', 'calculate', 'av

In [229]:
cleansed_test_contents_bigram = connect_phrases_bigram(cleansed_test_contents)
cleansed_test_contents_trigram = connect_phrases_trigram(cleansed_test_contents_bigram)

# phrase then tfidf

In [None]:
# cleansed_test_titles_trigram
# cleansed_test_contents_trigram

In [171]:
with open('model/stopwords.txt') as f:
    long_stopwords = f.read().splitlines()

temp_stopwords = set('nan sky change color circuits wire phase explanation resistance solution solutions source principle properties questions calculation mechanics confusion determine car states formula direction fields object terms law quantum equations surface body spin measure form operator objects relationship field difference theory motion calculate relation objects difference question problem change point number size help equation proof physics state function derivation'.split())
    
stopwords = set(long_stopwords) | nltk_stopwords | temp_stopwords

for word in list(stopwords):
    for subword in re.sub(r"[^a-z]", ' ', word).split():
        stopwords.add(subword)
    stopwords.add(re.sub(r"'", '-', word))

In [104]:
# cleansed_test_titles_tfidf_nostop_phrased

In [105]:
test_title_for_tfidf = []
for word_seq in cleansed_test_titles_tfidf_nostop_phrased:
    test_title_for_tfidf.append( ' '.join([word for word in word_seq if word not in stopwords]) )

In [106]:
test_title_for_tfidf[:10]

['relates subatomic-particles',
 'simplest string-theory',
 'lie representations particle-physics',
 'determinism',
 'hamilton-principle',
 'sound-produced',
 'experiment disprove string-theory',
 'sky change-color sky-blue day red sunrise set black night',
 'energy particle collisions calculated',
 'monte-carlo']

In [107]:
def predict_tags_from_cleansed_contents(cleansed_contents, max_features):
    vectorizer = TfidfVectorizer(use_idf=False, stop_words = "english", \
                                 tokenizer = str.split,    \
                                 preprocessor = None, \
                                 max_features = max_features)
    features = vectorizer.fit_transform(cleansed_contents)
    feature_words = vectorizer.get_feature_names()
    
    predicted_tags = []

    for feature in features:
        values = feature.data
        _, word_idxs = feature.nonzero()

        index_sorted = np.argsort(values)[::-1]
        keywords = [feature_words[wid] for wid in word_idxs[index_sorted]]
        predicted_tags.append(' '.join(keywords))
    
    return predicted_tags

In [116]:
test_title_predicted_tags = predict_tags_from_cleansed_contents(test_title_for_tfidf, 5000)

In [117]:
test_predicted_tags_noun = []

for tags in test_title_predicted_tags:
    pos_tagged = nltk.pos_tag(tags.split())
    noun_tags = [i[0] for i in pos_tagged if i[1][0] in "N"]
    noun_tags_str = ' '.join(noun_tags)
    test_predicted_tags_noun.append(noun_tags_str)

In [118]:
test_predicted_tags_noun[:20]

['subatomic-particles',
 'string-theory',
 'representations particle-physics',
 'determinism',
 'hamilton-principle',
 '',
 'experiment disprove',
 'day night',
 'energy particle collisions',
 'monte-carlo',
 'bicycle',
 'electromagnetic-field',
 'interaction quantum-mechanics',
 'speed',
 'relativity',
 'coriolis sink',
 'energy magnets',
 'einstein-equations correspondence',
 'field-theories mathematics',
 'screen']

In [119]:
len(test_predicted_tags_noun)

81926

In [120]:
from collections import Counter

all_tags = []
for tags in test_predicted_tags_noun:
    all_tags.extend(tags.split())

counter = Counter(all_tags)

In [121]:
counter.most_common(100)

[('energy', 1960),
 ('time', 1354),
 ('mass', 1179),
 ('gravity', 1023),
 ('work', 1018),
 ('force', 941),
 ('space', 843),
 ('particle', 807),
 ('water', 772),
 ('velocity', 707),
 ('temperature', 653),
 ('speed', 622),
 ('particles', 603),
 ('quantum-mechanics', 601),
 ('charge', 562),
 ('acceleration', 539),
 ('momentum', 531),
 ('model', 505),
 ('earth', 499),
 ('photons', 482),
 ('electrons', 475),
 ('distance', 474),
 ('photon', 463),
 ('pressure', 461),
 ('matter', 438),
 ('electron', 437),
 ('air', 422),
 ('heat', 396),
 ('power', 390),
 ('definition', 368),
 ('vacuum', 357),
 ('radiation', 349),
 ('forces', 345),
 ('frequency', 345),
 ('density', 337),
 ('tensor', 319),
 ('vector', 319),
 ('gas', 318),
 ('symmetry', 318),
 ('general-relativity', 313),
 ('friction', 306),
 ('special-relativity', 288),
 ('interaction', 287),
 ('sun', 283),
 ('waves', 281),
 ('entropy', 277),
 ('black-holes', 276),
 ('relativity', 271),
 ('thermodynamics', 271),
 ('operators', 269),
 ('atoms', 26

# Output

In [122]:
output_file_name = 'output/20161229_w2v_bigram_title_feature5000_noun_remove-common.csv'

df_output = df_test[['id']]
df_output['tags'] = test_predicted_tags_noun

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [123]:
df_output.head(20)

Unnamed: 0,id,tags
0,1,subatomic-particles
1,2,string-theory
2,3,representations particle-physics
3,7,determinism
4,9,hamilton-principle
5,13,
6,15,experiment disprove
7,17,day night
8,19,energy particle collisions
9,21,monte-carlo


In [124]:
import csv

df_output.to_csv(output_file_name, index=False, quoting=csv.QUOTE_NONNUMERIC)

# change output file

In [172]:
input_file = 'output/w2v_trigram_title_feature5000_noun_2.csv'
df_input = pd.read_csv(input_file)

In [173]:
df_input.head()

Unnamed: 0,id,tags
0,1,spin subatomic-particles
1,2,explanation string-theory
2,3,lie theory representations physics
3,7,determinism
4,9,principle


In [174]:
input_tags = df_input['tags'].astype(str).tolist()

In [175]:
transform_dict = {}

In [176]:
def transform(word):
    if word in transform_dict:
        return transform_dict[word]
    return word

In [177]:
updated_tags = []
for tags in input_tags:
    updated_tags.append(' '.join([transform(tag) for tag in tags.split() if tag not in stopwords]))

In [178]:
updated_tags[:30]

['subatomic-particles',
 'string-theory',
 'lie representations',
 'determinism',
 '',
 'sound',
 'experiment disprove',
 'day sunrise night',
 'particles energy collisions',
 'monte-carlo',
 'bicycle',
 'velocity electromagnetic-field',
 'interaction quantum-mechanics',
 'speed',
 'relativity',
 'vortex sink',
 'energy magnets',
 'check real-world',
 'field-theories mathematics',
 'screen',
 'magnets',
 'lhc',
 'materials stress',
 'intuitive-explanation',
 'proton treatment',
 'physicists',
 'materials',
 'neutrons',
 'interaction quantum-entanglement',
 'light']

In [179]:
len(updated_tags)

81926

In [180]:
output_file_name = 'output/w2v_trigram_title_feature5000_noun_2_remove-common_plural.csv'

df_output = df_test[['id']]
df_output['tags'] = updated_tags

df_output.to_csv(output_file_name, index=False, quoting=csv.QUOTE_NONNUMERIC)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
