In [1]:
# TODO: Remove this
# https://www.kaggle.com/sudalairajkumar/transfer-learning-on-stack-exchange-tags/frequent-words-model-v3/code

In [2]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import FreqDist
from nltk.corpus import brown

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stops = set(stopwords.words("english"))
frequency_list = FreqDist(i.lower() for i in brown.words())

In [3]:
len(stops)

153

In [4]:
DATA_FOLDER = 'data/'
TRAIN_FILES = ['biology.csv', 'cooking.csv', 'crypto.csv', 'diy.csv', 'robotics.csv', 'travel.csv']
TEST_FILE = 'test.csv'

In [5]:
DOMAIN_COUNT = len(TRAIN_FILES)

# Load Data

In [6]:
df_trains = [pd.read_csv(DATA_FOLDER + filename) for filename in TRAIN_FILES]

In [7]:
for df in df_trains:
    print(df.shape)

(13196, 4)
(15404, 4)
(10432, 4)
(25918, 4)
(2771, 4)
(19279, 4)


In [8]:
sum([df.shape[0] for df in df_trains])

87000

In [9]:
df_trains[0].head()

Unnamed: 0,id,title,content,tags
0,1,What is the criticality of the ribosome bindin...,"<p>In prokaryotic translation, how critical fo...",ribosome binding-sites translation synthetic-b...
1,2,How is RNAse contamination in RNA based experi...,<p>Does anyone have any suggestions to prevent...,rna biochemistry
2,3,Are lymphocyte sizes clustered in two groups?,<p>Tortora writes in <em>Principles of Anatomy...,immunology cell-biology hematology
3,4,How long does antibiotic-dosed LB maintain goo...,<p>Various people in our lab will prepare a li...,cell-culture
4,5,Is exon order always preserved in splicing?,<p>Are there any cases in which the splicing m...,splicing mrna spliceosome introns exons


In [21]:
train_tags = df_trains[0]['tags'].tolist()
train_tags[:20]

['ribosome binding-sites translation synthetic-biology',
 'rna biochemistry',
 'immunology cell-biology hematology',
 'cell-culture',
 'splicing mrna spliceosome introns exons',
 'dna biochemistry molecular-biology',
 'neuroscience synapses',
 'plasmids',
 'molecular-genetics gene-expression experimental-design',
 'evolution mitochondria chloroplasts',
 'high-throughput cell-based',
 'molecular-biology synthetic-biology',
 'bioinformatics homework',
 'neuroscience immunology',
 'splicing histone',
 'genomics gene-annotation exons',
 'microbiology virology influenza',
 'epigenetics',
 'molecular-biology dna-isolation',
 'cell-membrane adaptation cell-biology']

In [11]:
df_test = pd.read_csv(DATA_FOLDER + TEST_FILE)
print(df_test.shape)
df_test.head(10)

(81926, 3)


Unnamed: 0,id,title,content
0,1,What is spin as it relates to subatomic partic...,<p>I often hear about subatomic particles havi...
1,2,What is your simplest explanation of the strin...,<p>How would you explain string theory to non ...
2,3,"Lie theory, Representations and particle physics",<p>This is a question that has been posted at ...
3,7,Will Determinism be ever possible?,<p>What are the main problems that we need to ...
4,9,Hamilton's Principle,<p>Hamilton's principle states that a dynamic ...
5,13,What is sound and how is it produced?,"<p>I've been using the term ""sound"" all my lif..."
6,15,What experiment would disprove string theory?,<p>I know that there's big controversy between...
7,17,Why does the sky change color? Why the sky is ...,<p>Why does the sky change color? Why the sky ...
8,19,How's the energy of particle collisions calcul...,<p>Physicists often refer to the energy of col...
9,21,Monte Carlo use,<p>Where is the Monte Carlo method used in phy...


# Cleanse Data

In [12]:
commmon_words = set()
for word, frequent in frequency_list.most_common()[:20000]:
    commmon_words.add(word)

wrong_tags = set(('edit update dont thats video videos btw tricky'
+ ' google googling websites website online wiki wikipedia internet'
+ ' homework exam arxiv stackexchange cancelled basics approx'
+ ' beginner colour colours bike bicycle textbooks info youtube com'
+ ' whats pdf physicist physicists mathematician mathematicians ref').split())
    
useless_words = commmon_words | stops | wrong_tags
print(len(useless_words))

20054


In [13]:
# remove [n]
# remove pure numbers, don't remove letter+number words, like CO2

# remove stopwords
# consider µL?
# remove formulas


numbers = set('0123456789-')

def is_useful_word(word):
    return (word not in useless_words) and (len(word) > 2) and (re.search('[a-z]', word)) and (word[0] not in numbers)
    

def cleanse_html(content):
    content = BeautifulSoup(content, "lxml").get_text()

    # remove urls
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    return content
    

def cleanse_lower_split(content):
    # remove html tags
    content = BeautifulSoup(content, "lxml").get_text()
    
    # remove urls
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    # replace punctuations and numbers with whitespaces
    content = re.sub(r"[^a-zA-Z0-9\-]"," ", content)
        
    # convert to lowercase
    content = content.lower()
    
    words = content.split()
    
    # remove useless words
    # stopwords, pure numbers, short words
    words = [word for word in words if is_useful_word(word)]
    
    return words

# only reserve words.
def cleanse(content):
    # remove html tags
    content = BeautifulSoup(content, "lxml").get_text()
    
    # remove urls
    content = re.sub(r"\S+:/\S+","", content)  # "xxx:/xxx"
    content = re.sub(r"\S+\\\S+","", content)  # "xxx\xxx"
    
    # replace punctuations and numbers with whitespaces
    content = re.sub(r"[^a-zA-Z0-9\-]"," ", content)
        
    # convert to lowercase
    content = content.lower()
    
    words = content.split()
    
    # remove useless words
    # stopwords, pure numbers, short words
    words = [word for word in words if is_useful_word(word)]
    
    # concat words to a string
    content = ' '.join(words)
    
    return content

In [14]:
def cleanse_contents_of_df(df):
    contents = df['content'].tolist()
    cleansed_contents = [cleanse(content) for content in contents]
    return cleansed_contents

def cleanse_titles_of_df(df):
    contents = df['title'].tolist()
    cleansed_contents = [cleanse(content) for content in contents]
    return cleansed_contents

def cleanse_strings(strList):
    cleansed = [cleanse(s) for s in strList]
    return cleansed


# Predict Tags

In [15]:
def predict_tags_from_cleansed_contents(cleansed_contents, max_features):
    vectorizer = TfidfVectorizer(use_idf=False, \
                                 stop_words = "english", \
                                 tokenizer = str.split,    \
                                 preprocessor = None, \
                                 max_features = max_features)
    features = vectorizer.fit_transform(cleansed_contents)
    feature_words = vectorizer.get_feature_names()
    
    predicted_tags = []

    for feature in features:
        values = feature.data
        _, word_idxs = feature.nonzero()

        index_sorted = np.argsort(values)[::-1]
        keywords = [feature_words[wid] for wid in word_idxs[index_sorted]]
        predicted_tags.append(' '.join(keywords))
    
    return predicted_tags

# Experiments - Train

In [16]:
# train_0_cleansed_contents = cleanse_contents_of_df(df_trains[0])

In [17]:
# train_0_predicted_tags = predict_tags_from_cleansed_contents(train_0_cleansed_contents, max_features=1000)

In [18]:
# train_0_predicted_tags[:30]

In [19]:
# train_0_tags = df_trains[0]['tags'].tolist()
# train_0_tags[:30]

# Prepare data

In [28]:
test_contents = df_test['content'].tolist()
test_titles = df_test['title'].tolist()
test_titles_contents = (df_test['title'] + ' ' + df_test['content']).tolist()

# Contents

In [None]:
test_cleansed_contents = cleanse_contents_of_df(df_test)

In [None]:
test_predicted_tags = predict_tags_from_cleansed_contents(test_cleansed_contents, max_features=2000)

In [None]:
test_predicted_tags_noun = []

for tags in test_predicted_tags:
    pos_tagged = nltk.pos_tag(tags.split())
    noun_tags = [i[0] for i in pos_tagged if i[1][0] in "N"]
    noun_tags_str = ' '.join(noun_tags)
    test_predicted_tags_noun.append(noun_tags_str)

In [None]:
test_predicted_tags_noun[:1000]

In [None]:
pos_tagged = nltk.pos_tag('gravitational'.split())
print(pos_tagged)
noun_tags = [i[0] for i in pos_tagged if i[1][0] in "N"]
print(noun_tags)

In [None]:
test_predicted_tags[:2000]

# Titles

In [None]:
test_cleansed_titles = cleanse_titles_of_df(df_test)

In [None]:
test_predicted_tags = predict_tags_from_cleansed_contents(test_cleansed_titles, max_features=2000)

In [None]:
test_predicted_tags_noun = []

for tags in test_predicted_tags:
    pos_tagged = nltk.pos_tag(tags.split())
    noun_tags = [i[0] for i in pos_tagged if i[1][0] in "N"]
    noun_tags_str = ' '.join(noun_tags)
    test_predicted_tags_noun.append(noun_tags_str)

In [None]:
test_predicted_tags_noun[:2000]

# Title + Content

In [30]:
test_cleansed_titles_contents = cleanse_strings(test_titles_contents)

In [33]:
test_predicted_tags = predict_tags_from_cleansed_contents(test_cleansed_titles_contents, max_features=1000)

test_predicted_tags_noun = []

for tags in test_predicted_tags:
    pos_tagged = nltk.pos_tag(tags.split())
    noun_tags = [i[0] for i in pos_tagged if i[1][0] in "N"]
    noun_tags_str = ' '.join(noun_tags)
    test_predicted_tags_noun.append(noun_tags_str)

In [34]:
test_predicted_tags_noun[:2000]

['',
 '',
 '',
 'laplace',
 '',
 '',
 'disprove',
 '',
 'collisions higgs boson',
 'carlo',
 '',
 'projectile gauss',
 'collapses interacts',
 '',
 '',
 'coriolis rotate',
 'magnets',
 '',
 'qft invariants manifolds topology',
 '',
 'magnets',
 'lhc',
 '',
 'laser lasers',
 '',
 '',
 '',
 'neutrons',
 'entanglement photons',
 'laser photon poisson',
 'neutron',
 'tensor approximations',
 'emit',
 'coulomb integrating',
 'laser polarizations poincar',
 'cosmological',
 'isotope mev isotopes',
 'neutron',
 'collider collides',
 'centripetal',
 'formalism',
 'mathematica',
 'photons',
 '',
 'mathematica graphs',
 'higgs boson lhc',
 'neutrons boson fermions',
 'neutrinos',
 'navier-stokes',
 '',
 '',
 'rightarrow',
 '',
 '',
 'projectile calculus algebra',
 'anti-matter',
 'gauss convection',
 'compress',
 '',
 'boson collisions luminosity quark',
 'propeller',
 '',
 'manifolds topology algebras',
 'causality scenarios scenario',
 'photons',
 'neutrons quarks macroscopic',
 'transformatio

# Output

In [35]:
output_file_name = 'output/title-content_common20000_feature1000_noun.csv'

df_output = df_test[['id']]
df_output['tags'] = test_predicted_tags_noun

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [36]:
df_output.head(20)

Unnamed: 0,id,tags
0,1,
1,2,
2,3,
3,7,laplace
4,9,
5,13,
6,15,disprove
7,17,
8,19,collisions higgs boson
9,21,carlo


In [37]:
import csv

df_output.to_csv(output_file_name, index=False, quoting=csv.QUOTE_NONNUMERIC)

# procedure

* cleanse title and contents
* apply tf idf

## try to guess if a word is a keyword

* train the guess function with training data
* input: tf, idf of title, content

## try to guess the tags of a post



In [None]:
# remove [n]
# remove pure numbers, don't remove letter+number words, like CO2
# remove urls

In [None]:
# tf-idf over all contents
# same-category contents concat as one doc

In [None]:
# for each tag, find words that mostly exist in title and contents of that tag
# use entropy to do the above thing
# if a test content contains words only exist in , then the content is very likely about that domain

In [None]:
# as classification problem
# features: bow (normalized)


In [None]:

# stop_words = {'a', "a's", 'able', 'about', 'above', 'according', 'accordingly',
#               'across', 'actually', 'after', 'afterwards', 'again', 'against',
#               "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along',
#               'already', 'also', 'although', 'always', 'am', 'among', 'amongst',
#               'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone',
#               'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear',
#               'appreciate', 'appropriate', 'are', "aren't", 'around', 'as',
#               'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away',
#               'awfully', 'b', 'be', 'became', 'because', 'become', 'becomes',
#               'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
#               'believe', 'below', 'beside', 'besides', 'best', 'better',
#               'between', 'beyond', 'both', 'brief', 'but', 'by', 'c', "c'mon",
#               "c's", 'came', 'can', "can't", 'cannot', 'cant', 'cause',
#               'causes', 'certain', 'certainly', 'changes', 'clearly', 'co',
#               'com', 'come', 'comes', 'concerning', 'consequently', 'consider',
#               'considering', 'contain', 'containing', 'contains',
#               'corresponding', 'could', "couldn't", 'course', 'currently', 'd',
#               'definitely', 'described', 'despite', 'did', "didn't",
#               'different', 'do', 'does', "doesn't", 'doing', "don't", 'done',
#               'down', 'downwards', 'during', 'e', 'each', 'edu', 'eg', 'eight',
#               'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially',
#               'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone',
#               'everything', 'everywhere', 'ex', 'exactly', 'example', 'except',
#               'f', 'far', 'few', 'fifth', 'first', 'five', 'followed',
#               'following', 'follows', 'for', 'former', 'formerly', 'forth',
#               'four', 'from', 'further', 'furthermore', 'g', 'get', 'gets',
#               'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got',
#               'gotten', 'greetings', 'h', 'had', "hadn't", 'happens', 'hardly',
#               'has', "hasn't", 'have', "haven't", 'having', 'he', "he's",
#               'hello', 'help', 'hence', 'her', 'here', "here's", 'hereafter',
#               'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him',
#               'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit',
#               'however', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if',
#               'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed',
#               'indicate', 'indicated', 'indicates', 'inner', 'insofar',
#               'instead', 'into', 'inward', 'is', "isn't", 'it', "it'd", "it'll",
#               "it's", 'its', 'itself', 'j', 'just', 'k', 'keep', 'keeps',
#               'kept', 'know', 'knows', 'known', 'l', 'last', 'lately', 'later',
#               'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's",
#               'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks',
#               'ltd', 'm', 'mainly', 'many', 'may', 'maybe', 'me', 'mean',
#               'meanwhile', 'merely', 'might', 'more', 'moreover', 'most',
#               'mostly', 'much', 'must', 'my', 'myself', 'n', 'name', 'namely',
#               'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither',
#               'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody',
#               'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing',
#               'novel', 'now', 'nowhere', 'o', 'obviously', 'of', 'off', 'often',
#               'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only',
#               'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our',
#               'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own',
#               'p', 'particular', 'particularly', 'per', 'perhaps', 'placed',
#               'please', 'plus', 'possible', 'presumably', 'probably',
#               'provides', 'q', 'que', 'quite', 'qv', 'r', 'rather', 'rd', 're',
#               'really', 'reasonably', 'regarding', 'regardless', 'regards',
#               'relatively', 'respectively', 'right', 's', 'said', 'same', 'saw',
#               'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing',
#               'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves',
#               'sensible', 'sent', 'serious', 'seriously', 'seven', 'several',
#               'shall', 'she', 'should', "shouldn't", 'since', 'six', 'so',
#               'some', 'somebody', 'somehow', 'someone', 'something', 'sometime',
#               'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry',
#               'specified', 'specify', 'specifying', 'still', 'sub', 'such',
#               'sup', 'sure', 't', "t's", 'take', 'taken', 'tell', 'tends', 'th',
#               'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats',
#               'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence',
#               'there', "there's", 'thereafter', 'thereby', 'therefore',
#               'therein', 'theres', 'thereupon', 'these', 'they', "they'd",
#               "they'll", "they're", "they've", 'think', 'third', 'this',
#               'thorough', 'thoroughly', 'those', 'though', 'three', 'through',
#               'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took',
#               'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying',
#               'twice', 'two', 'u', 'un', 'under', 'unfortunately', 'unless',
#               'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used',
#               'useful', 'uses', 'using', 'usually', 'uucp', 'v', 'value',
#               'various', 'very', 'via', 'viz', 'vs', 'w', 'want', 'wants',
#               'was', "wasn't", 'way', 'we', "we'd", "we'll", "we're", "we've",
#               'welcome', 'well', 'went', 'were', "weren't", 'what', "what's",
#               'whatever', 'when', 'whence', 'whenever', 'where', "where's",
#               'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
#               'wherever', 'whether', 'which', 'while', 'whither', 'who',
#               "who's", 'whoever', 'whole', 'whom', 'whose', 'why', 'will',
#               'willing', 'wish', 'with', 'within', 'without', "won't", 'wonder',
#               'would', 'would', "wouldn't", 'x', 'y', 'yes', 'yet', 'you',
#               "you'd", "you'll", "you're", "you've", 'your', 'yours',
#               'yourself', 'yourselves', 'z', 'zero', ''}

# len(stop_words)

In [None]:
start = 30000
for word, frequent in frequency_list.most_common()[start:start + 2000]:
    print(word + ' ', end='')