## Installing required packages
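Install the nltk, pandas, gensim, numpy and scikit-learn packages by running one of the following commands in a terminal or command prompt:
`pip install --upgrade nltk pandas gensim numpy scikit-learn`
or
`conda install nltk pandas gensim numpy scikit-learn`
depending on the Python environment of your choice.
Note: the Word2Vec cells below use the gensim 3.x API (the `size=` argument and the `.vocab` attribute); with gensim 4 or later they must be adapted (for example `size=` becomes `vector_size=`).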

In [2]:
import numpy as np
from nltk.tokenize import WordPunctTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("./train.csv")
# Preview the first rows (last expression of the cell, so it is displayed).
df.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,...,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,...,0.777778,0.777778,0.555556,1.0,1.0,0.666667,0.0,0.333333,1.0,0.888889
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,...,0.888889,0.833333,0.333333,0.833333,1.0,0.8,0.0,0.0,1.0,1.0
4,5,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,leigero,https://graphicdesign.stackexchange.com/users/...,Check out Image Trace in Adobe Illustrator. \n...,q2ra,https://graphicdesign.stackexchange.com/users/...,http://graphicdesign.stackexchange.com/questio...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,1.0,1.0


In [4]:
# Report how many rows were read from the CSV file.
print("Total records in the data frame: {}".format(len(df)))

Total records in the data frame: 6079


In [23]:
# Input (feature) column names: "qa_id" followed by the columns at positions 1-10.
x_columns = ["qa_id"] + df.columns[1:11].tolist()

# Target column names: the columns at positions 11-40.
y_columns = df.columns[11:41].tolist()

print(x_columns)

['qa_id', 'question_title', 'question_body', 'question_user_name', 'question_user_page', 'answer', 'answer_user_name', 'answer_user_page', 'url', 'category', 'host']


In [6]:
def returnColumn(x, start, stop) :
    """Return the items of ``x`` at positions ``start`` up to (not including) ``stop``.

    ``x`` is sliced with ``x[start:stop]`` and the slice is materialised as a
    new ``list``.
    """
    selected = x[start:stop]
    return list(selected)

In [24]:
# Build the feature frame by selecting the input columns directly.
# This replaces a Python loop that rebuilt the frame row by row from df.values;
# column selection is vectorised and .copy() makes X independent of df.
# (The former mid-cell `X.head()` was never displayed and has been dropped.)
X = df[x_columns].copy()
X.shape

(6079, 11)

In [8]:
# Build the target frame by selecting the label columns directly.
# This replaces a Python loop that rebuilt the frame row by row from df.values;
# column selection is vectorised and .copy() makes Y independent of df.
# (The former mid-cell `Y.head()` was never displayed and has been dropped.)
Y = df[y_columns].copy()
Y.shape

(6079, 30)

In [25]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# deterministic, so the split (and everything derived from X_train below) is
# the same on every Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(4863, 11) (4863, 30)
(1216, 11) (1216, 30)


### Remove stop-words from the corpus

In [54]:
from gensim.parsing.preprocessing import remove_stopwords

In [55]:
# Strip stop words from each text column of the training split.
# NOTE(review): remove_stopwords is applied to the original-case text; the
# tokenized sample printed further down still contains words such as 'for' and
# 'the', so capitalised stop words appear to survive — confirm whether the text
# should be lower-cased before this step.
q_title_no_sw = [remove_stopwords(title) for title in X_train["question_title"]]

q_body_no_sw = [remove_stopwords(body) for body in X_train["question_body"]]

answer_no_sw = [remove_stopwords(reply) for reply in X_train["answer"]]

### Create object for word Tokenizer

In [57]:
# Single NLTK WordPunctTokenizer instance, reused by the tokenization cell below.
tokenizer = WordPunctTokenizer()

### Create tokenized lists of question_title, question_body and answer

In [58]:
# Lower-case each stop-word-free text and split it into tokens.
# Each result is a list of token lists, one inner list per document.
tokenized_q_title = [tokenizer.tokenize(text.lower()) for text in q_title_no_sw]

tokenized_q_body = [tokenizer.tokenize(text.lower()) for text in q_body_no_sw]

tokenized_answer = [tokenizer.tokenize(text.lower()) for text in answer_no_sw]

In [59]:
# Sanity check: print the token list of one answer (index 34 is an arbitrary
# sample) to verify that the answer text was tokenized as expected.
print(tokenized_answer[34])

['for', 'novels', 'fiction', 'i', 'prefer', 'e', 'reader', '.', 'for', 'reference', 'i', 'prefer', 'paper', '.', 'why', '?', 'the', 'design', 'teams', 'e', 'readers', 'focus', 'novels', 'assumption', 'going', 'research', 'use', 'computer', '.', 'this', 'problem', 'can', "'", 't', 'fixed', ',', 'hasn', "'", 't', 'been', '.']


In [60]:
# Join the tokens of the same sample answer back into one space-separated
# string, which is easier to read than the raw token list.
print(' '.join(tokenized_answer[34]))

for novels fiction i prefer e reader . for reference i prefer paper . why ? the design teams e readers focus novels assumption going research use computer . this problem can ' t fixed , hasn ' t been .


### Using Word2Vec model for vectorization

In [45]:
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import remove_stopwords

In [53]:

# Strip stop words from every question body in the training split.
# The loop previously iterated over `question_body`, a name that is never
# defined in this notebook, so the cell raised NameError on a fresh kernel.
# It now reads the same column the earlier stop-word cell uses.
q_body_no_sw = [remove_stopwords(body) for body in X_train["question_body"]]

I am trying to understand what kinds of places the spam values on p 231 refer to in the 5th Edition main book for Shadowrun.

Per p 15, a sprawl is a plex, a plex is a "metropolitan complex, short for metroplex". Per Google a metroplex is " a very large metropolitan area, especially one that is an aggregation of two or more cities".  A city downtown and sprawl downtown would tend to have similar densities, but for some reason the sprawl (which includes suburbs?) has a higher spam zone noise rating (p 231).  Similarly, I'd think of a downtown as being more dense and noisy (e.g. Office buildings and street vendors) than a commercial district, e.g. an outdoor mall.  The noise ratings make me think that I am thinking about this incorrectly. What is a better way of thinking of them?

I have command to list system process by memory usage:

ps -A --sort -rss -o comm,pmem


Which list a table like

COMMAND         %MEM
firefox         28.2
chrome           5.4
compiz           4.8
atom        

'I command list process memory usage: ps -A --sort -rss -o comm,pmem Which list table like COMMAND %MEM firefox 28.2 chrome 5.4 compiz 4.8 atom 2.5 chrome 2.3 Xorg 2.3 skype 2.2 chrome 2.0 chrome 1.9 atom 1.9 nautilus 1.8 hud-service 1.5 evince 1.3 I like total memory share programs instead process programs. So I output like COMMAND %MEM firefox 28.2 chrome 11.6 compiz 4.8 atom 4.4 Xorg 2.3 skype 2.2 nautilus 1.8 hud-service 1.5 evince 1.3 I thought awk, I don\'t know much. Ended like: ps -A --sort -rss -o comm,pmem | awk -F "\\t" \' {processes[$0] += $1;} {End for(i processes) { print i,"\\t",processes[i]; } }\' But didn\'t work. How I correct this?'

In [44]:
# Train Word2Vec on the tokenized question bodies only and keep just the word
# vectors (.wv). The vectors are 40-dimensional (see the array printed below).
# NOTE(review): `size=` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4) — confirm against the installed gensim version.
q_body_vec_model = Word2Vec(tokenized_q_body, size=40, min_count=5, window=6).wv

In [36]:
q_body_vec_model.get_vector("the")       # Vector of a sample token; this model was trained on the question_body text, not question_title

array([ 3.1617975 , -0.23722498,  1.409274  , -1.2606311 ,  1.2696685 ,
        0.4476379 ,  1.275167  ,  1.7570505 , -1.0324943 ,  1.3642502 ,
       -1.3477625 ,  0.48774233, -1.049456  , -1.0753771 ,  1.0931644 ,
       -2.870512  , -0.4607861 , -2.0377328 ,  0.86905247,  0.97090083,
        2.2748911 , -1.0888474 , -1.7159182 ,  1.833918  ,  0.37761867,
        0.14117195, -0.28042018,  0.6066532 ,  1.2705051 , -0.5000519 ,
        3.198017  , -1.1838781 ,  2.0664518 , -0.34193024,  0.7450827 ,
       -1.0491439 ,  0.7574018 , -1.1073203 ,  1.7441026 , -1.9327567 ],
      dtype=float32)

In [39]:
# Nearest neighbours of "local" in the question-body model.
# `model` is not defined anywhere in this notebook (NameError on a fresh
# kernel); the surrounding cells work with q_body_vec_model, so query that.
q_body_vec_model.most_similar("local")

[('api', 0.9232462048530579),
 ('mysql', 0.9133329391479492),
 ('directory', 0.9076052904129028),
 ('via', 0.870556652545929),
 ('home', 0.866324782371521),
 ('host', 0.8631211519241333),
 ('vhosts', 0.861056923866272),
 ('facebook', 0.8583757877349854),
 ('cache', 0.8558299541473389),
 ('mysolution', 0.851349413394928)]

In [40]:
# Pool every tokenized text available (question titles, question bodies and
# answers) into a single training corpus.
all_sentences = [*tokenized_q_title, *tokenized_q_body, *tokenized_answer]

# Train Word2Vec on the pooled corpus and keep only the word vectors.
large_model = Word2Vec(all_sentences, size=40, min_count=5, window=6).wv

In [41]:
# Look up the vector that the pooled-corpus model learned for a sample token.
large_model.get_vector("framework")

array([ 0.12671103,  0.56058407,  0.29067558, -0.34022012, -0.2182405 ,
        0.2881518 , -0.34776968,  0.34765828,  0.36139885,  0.6663553 ,
       -0.01369049,  0.5019986 ,  0.24421854, -0.02098736, -0.01419102,
       -0.34052455,  0.33240741, -0.41203988, -0.21594433,  0.23528314,
        0.5927213 ,  0.4069568 ,  0.06646444, -0.15180974, -0.08081192,
       -0.01744933, -0.51154923, -0.39859498, -0.05054428,  0.1355095 ,
        0.3408807 ,  0.09500741,  0.21086383, -0.13736713,  0.49145755,
        0.53496087,  0.15213656, -0.15006955, -0.04783138,  0.39332587],
      dtype=float32)

In [42]:
# Nearest neighbours of "local" in the pooled-corpus model, for comparison with
# the question-body-only model queried earlier.
large_model.most_similar("local")
# Alternative query kept for experimentation (positive/negative word lists):
# large_model.most_similar(positive=["project"], negative=["wordpress"])

[('web', 0.8885996341705322),
 ('remote', 0.8876430988311768),
 ('media', 0.8769919872283936),
 ('router', 0.8736147880554199),
 ('desktop', 0.8681355714797974),
 ('library', 0.8669867515563965),
 ('home', 0.8624612092971802),
 ('host', 0.8615283966064453),
 ('directory', 0.8584851622581482),
 ('facebook', 0.8516136407852173)]

In [14]:
# Select the 1000 most frequently used words and sort them in descending order based on their frequency of usage
words = sorted(model.vocab.keys(), key=lambda word: model.vocab[word].count, reverse=True)[:1000]

print(words[::100])

['.', 'up', 'very', 'did', 'option', 'save', 'property', 'problems', 'packages', 'vector']
