## Installing required packages
Install the nltk, pandas, and gensim packages by running one of the following commands in a terminal or command prompt:
`pip install --upgrade nltk pandas gensim`
or
`conda install nltk pandas gensim`
depending on the Python environment you use.

In [1]:
import numpy as np
from nltk.tokenize import WordPunctTokenizer
import pandas as pd
from gensim.models import Word2Vec

In [2]:
# Read the training data from the working directory and preview its first five rows
df = pd.read_csv("./train.csv")
df.head(n=5)

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,...,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,...,0.777778,0.777778,0.555556,1.0,1.0,0.666667,0.0,0.333333,1.0,0.888889
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,...,0.888889,0.833333,0.333333,0.833333,1.0,0.8,0.0,0.0,1.0,1.0
4,5,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,leigero,https://graphicdesign.stackexchange.com/users/...,Check out Image Trace in Adobe Illustrator. \n...,q2ra,https://graphicdesign.stackexchange.com/users/...,http://graphicdesign.stackexchange.com/questio...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,1.0,1.0


In [3]:
# Pull out the three free-text columns that the following cells tokenize
question_title, question_body, answer = (
    df[column] for column in ("question_title", "question_body", "answer")
)

In [4]:
# Build the word/punctuation tokenizer and try it on one lower-cased question title
tokenizer = WordPunctTokenizer()
sample_title = question_title[20].lower()
print(tokenizer.tokenize(sample_title))

['how', 'often', 'do', 'i', 'need', 'to', 'change', 'my', 'oil', 'in', 'a', 'vw', 'tdi', '?']


In [5]:
# Tokenize every lower-cased entry of the question title, question body and answer columns
tokenized_q_title = [tokenizer.tokenize(text.lower()) for text in question_title]
tokenized_q_body = [tokenizer.tokenize(text.lower()) for text in question_body]
tokenized_answer = [tokenizer.tokenize(text.lower()) for text in answer]

In [6]:
# Inspect one tokenized answer to confirm the tokenizer output looks sensible
sample_answer_tokens = tokenized_answer[34]
print(sample_answer_tokens)

['i', 'think', 'you', 'need', 'to', 'read', 'about', 'projects', 'and', 'solutions', ':', 'http', '://', 'msdn', '.', 'microsoft', '.', 'com', '/', 'en', '-', 'us', '/', 'library', '/', 'ee817674', '.', 'aspx', 'and', 'after', 'that', 'it', 'all', 'becomes', 'more', 'clear', ':', 'have', 'one', 'solution', ',', 'inside', 'of', 'that', 'solution', 'create', 'a', 'project', 'for', 'your', 'application', 'and', 'a', 'project', 'for', 'your', 'unit', '-', 'tests', '.', 'in', 'test', '-', 'project', 'add', 'reference', 'to', 'the', 'testing', 'framework', 'of', 'your', 'choice', 'and', 'a', 'reference', 'to', 'your', 'application', '-', 'project', '.', 'this', 'way', 'your', 'application', 'does', 'not', 'know', 'about', 'your', 'tests', 'and', 'compiled', 'into', 'one', 'assembly', '.', 'at', 'the', 'same', 'time', 'your', 'tests', 'depend', 'on', 'your', 'application', ',', 'but', 'compiled', 'into', 'another', 'assembly', ',', 'which', 'can', 'be', 'used', 'by', 'your', 'test', '-', 'run

In [7]:
# Re-assemble the tokens of the same answer into one space-separated string
joined_answer = " ".join(tokenized_answer[34])
print(joined_answer)

i think you need to read about projects and solutions : http :// msdn . microsoft . com / en - us / library / ee817674 . aspx and after that it all becomes more clear : have one solution , inside of that solution create a project for your application and a project for your unit - tests . in test - project add reference to the testing framework of your choice and a reference to your application - project . this way your application does not know about your tests and compiled into one assembly . at the same time your tests depend on your application , but compiled into another assembly , which can be used by your test - runner gui / ci or whatever else you use . and to answer your next question , for test - project you need to choose project type of " library " ( console application will work as well if you like )


In [8]:
# Train a 40-dimensional Word2Vec model on the tokenized question bodies and keep
# only its word vectors (.wv). gensim 4.0 renamed the `size` keyword to
# `vector_size`, so try the current name first and fall back for gensim 3.x.
try:
    model = Word2Vec(tokenized_q_body, vector_size=40, min_count=5, window=6).wv
except TypeError:  # gensim < 4.0 does not accept `vector_size`
    model = Word2Vec(tokenized_q_body, size=40, min_count=5, window=6).wv

In [9]:
# Look up the learned embedding of one vocabulary word ("framework" is an arbitrary choice)
framework_vector = model.get_vector("framework")
framework_vector

array([ 0.18203267, -0.1384882 ,  0.00966626,  0.12690865, -0.28199565,
        0.14752762,  0.09058268,  0.18658027, -0.3836865 , -0.2635417 ,
       -0.26915047, -0.2784006 , -0.15969494, -0.20479788, -0.05521997,
        0.03018031,  0.65504354, -0.07847816,  0.24263659,  0.5578137 ,
       -0.18462616,  0.19743758, -0.37640756,  0.06871933,  0.04111285,
        0.23510045, -0.0541443 , -0.041373  , -0.1284858 ,  0.0113073 ,
       -0.18097658,  0.21724087,  0.4504592 ,  0.31373334,  0.15891254,
        0.23489682, -0.25656214,  0.07341169, -0.16092066,  0.40183562],
      dtype=float32)

In [10]:
# Words whose vectors are closest to "project" in the question-body model
model.most_similar(positive=["project"])

[('app', 0.859724760055542),
 ('website', 0.783863365650177),
 ('configuration', 0.7782309055328369),
 ('setup', 0.7724300026893616),
 ('wordpress', 0.7642975449562073),
 ('application', 0.7601040601730347),
 ('web', 0.7523177862167358),
 ('successfully', 0.7475973963737488),
 ('computer', 0.7407457828521729),
 ('working', 0.7406514286994934)]

In [11]:
# Full corpus: every tokenized question title, question body and answer, in that order
all_sentences = tokenized_q_title + tokenized_q_body + tokenized_answer

# Train a 40-dimensional Word2Vec model on the combined corpus and keep only its
# word vectors (.wv). gensim 4.0 renamed the `size` keyword to `vector_size`, so
# try the current name first and fall back for gensim 3.x.
try:
    large_model = Word2Vec(all_sentences, vector_size=40, min_count=5, window=6).wv
except TypeError:  # gensim < 4.0 does not accept `vector_size`
    large_model = Word2Vec(all_sentences, size=40, min_count=5, window=6).wv

In [12]:
# Embedding of the same word ("framework") in the model trained on the full corpus
large_framework_vector = large_model.get_vector("framework")
large_framework_vector

array([ 0.3438575 , -0.47977808, -0.37075818,  0.64499336, -0.15111974,
        0.11152916, -0.3981679 , -0.31392008, -0.22092393, -0.8772451 ,
       -0.21132411,  0.0458063 , -0.38828593, -0.38899082, -0.12015202,
       -0.34931326,  0.8198952 , -0.3177678 , -0.13692953,  0.60298496,
       -0.59684587,  0.12603085, -0.53726774, -0.16795228, -0.14232261,
        0.15524663, -0.46503446,  0.16323006, -0.3397545 , -0.08717102,
       -0.2570972 ,  0.20362075,  0.30667374,  0.12126993,  0.43788797,
       -0.43325633, -0.7696548 ,  0.06870513,  0.15370421,  0.10153654],
      dtype=float32)

In [13]:
# Words whose vectors are closest to "project" in the full-corpus model
large_model.most_similar(positive=["project"])
# Variation to try: large_model.most_similar(positive=["project"], negative=["wordpress"])

[('app', 0.8827044367790222),
 ('website', 0.8477051854133606),
 ('application', 0.8145719766616821),
 ('plugin', 0.8136230707168579),
 ('setup', 0.7963675260543823),
 ('configuration', 0.7911832332611084),
 ('blog', 0.7865884304046631),
 ('wordpress', 0.7836558818817139),
 ('browser', 0.7812919616699219),
 ('created', 0.7765824794769287)]

In [14]:
# Select the 1000 most frequently used words, sorted in descending order of frequency.
# gensim 4.0 removed KeyedVectors.vocab; per-word counts are read with get_vecattr()
# there, while gensim 3.x still exposes them through model.vocab[word].count.
if hasattr(model, "key_to_index"):  # gensim >= 4.0
    word_counts = {word: model.get_vecattr(word, "count") for word in model.key_to_index}
else:  # gensim 3.x
    word_counts = {word: entry.count for word, entry in model.vocab.items()}

words = sorted(word_counts, key=word_counts.get, reverse=True)[:1000]

# Print every 100th word as a quick sample across the frequency range
print(words[::100])

['.', 'up', 'very', 'did', 'option', 'save', 'property', 'problems', 'packages', 'vector']
