## Installing required packages
Install the nltk, pandas, gensim and scikit-learn packages (numpy is installed as a dependency of these) by running one of the following commands in a terminal or command prompt:
`pip install --upgrade nltk pandas gensim scikit-learn`
or
`conda install nltk pandas gensim scikit-learn`
depending on the Python environment of your choice.

In [2]:
import numpy as np
from nltk.tokenize import WordPunctTokenizer
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

In [3]:
# Read the training CSV (path is relative to the notebook's working directory)
# and preview its first rows.
df = pd.read_csv(filepath_or_buffer="./train.csv")
df.head(n=5)

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,...,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,...,0.777778,0.777778,0.555556,1.0,1.0,0.666667,0.0,0.333333,1.0,0.888889
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,...,0.888889,0.833333,0.333333,0.833333,1.0,0.8,0.0,0.0,1.0,1.0
4,5,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,leigero,https://graphicdesign.stackexchange.com/users/...,Check out Image Trace in Adobe Illustrator. \n...,q2ra,https://graphicdesign.stackexchange.com/users/...,http://graphicdesign.stackexchange.com/questio...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,1.0,1.0


In [4]:
# Number of rows loaded from the CSV
print("Total records in the data frame: {}".format(len(df)))

Total records in the data frame: 6079


In [5]:
# Column 0 is skipped; the next ten column names (positions 1-10) are used
# for X and the thirty after them (positions 11-40) for Y.
# NOTE(review): the truncated preview hides the columns right after `category`;
# confirm that positions 11-40 are exactly the intended label columns.
x_columns = df.columns[1:11].tolist()
y_columns = df.columns[11:41].tolist()
len(y_columns)

30

In [6]:
def returnColumn(x, start, stop):
    """Return the items of ``x`` from position ``start`` up to, but not
    including, ``stop`` as a new list."""
    return [item for item in x[start:stop]]

In [7]:
# Build the feature frame by selecting the x_columns by name, so every value
# stays under its own header. Slicing each row positionally from 0 picked up
# the leading "Unnamed: 0" column, while x_columns starts at column 1, which
# put every value under the name of the following column.
X = df.loc[:, x_columns].copy()
X.shape

(6079, 10)

In [8]:
# Gather positions 11..40 of every record and label them with y_columns.
# NOTE(review): assumes these positions hold exactly the intended label
# columns — confirm against the full column list.
label_rows = [list(record[11:41]) for record in df.values]
Y = pd.DataFrame(label_rows, columns=y_columns)
Y.shape

(6079, 30)

In [9]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore everything derived from the split, repeatable
# across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(4863, 10) (4863, 30)
(1216, 10) (1216, 30)


In [3]:
# Pull out the three free-text columns that get tokenized below
question_title = df.loc[:, "question_title"]
question_body = df.loc[:, "question_body"]
answer = df.loc[:, "answer"]

In [4]:
# Word/punctuation tokenizer reused by the cells below; try it on one
# lower-cased question title first.
tokenizer = WordPunctTokenizer()
sample_title = question_title[20].lower()
print(tokenizer.tokenize(sample_title))

['how', 'often', 'do', 'i', 'need', 'to', 'change', 'my', 'oil', 'in', 'a', 'vw', 'tdi', '?']


In [5]:
# Lower-case and tokenize every question title, question body and answer
tokenized_q_title = [tokenizer.tokenize(title.lower()) for title in question_title]

tokenized_q_body = [tokenizer.tokenize(body.lower()) for body in question_body]

tokenized_answer = [tokenizer.tokenize(reply.lower()) for reply in answer]

In [6]:
# Spot-check the tokens produced for one answer (position 34)
answer_tokens = tokenized_answer[34]
print(answer_tokens)

['i', 'think', 'you', 'need', 'to', 'read', 'about', 'projects', 'and', 'solutions', ':', 'http', '://', 'msdn', '.', 'microsoft', '.', 'com', '/', 'en', '-', 'us', '/', 'library', '/', 'ee817674', '.', 'aspx', 'and', 'after', 'that', 'it', 'all', 'becomes', 'more', 'clear', ':', 'have', 'one', 'solution', ',', 'inside', 'of', 'that', 'solution', 'create', 'a', 'project', 'for', 'your', 'application', 'and', 'a', 'project', 'for', 'your', 'unit', '-', 'tests', '.', 'in', 'test', '-', 'project', 'add', 'reference', 'to', 'the', 'testing', 'framework', 'of', 'your', 'choice', 'and', 'a', 'reference', 'to', 'your', 'application', '-', 'project', '.', 'this', 'way', 'your', 'application', 'does', 'not', 'know', 'about', 'your', 'tests', 'and', 'compiled', 'into', 'one', 'assembly', '.', 'at', 'the', 'same', 'time', 'your', 'tests', 'depend', 'on', 'your', 'application', ',', 'but', 'compiled', 'into', 'another', 'assembly', ',', 'which', 'can', 'be', 'used', 'by', 'your', 'test', '-', 'run

In [7]:
# Stitch that answer's tokens back together, separated by single spaces
print(str.join(" ", tokenized_answer[34]))

i think you need to read about projects and solutions : http :// msdn . microsoft . com / en - us / library / ee817674 . aspx and after that it all becomes more clear : have one solution , inside of that solution create a project for your application and a project for your unit - tests . in test - project add reference to the testing framework of your choice and a reference to your application - project . this way your application does not know about your tests and compiled into one assembly . at the same time your tests depend on your application , but compiled into another assembly , which can be used by your test - runner gui / ci or whatever else you use . and to answer your next question , for test - project you need to choose project type of " library " ( console application will work as well if you like )


In [8]:
# Train 40-dimensional word vectors on the tokenized question bodies and keep
# only the trained vectors (`.wv`). The embedding-size keyword is `vector_size`
# in gensim >= 4.0 and `size` in older releases, so try the new name first.
try:
    model = Word2Vec(tokenized_q_body, vector_size=40, min_count=5, window=6).wv
except TypeError:
    # gensim < 4.0 does not accept `vector_size`
    model = Word2Vec(tokenized_q_body, size=40, min_count=5, window=6).wv

In [9]:
# Embedding of a sample word; `model` was trained on the question bodies,
# so the word has to be part of that vocabulary.
model["framework"]

array([-0.23921941,  0.13013904, -0.19032788,  0.01522156, -0.47804752,
        0.16582343, -0.19456829, -0.21829945, -0.2174466 ,  0.30487815,
        0.22614269, -0.41361937, -0.0401662 ,  0.5422716 ,  0.1694647 ,
        0.5381826 , -0.2069952 , -0.33938038,  0.04561912,  0.16476537,
       -0.16241425,  0.09392896,  0.00770915,  0.3346215 ,  0.25071475,
       -0.10255306, -0.03189423,  0.20490131, -0.5458475 ,  0.05711024,
        0.36852008, -0.1193844 ,  0.001635  ,  0.47262362, -0.19054806,
        0.02217318,  0.17378293,  0.06826955,  0.11293535, -0.19327132],
      dtype=float32)

In [10]:
# Nearest neighbours of "project" in the question-body embedding space
model.most_similar(positive=["project"])

[('app', 0.8447062969207764),
 ('wordpress', 0.7844392657279968),
 ('website', 0.7842112183570862),
 ('setup', 0.7810954451560974),
 ('configuration', 0.7742068767547607),
 ('application', 0.7726293802261353),
 ('successfully', 0.7659796476364136),
 ('web', 0.7500545978546143),
 ('blog', 0.7442182302474976),
 ('site', 0.7438467144966125)]

In [11]:
# Whole corpus available in the dataset: titles first, then bodies, then answers
all_sentences = tokenized_q_title + tokenized_q_body + tokenized_answer

# Same hyper-parameters as the question-body model, trained on the full corpus.
# The embedding-size keyword is `vector_size` in gensim >= 4.0 and `size` in
# older releases, so try the new name first.
try:
    large_model = Word2Vec(all_sentences, vector_size=40, min_count=5, window=6).wv
except TypeError:
    # gensim < 4.0 does not accept `vector_size`
    large_model = Word2Vec(all_sentences, size=40, min_count=5, window=6).wv

In [12]:
# Embedding of the same sample word, this time from the full-corpus model
large_model["framework"]

array([ 0.22614406,  0.18407774,  0.6273944 ,  0.94544655,  0.22035408,
        0.18214102, -0.3569308 , -0.0504012 , -0.2627686 , -0.21002813,
       -0.16079727,  0.26206252,  0.1478542 ,  0.68146396, -0.01279862,
        0.36174372,  0.579124  , -0.4415604 ,  0.03232605,  0.17362575,
       -0.14464389,  0.09915953,  0.51756305,  0.49285233,  0.59476954,
        0.12805662, -0.5631401 , -0.35877237, -0.37548807,  0.40246978,
        0.63577414, -0.19661742, -0.18706313,  0.35731423, -0.5009254 ,
       -0.15280549,  0.07309064,  0.03921694, -0.05773719, -0.3923961 ],
      dtype=float32)

In [13]:
# Nearest neighbours of "project" in the full-corpus embedding space.
# Variation to try: also pass negative=["wordpress"] to push results away from that word.
large_model.most_similar(positive=["project"])

[('app', 0.9100373387336731),
 ('application', 0.826718270778656),
 ('website', 0.8237828016281128),
 ('setup', 0.8108820915222168),
 ('client', 0.7995128035545349),
 ('browser', 0.7990148067474365),
 ('internet', 0.7952461242675781),
 ('plugin', 0.7892935276031494),
 ('wordpress', 0.7768241763114929),
 ('web', 0.7724286913871765)]

In [14]:
# Select the 1000 most frequently used words, most frequent first.
if hasattr(model, "index_to_key"):
    # gensim >= 4.0: `.vocab` was removed; per-word counts come from get_vecattr
    words = sorted(model.index_to_key, key=lambda word: model.get_vecattr(word, "count"), reverse=True)[:1000]
else:
    # gensim < 4.0: counts live on the entries of the `.vocab` dict
    words = sorted(model.vocab.keys(), key=lambda word: model.vocab[word].count, reverse=True)[:1000]

# Show every 100th word as a quick sample across the frequency range
print(words[::100])

['.', 'up', 'very', 'did', 'option', 'save', 'property', 'problems', 'packages', 'vector']
