In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

  import pandas.util.testing as tm


In [26]:
pip install beautifulsoup4




In [0]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models import FastText
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from sklearn.neighbors import NearestNeighbors


In [4]:
# Load the first 30,000 rows of the questions file, keeping only the Id, Title and Body columns.
df_ques = pd.read_csv(
    '/content/drive/My Drive/Projects/pythonstalkoverflow/Questions.csv',
    encoding="ISO-8859-1",
    nrows=30000,
    usecols=['Id', 'Title', 'Body'],
)

df_ques.head()

Unnamed: 0,Id,Title,Body
0,469,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...


In [5]:
# Print the value at column position 2 for the first two rows (the raw question bodies).
for label, row in (('Question1:', 0), ('Question2:', 1)):
    print(label, df_ques.iloc[row, 2])

Question1: <p>I am using the Photoshop's javascript API to find the fonts in a given PSD.</p>

<p>Given a font name returned by the API, I want to find the actual physical font file that that font name corresponds to on the disc.</p>

<p>This is all happening in a python program running on OSX so I guess I'm looking for one of:</p>

<ul>
<li>Some Photoshop javascript</li>
<li>A Python function</li>
<li>An OSX API that I can call from python</li>
</ul>

Question2: <p>I have a cross-platform (Python) application which needs to generate a JPEG preview of the first page of a PDF.</p>

<p>On the Mac I am spawning <a href="http://developer.apple.com/documentation/Darwin/Reference/ManPages/man1/sips.1.html">sips</a>.  Is there something similarly simple I can do on Windows?</p>



##### **As we can see, some of the questions are quite long and some are just one line. There is a lot of cleaning that needs to be done.**

### **Pre-Processing**

In [0]:
### Using BeautifulSoup to clean the HTML text

def get_question(html_text):
  """Return the text of every <p> element in `html_text`, joined by single spaces.

  The markup is parsed with BeautifulSoup using the 'lxml' parser; only text
  found inside <p> elements is collected.
  """
  parsed = BeautifulSoup(html_text, 'lxml')
  paragraphs = parsed.find_all('p')
  return ' '.join(paragraph.text for paragraph in paragraphs)

# Replace each raw HTML body with its extracted paragraph text (overwrites the 'Body' column in place)
df_ques['Body'] = df_ques['Body'].apply(get_question)

In [7]:

# Preview the first five question bodies.
body_preview = df_ques['Body'].head()
print(' top 5 Question:', body_preview)


 top 5 Question: 0    I am using the Photoshop's javascript API to f...
1    I have a cross-platform (Python) application w...
2    I'm starting work on a hobby project with a py...
3    There are several ways to iterate over a resul...
4    I don't remember whether I was dreaming or not...
Name: Body, dtype: object


In [8]:
## Tokenizing setup
nltk.download('stopwords')
# Raw string so that `\S` reaches the regex engine as written, instead of relying on
# Python passing an unrecognised string escape through (which triggers a warning).
# Alternatives, in order: `@` followed by non-space characters, http/https links,
# `htt:`/`http:` plus one non-space character, and any run of non-alphanumerics.
# NOTE(review): the third alternative (`http?:\S`) looks like a typo of the second
# one -- confirm before removing it; it is kept here so matching is unchanged.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
def preprocess(text, stem=False):
    """Lower-case `text`, blank out everything TEXT_CLEANING_RE matches and drop stop words.

    Parameters
    ----------
    text : any
        Input to clean; it is converted with ``str()`` before processing.
    stem : bool, default False
        When true, every kept token is passed through ``stemmer.stem``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Each regex match is replaced by a space, so matches also act as token separators.
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)
# Clean every question body, overwriting the column (stemming stays at its default, off).
df_ques['Body'] = df_ques['Body'].apply(preprocess)

In [10]:
# Preview the first five entries of the 'Body' column
df_ques['Body'].head()

0    using photoshop javascript api find fonts give...
1    cross platform python application needs genera...
2    starting work hobby project python codebase wo...
3             several ways iterate result set tradeoff
4    remember whether dreaming seem recall function...
Name: Body, dtype: object

In [0]:
#len(question_list)
#[' '.join(question) for question in df_ques['Body']]

In [0]:
# Split each body on whitespace to get one token list per question.
documents = list(map(str.split, df_ques.Body))


In [12]:
# Number of token lists built from the question bodies
len(documents)

30000

### **In this notebook, I will be comparing Word2Vec and FastText**

#### **Word2Vec**

In [13]:
n = 50  # passed as the vector size to the embedding models
# Create the model without a corpus so that it is trained exactly once below.
# Passing `documents` to the constructor would already run a training pass, and the
# explicit train() call would then train the same model a second time.
model = Word2Vec(size = n, window = 8)
model.build_vocab(documents)

#Training model using questions corpora
model.train(documents, total_examples=len(documents), epochs=10)


(12221933, 13741490)

#### Let's Inspect the words by checking the most similar words in the corpus

In [19]:
# Query the model's word vectors for the entries most similar to 'database'
word_vectors = model.wv
word_vectors.most_similar(positive='database')

  if np.issubdtype(vec.dtype, np.int):


[('db', 0.8869057297706604),
 ('sqlite', 0.7295145988464355),
 ('tables', 0.7214229106903076),
 ('postgres', 0.7214202880859375),
 ('databases', 0.6955385804176331),
 ('querying', 0.6863582134246826),
 ('postgresql', 0.683992862701416),
 ('table', 0.6683526039123535),
 ('mysql', 0.6638216972351074),
 ('schema', 0.635815441608429)]

## **We can see that Word2Vec worked well on this corpus. Now let's see how FastText performs on it.**

**The main difference between FastText and Word2Vec is that FastText uses sub-word information (i.e. character n-grams). While this brings additional utility to the embeddings, it also considerably slows down the process.**

In [0]:
# FastText embeddings using the same vector size and window as the Word2Vec model.
ft_model = FastText(
    documents,
    size=n,
    window=8,
    min_count=5,
    workers=2,
    sg=1,  # skip-gram
)

In [17]:
# Nearest neighbours of 'array' in the FastText vector space.
array_neighbours = ft_model.wv.most_similar('array')
print('Words similar to "array" are: ', array_neighbours)

Words similar to "array" are:  [('arrays', 0.925803005695343), ('ndarray', 0.9246947765350342), ('asarray', 0.9188598394393921), ('recarray', 0.9007734656333923), ('arraylist', 0.8880296945571899), ('dimension', 0.8681841492652893), ('bitarray', 0.8564947843551636), ('dimensional', 0.8557138442993164), ('multidimensional', 0.8469366431236267), ('1d', 0.8217758536338806)]


  if np.issubdtype(vec.dtype, np.int):


In [18]:
# Nearest neighbours of 'database' in the FastText vector space.
database_neighbours = ft_model.wv.most_similar('database')
print('Words similar to "database" are: ', database_neighbours)

Words similar to "database" are:  [('databases', 0.9340032339096069), ('db', 0.8906856775283813), ('postgres', 0.8789547681808472), ('dbs', 0.8717832565307617), ('postgresql', 0.8495760560035706), ('sqlite', 0.8375387191772461), ('databaseconnection', 0.8281651735305786), ('mssql', 0.8183775544166565), ('sqlobject', 0.815682590007782), ('zodb', 0.8081094026565552)]


  if np.issubdtype(vec.dtype, np.int):


#### **We have noticed that skip-gram worked better with FastText**

Here we can see that FastText has produced different vector embeddings. 'array' is now close to words that also contain the n-gram 'array', and 'database' is close to different n-grams of the word 'database' plus some variations of database tools.

We can clearly see the difference between the embedding methods: Word2Vec places words that occur in the same contexts close together in the vector space, while FastText does the same but also allows less frequent words to be incorporated into this vector space. The use of n-grams really does play a key role in word embeddings; hence, I will proceed with FastText embeddings as the basis for sentence embeddings.

In [0]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


In [0]:
# Tag each token list with its position in `documents` so it can be looked up by that index.
documents1 = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(documents)
]
# NOTE: this rebinds `model`, which held the Word2Vec model earlier in the notebook.
model = Doc2Vec(
    documents1,
    vector_size=n,
    window=8,
    min_count=5,
    workers=2,
    dm=1,
    epochs=20,
)

In [43]:
# Show one question body and the bodies of the documents the Doc2Vec model ranks closest to it.
query_id = 42
print(df_ques.Body[query_id], ' \nis similar to \n')
neighbours = model.docvecs.most_similar(query_id)
print([df_ques.Body[hit[0]] for hit in neighbours])

best way sanitise user input python based web application single function remove html characters necessary characters combinations prevent xss sql injection attack  
is similar to 

['cgi escape seems like one possible choice work well something considered better', 'want use sqlite memory database testing postgresql development production server sql syntax dbs ex sqlite autoincrement postgresql serial easy port sql script sqlite postgresql solutions want use standard sql go generating primary key databases', 'script loops series four less characters strings example able implement nested loops like sort loop nesting bad thing would better way accomplishing', 'simply delete directory python installation lingering files must delete', 'code getting error conditional wrong', 'python function sanitisation input parameters caters arguments passed none rather empty strings easier concise way loop round function parameters apply expression actual function nine parameters', 'question concerns tw

  if np.issubdtype(vec.dtype, np.int):
