In [1]:
!pip install psycopg2



In [2]:
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler


%matplotlib inline

In [3]:
connection = pg2.connect(host='postgres',
                         user='postgres',
                         database='postgres')

In [4]:
def connect_to_db():
    con = pg2.connect(host='postgres', 
                      dbname='postgres', 
                      user='postgres')
    cur = con.cursor(cursor_factory=RealDictCursor)
    return con, cur


#creates a connection and a cursor
#uses the cursor to execute a query
#if fetch_res is True it fetches the results, otherwise results are None
#closes the connection and returns results
def query_to_dictionary(query, fetch_res=True):
    con, cur = connect_to_db()
    cur.execute(query)
    if fetch_res:
        results = cur.fetchall()
    else:
        results = None
    con.close()
    return results

def query_to_dataframe(query):
    return DataFrame(query_to_dictionary(query))

In [5]:
query_to_dataframe('SELECT * FROM contents LIMIT 5')


Unnamed: 0,category,content,pageid,title
0,Business software,Business_Software,1037763,Business software or a business application is...
1,AccuSystems,Business_Software,41270069,AccuSystems LLC is an American company headqua...
2,Active policy management,Business_Software,5211212,Active policy management is business-oriented ...
3,Alexandria (library software),Business_Software,28502793,Alexandria is browser based cross-platform lib...
4,Alteryx,Business_Software,44133735,Alteryx is an American computer software compa...


In [6]:
content_q_df = query_to_dataframe("""
    SELECT category as title, content as category, pageid, title as content
    FROM contents"""
)


In [7]:
content_q_df.head()


Unnamed: 0,category,content,pageid,title
0,Business_Software,Business software or a business application is...,1037763,Business software
1,Business_Software,AccuSystems LLC is an American company headqua...,41270069,AccuSystems
2,Business_Software,Active policy management is business-oriented ...,5211212,Active policy management
3,Business_Software,Alexandria is browser based cross-platform lib...,28502793,Alexandria (library software)
4,Business_Software,Alteryx is an American computer software compa...,44133735,Alteryx


In [8]:
content_q_df.shape


(4130, 4)

In [9]:
from sklearn.preprocessing import LabelEncoder


In [11]:
import re

In [12]:
query = """SELECT category, COUNT( title) FROM contents GROUP BY category;"""
query = re.sub( '\s+', " ", query)
query_to_dataframe( query)

Unnamed: 0,category,count
0,Oracle Fusion Middleware,1
1,OJB,1
2,Lexicon (program),1
3,Koru search engine,1
4,Local case-control sampling,1
5,Structured sparsity regularization,1
6,Restaurant Bigwig,1
7,Application sharing,1
8,List of content management systems,1
9,Truncation selection,1


In [13]:
query = """SELECT * 
FROM
( (SELECT pageid, title FROM contents WHERE category = 'Business_Software') as BS 
INNER JOIN 
(SELECT pageid, title FROM contents WHERE category = 'Machine_Learning' ) as ML
ON bs.pageid = ML.pageid
) stuff;"""

query = re.sub( '\s+', " ", query)
query

"SELECT * FROM ( (SELECT pageid, title FROM contents WHERE category = 'Business_Software') as BS INNER JOIN (SELECT pageid, title FROM contents WHERE category = 'Machine_Learning' ) as ML ON bs.pageid = ML.pageid ) stuff;"

In [14]:
query_to_dataframe(query)

In [15]:
content_q_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4130 entries, 0 to 4129
Data columns (total 4 columns):
category    4130 non-null object
content     4115 non-null object
pageid      4130 non-null int64
title       4130 non-null object
dtypes: int64(1), object(3)
memory usage: 129.1+ KB


In [16]:
content_q_df.isnull().sum()


category     0
content     15
pageid       0
title        0
dtype: int64

In [17]:
content_q_df.dropna(axis=0,inplace=True)

In [18]:
content_q_df.isnull().sum()


category    0
content     0
pageid      0
title       0
dtype: int64

In [19]:
content_q_df.content.values[0]


"Business software or a business application is any software or set of computer programs used by business users to perform various business functions. These business applications are used to increase productivity, to measure productivity and to perform other business functions accurately.\nBy and large, business software is likely to be developed to meet the needs of a specific business, and therefore is not easily transferable to a different business environment, unless its nature and operation is identical. Due to the unique requirements of each business, off-the-shelf software is unlikely to completely address a company's needs. However, where an on-the-shelf solution is necessary, due to time or monetary considerations, some level of customization is likely to be required. Exceptions do exist, depending on the business in question, and thorough research is always required before committing to bespoke or off-the-shelf solutions.\nSome business applications are interactive, i.e., the

In [20]:
content_q_df['content'].sample(10).values


array([ 'In statistics, \'Markov chain Monte Carlo\' (MCMC) methods are a class of algorithms for sampling from a probability distribution based on constructing a Markov chain that has the desired distribution as its equilibrium distribution. The state of the chain after a number of steps is then used as a sample of the desired distribution. The quality of the sample improves as a function of the number of steps.\n\nRandom walk Monte Carlo methods make up a large subclass of MCMC methods.\n\n\n== Application domains ==\nMCMC methods are primarily used for calculating numerical approximations of multi-dimensional integrals, for example in Bayesian statistics, computational physics, computational biology and computational linguistics.\nIn Bayesian statistics, the recent development of MCMC methods has been a key step in making it possible to compute large hierarchical models that require integrations over hundreds or even thousands of unknown parameters.\nThey are also used for generatin

In [None]:
content_lower = content_q_df.content.str.lower()


In [21]:
!pip install spacy



In [22]:
!python -m spacy download en



    Downloading en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz (52.2MB)
[K    100% |████████████████████████████████| 52.2MB 58.9MB/s ta 0:00:01

[93m    Linking successful[0m

    /opt/conda/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
    --> /opt/conda/lib/python3.6/site-packages/spacy/data/en

    You can now load the model via spacy.load('en').



In [23]:
import re
from spacy.en import STOP_WORDS
from spacy.en import English
nlp = English()

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [24]:
def cleaner(text):
    text = re.sub('&#39;','',text).lower()
    text = re.sub('<br />','',text)
    text = re.sub('<.*>.*</.*>','', text)
    text = re.sub('\\ufeff', '', text)
    text = re.sub('[\d]','',text)
    text = re.sub('[^a-z ]','',text)
    text = re.sub("\d+\.\d*", "", text)
    text = ' '.join(i.lemma_ for i in nlp(text) if i.orth_ not in STOP_WORDS)
    text = ' '.join(text.split())
    return text

In [25]:
content_q_df['clean_content'] = content_q_df['content'].apply(cleaner)


In [26]:
content_q_df.head()

Unnamed: 0,category,content,pageid,title,clean_content
0,Business_Software,Business software or a business application is...,1037763,Business software,business software business application softwar...
1,Business_Software,AccuSystems LLC is an American company headqua...,41270069,AccuSystems,accusystem llc american company headquarter pu...
2,Business_Software,Active policy management is business-oriented ...,5211212,Active policy management,active policy management businessorient enterp...
3,Business_Software,Alexandria is browser based cross-platform lib...,28502793,Alexandria (library software),alexandria browser base crossplatform library ...
4,Business_Software,Alteryx is an American computer software compa...,44133735,Alteryx,alteryx american computer software company bas...


In [27]:
content_q_df['clean_content'].sample(2).values


array([ 'parity learning problem machine learn algorithm solve problem guess function give sample x x assurance compute parity bit fix location sample generate distribution input problem easy solve gaussian elimination provide sufficient number sample distribution skew provide algorithm noisy version learn parity noise version sample contain error instead sample x x algorithm provide x y y x small probability noisy version parity learn problem conjecture hard learn error reference avrim blum adam kalai hal wasserman noisetolerant learn parity problem statistical query model j acm adam tauman kalai yishay mansour elad verbin agnostic boosting parity learn proceeding th annual acm symposium theory compute victoria british columbia canada acm httpportalacmorgcitationcfmidod regev lattice learn error random linear code cryptography proceeding thirtyseventh annual acm symposium theory compute baltimore md usa acm httpportalacmorgcitationcfmid',
       'structured sparsity regularization cla

In [28]:
tfidf_vectorizer = TfidfVectorizer(min_df= 2, max_df= .95, ngram_range=(1,2), stop_words= "english")


In [29]:
vectorizer = CountVectorizer(min_df=3, max_df=.9, stop_words = "english")


In [30]:
doc_term_matrix_sp = vectorizer.fit_transform(content_q_df.clean_content.str.lower())


In [31]:
doc_term_matrix_sp_tf = tfidf_vectorizer.fit_transform(content_q_df.clean_content)


In [32]:
doc_term_matrix_sp_tf

<4115x188921 sparse matrix of type '<class 'numpy.float64'>'
	with 1372199 stored elements in Compressed Sparse Row format>

In [33]:
doc_term_matrix_sp_tf.shape


(4115, 188921)

In [34]:
doc_term_matrix_sp

<4115x18873 sparse matrix of type '<class 'numpy.int64'>'
	with 736932 stored elements in Compressed Sparse Row format>

In [35]:
doc_term_matrix_sp.shape


(4115, 18873)

In [36]:
from sklearn.decomposition import TruncatedSVD


In [37]:
n_components = 400
SVD = TruncatedSVD(n_components)
components_names = ["component_"+str(i+1) for i in range(n_components)]

In [38]:
latent_semantic_analysis = SVD.fit_transform(doc_term_matrix_sp_tf)


In [39]:
len(SVD.explained_variance_ratio_)


400

In [40]:
SVD.explained_variance_ratio_


array([ 0.00292268,  0.01368549,  0.00750553,  0.00537311,  0.00464621,
        0.00384325,  0.0033631 ,  0.00320244,  0.0030688 ,  0.00281997,
        0.00263468,  0.00250812,  0.0022863 ,  0.00215572,  0.00204309,
        0.00193747,  0.00188146,  0.00184703,  0.0018066 ,  0.00175869,
        0.00174151,  0.00172505,  0.00163545,  0.00160161,  0.00156387,
        0.00154277,  0.00149312,  0.00146708,  0.00146421,  0.00144979,
        0.00140365,  0.0013816 ,  0.00135882,  0.00133618,  0.00132423,
        0.00129957,  0.00128273,  0.00126747,  0.00126013,  0.00124294,
        0.00122619,  0.00120419,  0.00117867,  0.00117011,  0.00115653,
        0.00115034,  0.00114819,  0.00113496,  0.00110529,  0.00108892,
        0.00108016,  0.00106896,  0.00105858,  0.00105553,  0.00104858,
        0.00104291,  0.00103089,  0.00102056,  0.00101725,  0.00100917,
        0.00099667,  0.0009938 ,  0.00098406,  0.0009725 ,  0.00096583,
        0.00096009,  0.00095704,  0.0009539 ,  0.000944  ,  0.00

# searching
## cosine-similarities

In [41]:
search_term = "Neural Network"


In [42]:
search_term_vec = tfidf_vectorizer.transform([search_term])
search_term_lsa = SVD.transform(search_term_vec)

In [43]:
cosine_similarities = latent_semantic_analysis.dot(search_term_lsa.T).ravel()


In [44]:
cosine_similarities.argsort()[:-6:-1]


array([3360, 3394, 3281, 3390, 3393])

In [45]:
print(content_q_df.loc[3383]['clean_content'][:500])


snarc stochastic neural analog reinforcement calculator neural net machine design marvin lee minsky george miller gather funding project air force office scientific research summer time minskys graduate student princeton dean edmund volunteer good electronic minsky bring projectthe machine randomly connect network approximately hebb synaps synapse memory hold probability signal come input signal come output probability knob go show probability signal propagate probability signal get capacitor re


In [46]:
"neural" in content_q_df.loc[3383]['clean_content']


True

# Top 5 related articles and their catogories based on the search term¶


In [47]:
top_5 = [3383, 3418, 3334, 3417, 3375]

for n in top_5:
    print( content_q_df.loc[n]["title"] , " : ",content_q_df.loc[n]["category"])



print(content_q_df.loc[3383]["title"])

Stochastic neural analog reinforcement calculator  :  Machine_Learning
Gesture Description Language  :  Machine_Learning
Helmholtz machine  :  Machine_Learning
Causal Markov condition  :  Machine_Learning
Reservoir computing  :  Machine_Learning
Stochastic neural analog reinforcement calculator


In [50]:
print([content_q_df.loc[n]["title"] for n in top_5])


['Stochastic neural analog reinforcement calculator', 'Gesture Description Language', 'Helmholtz machine', 'Causal Markov condition', 'Reservoir computing']


In [51]:
search_term1 = "customer"


In [52]:
search_term_vec1 = tfidf_vectorizer.transform([search_term1])
search_term_lsa1 = SVD.transform(search_term_vec1)

In [53]:
cosine_similarities_1 = latent_semantic_analysis.dot(search_term_lsa1.T).ravel()


In [54]:
cosine_similarities_1.argsort()[:-6:-1]


array([3039, 2496, 2493, 1803, 1176])

In [55]:
print(content_q_df.loc[2512]['clean_content'][:500])


sap crm application integrate customer relationship management crm software manufacture sap se target business software requirement midsize large organization industry sector overview acquisition hybris sap gradually realign strategy crm solution space mainly take market leader salesforcecom cloud base solution bid competitive future focused sap shift cloud base crm solution traditional onpremise crmsap consolidate crm solution hybris brand customer engagement commerce sap offer variety solution


In [56]:
"customer" in content_q_df.loc[2512]['clean_content']


True

In [57]:
top2_5 = [3064, 2512, 2509, 1812, 1182]

for n in top2_5:
    print( content_q_df.loc[n]["title"] , " : ",content_q_df.loc[n]["category"])

Apprenticeship learning  :  Machine_Learning
SAP CRM  :  Business_Software
User:Eli32167/sandbox  :  Business_Software
FirstClass  :  Business_Software
Marketing automation  :  Business_Software


In [58]:
#from sklearn.neighbors import NearestNeighbors
#NN = NearestNeighbors()
#NN.fit(latent_semantic_analysis)

In [59]:
type(doc_term_matrix_sp)


scipy.sparse.csr.csr_matrix

In [60]:
vocabulary_expression = pd.DataFrame(SVD.components_,
                                     index=components_names,
                                     columns=vectorizer.get_feature_names()).T

#IT SEEMS I DIDN'T PASSED THE CORRECT DATA SHAPE.

ValueError: Shape of passed values is (188921, 400), indices imply (18873, 400)

In [62]:
for i in range(1,6):
    vocabulary_expression['abs_component_{}'.format(i)] = np.abs(vocabulary_expression['component_{}'.format(i)])

NameError: name 'vocabulary_expression' is not defined

In [61]:
vocabulary_expression[['component_1']].sort_values('component_1',ascending=False).head(5)


NameError: name 'vocabulary_expression' is not defined

In [63]:
vocabulary_expression[['component_2']].sort_values('component_2',ascending=False).head(5)


NameError: name 'vocabulary_expression' is not defined