In [1]:
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

%matplotlib inline

In [2]:
connection = pg2.connect(host='postgres',
                         user='postgres',
                         database='postgres')

In [3]:
#Contains our connection credentials and returns a connection and cursor.
def connect_to_db():
    con = pg2.connect(host='postgres', 
                      dbname='postgres', 
                      user='postgres')
    cur = con.cursor(cursor_factory=RealDictCursor)
    return con, cur


#creates a connection and a cursor
#uses the cursor to execute a query
#if fetch_res is True it fetches the results, otherwise results are None
#closes the connection and returns results
def query_to_dictionary(query, fetch_res=True):
    con, cur = connect_to_db()
    cur.execute(query)
    if fetch_res:
        results = cur.fetchall()
    else:
        results = None
    con.close()
    return results

def query_to_dataframe(query):
    return DataFrame(query_to_dictionary(query))

In [5]:
content_q_df = query_to_dataframe("""
    SELECT category as title, content as category, pageid, title as content
    FROM contents"""
)

In [6]:
content_q_df.head()


Unnamed: 0,category,content,pageid,title
0,Business_Software,Business software or a business application is...,1037763,Business software
1,Business_Software,AccuSystems LLC is an American company headqua...,41270069,AccuSystems
2,Business_Software,Active policy management is business-oriented ...,5211212,Active policy management
3,Business_Software,Alexandria is browser based cross-platform lib...,28502793,Alexandria (library software)
4,Business_Software,Alteryx is an American computer software compa...,44133735,Alteryx


In [7]:
content_q_df["content"][:5]


0    Business software or a business application is...
1    AccuSystems LLC is an American company headqua...
2    Active policy management is business-oriented ...
3    Alexandria is browser based cross-platform lib...
4    Alteryx is an American computer software compa...
Name: content, dtype: object

In [8]:
content_q_df.shape


(4130, 4)

In [9]:
content_q_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4130 entries, 0 to 4129
Data columns (total 4 columns):
category    4130 non-null object
content     4115 non-null object
pageid      4130 non-null int64
title       4130 non-null object
dtypes: int64(1), object(3)
memory usage: 129.1+ KB


In [10]:
content_q_df.isnull().sum()

category     0
content     15
pageid       0
title        0
dtype: int64

In [11]:
content_q_df.dropna(axis=0,inplace=True)

In [12]:
X = content_q_df["content"]
y = content_q_df["category"]

In [13]:
X.shape, y.shape


((4115,), (4115,))

In [15]:
X.content = X.apply(lambda x: x.lower())


In [16]:
np.where(X.content.apply(lambda x: len(x)).values < 100)


(array([ 776, 1236]),)

In [18]:
X.iloc[776]

'\n== References =='

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 8)


In [20]:
X_train[X_train.index == 776]


776    \n== References ==
Name: content, dtype: object

In [22]:
X_train_2 = X_train[X_train.index != 776]


In [23]:
content_q_df.content.str.lower().sample(1).values


array([ 'in artificial intelligence, an evolutionary algorithm (ea) is a subset of evolutionary computation, a generic population-based metaheuristic optimization algorithm. an ea uses mechanisms inspired by biological evolution, such as reproduction, mutation, recombination, and selection. candidate solutions to the optimization problem play the role of individuals in a population, and the fitness function determines the quality of the solutions (see also loss function). evolution of the population then takes place after the repeated application of the above operators.\nevolutionary algorithms often perform well approximating solutions to all types of problems because they ideally do not make any assumption about the underlying fitness landscape. techniques from evolutionary algorithms applied to the modeling of biological evolution are generally limited to explorations of microevolutionary processes and planning models based upon cellular processes. in most real applications of eas, 

In [24]:
rf_pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('svd', TruncatedSVD(n_components=400)),
    ('clf', RandomForestClassifier())
])

In [25]:
rfparams = {
    'vec__min_df':[2],
    'vec__max_df':[.95],
    'vec__ngram_range':[(1,2)],
    'clf__n_estimators':[200],
    'clf__max_features':['auto']
}

In [26]:
rf_gs = GridSearchCV(rf_pipe, param_grid=rfparams, cv=5)


In [27]:
rf_gs.fit(X_train, y_train)


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'vec__min_df': [2], 'vec__max_df': [0.95], 'vec__ngram_range': [(1, 2)], 'clf__n_estimators': [200], 'clf__max_features': ['auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [28]:
rf_gs.score(X_train, y_train)

0.99837977965003244

In [29]:
rf_gs.score(X_test, y_test)


0.9611273080660836

In [30]:
rf_gs.best_params_


{'clf__max_features': 'auto',
 'clf__n_estimators': 200,
 'vec__max_df': 0.95,
 'vec__min_df': 2,
 'vec__ngram_range': (1, 2)}

In [31]:
rf_gs.best_score_


0.96338302009073229

In [32]:
rf_gs.predict(X_test)


array(['Machine_Learning', 'Business_Software', 'Machine_Learning', ...,
       'Business_Software', 'Business_Software', 'Business_Software'], dtype=object)

In [33]:
y_preds = rf_gs.predict(y_test)
len(y_preds)

1029

In [34]:
y_pred_proba = rf_gs.predict_proba(y_test)
y_pred_proba

array([[ 0.485,  0.515],
       [ 0.485,  0.515],
       [ 0.485,  0.515],
       ..., 
       [ 0.485,  0.515],
       [ 0.485,  0.515],
       [ 0.485,  0.515]])

In [35]:
!pip install wikipedia


Collecting wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [36]:
import re
import wikipedia
import requests
import json

In [37]:
def generate_category(category):
    '''
    format a category for insertion in to a wikipedia api call
    '''
    category = re.sub('\s','+',category)
    return category

In [38]:
def generate_query(category):
    '''
    Format an api call for requests
    '''
    query = """
            http://en.wikipedia.org/w/api.php?
            action=query&
            format=json&
            list=categorymembers&
            cmtitle=Category:{}&
            cmprop=ids%7Ctitle%7Ctype&
            cmlimit=max
            """.format(generate_category(category))
    query = re.sub('\s','',query)
    return query

In [39]:
def execute_category_query(category):
    '''
    Executes a category qeury and returns a 
    DataFrame of the category members
    '''
    
    r = requests.get(generate_query(category))
    response = r.json()
    return pd.DataFrame(response['query']['categorymembers'])

In [40]:
dl_query = generate_query("deep learning")
dl_query

'http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:deep+learning&cmprop=ids%7Ctitle%7Ctype&cmlimit=max'

In [41]:
new_data = wikipedia.page('deep learning')

dl_df = execute_category_query("deep learning")
dl_df.head()

Unnamed: 0,ns,pageid,title,type
0,0,32472154,Deep learning,page
1,0,52642349,AIVA,page
2,0,52801963,AlexNet,page
3,0,51545339,Apache SINGA,page
4,0,55075082,BigDL,page


In [47]:
new_content = []

page =  wikipedia.WikipediaPage(pageid = 32472154)
content = page.content
title = page.title
new_content.append((page, content, title))

In [48]:
new_content

[(<WikipediaPage 'Deep learning'>,
  'Deep learning (also known as deep structured learning or hierarchical learning) is part of a broader family of machine learning methods based on learning data representations, as opposed to task-specific algorithms. Learning can be supervised, partially supervised or unsupervised.\nSome representations are loosely based on interpretation of information processing and communication patterns in a biological nervous system, such as neural coding that attempts to define a relationship between various stimuli and associated neuronal responses in the brain. Research attempts to create efficient systems to learn these representations from large-scale, unlabeled data sets.\nDeep learning architectures such as deep neural networks, deep belief networks and recurrent neural networks have been applied to fields including computer vision, speech recognition, natural language processing, audio recognition, social network filtering, machine translation and bioin

In [49]:
X_new_df = pd.DataFrame(new_content, columns= ("pageid","content","title"))

X_new_df.head()

Unnamed: 0,pageid,content,title
0,<WikipediaPage 'Deep learning'>,Deep learning (also known as deep structured l...,Deep learning


In [50]:
X_new = X_new_df['content'].apply(lambda x: x.lower())

In [51]:
rf_gs.predict(X_new)


array(['Machine_Learning'], dtype=object)

In [52]:
new_data_as = wikipedia.page('Accounting software')

as_df = execute_category_query("Accounting software")
as_df.head()

Unnamed: 0,ns,pageid,title,type
0,0,13277642,Accounting software,page
1,0,2405553,Comparison of accounting software,page
2,0,13229015,2Clix Software,page
3,0,1272412,Accounting information system,page
4,0,9016834,Accounting intelligence,page


In [53]:
new_content_as = []

page =  wikipedia.WikipediaPage(pageid = 13277642)
content = page.content
title = page.title
new_content_as.append((page, content, title))

In [54]:
X_as_df = pd.DataFrame(new_content_as, columns= ("pageid","content","title"))

X_as_df.head()

Unnamed: 0,pageid,content,title
0,<WikipediaPage 'Accounting software'>,Accounting software describes a type of applic...,Accounting software


In [55]:
X_as = X_as_df['content'].apply(lambda x: x.lower())


In [56]:
rf_gs.predict(X_as)


array(['Business_Software'], dtype=object)

In [57]:
page = wikipedia.page('deep learning').url
page

'https://en.wikipedia.org/wiki/Deep_learning'

In [58]:
new_url = page


In [59]:
new_url_df_1 = X_new_df


In [60]:
new_url_df_1["new_url"] = new_url


In [61]:
new_url_df_1


Unnamed: 0,pageid,content,title,new_url
0,<WikipediaPage 'Deep learning'>,Deep learning (also known as deep structured l...,Deep learning,https://en.wikipedia.org/wiki/Deep_learning


In [62]:
x_url_1 = new_url_df_1["new_url"].apply(lambda x: x.lower())


In [63]:
x_url_1


0    https://en.wikipedia.org/wiki/deep_learning
Name: new_url, dtype: object

In [64]:
rf_gs.predict(x_url_1)


array(['Business_Software'], dtype=object)