In [4]:
# Move up one directory; the `lib` import and the relative `data/` paths below
# presumably rely on this being the project root.
# NOTE(review): not idempotent - re-running this cell moves up another level.
%cd ..

/home/jovyan/GA_DSI/Projects/project_4/semantic_search


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,precision_recall_curve,confusion_matrix 
from sklearn.metrics import (precision_score,accuracy_score,roc_auc_score,roc_curve, 
                             precision_recall_curve,recall_score,make_scorer,auc) 
from collections import Counter
import lib.general_utilities as gu
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [6]:
# Build a predictive model that predicts the wiki category from the page text.

In [7]:
# Load the aggregate page collection dataframe (both categories) from disk.
# NOTE(review): `gu.read_pickle_obj` is a project helper not shown here; if it
# unpickles the file, only load files from a trusted source.
BS_ML_collection_df = gu.read_pickle_obj('data/BS_ML_collection_df.obj')

In [8]:
# The first 1104 pages belong to the Machine Learning category and
# the last 3020 pages belong to the Business Software category.
BS_ML_collection_df.shape

(4124, 5)

In [9]:
# Drop the leftover "index" column. `errors="ignore"` keeps the cell re-runnable:
# the original in-place drop raised KeyError on a second run because the column
# had already been removed.
BS_ML_collection_df = BS_ML_collection_df.drop("index", axis=1, errors="ignore")

In [10]:
# Preview the first rows of the collection.
BS_ML_collection_df.head()

Unnamed: 0,pageid,text,title,category
0,43385931,Data exploration is an approach similar to ini...,Data exploration,machine learning
1,49082762,These datasets are used for machine-learning r...,List of datasets for machine learning research,machine learning
2,233488,Machine learning is a field of computer scienc...,Machine learning,machine learning
3,53587467,The following outline is provided as an overvi...,Outline of machine learning,machine learning
4,3771060,The accuracy paradox for predictive analytics ...,Accuracy paradox,machine learning


In [11]:
# Preview the last rows of the collection.
BS_ML_collection_df.tail()

Unnamed: 0,pageid,text,title,category
4119,27143309,Storyist is a creative writing application for...,Storyist,business software
4120,328705,Taste is a Macintosh word processor that combi...,Taste (software),business software
4121,1577008,Ted is a word processor for the X Window Syste...,Ted (word processor),business software
4122,37628014,The Thorn EMI Liberator was a laptop word proc...,Thorn EMI Liberator,business software
4123,29902828,Word Juggler was a word processor application ...,Word Juggler,business software


In [12]:
# Load the SVD-reduced representation of the wiki corpus; it is used as the
# feature matrix X below.
wiki_svd_matrix = gu.read_pickle_obj('data/SVD_corpus.obj')

In [13]:
# Feature matrix for the classifier.
# NOTE(review): assumes the rows are in the same order as the rows of
# BS_ML_collection_df, since y is built from that dataframe - TODO confirm.
X = wiki_svd_matrix

In [14]:
# Target vector: the string 'category' column must be numerically encoded
# before it can be passed to the classifier.
encoder = LabelEncoder()
y = encoder.fit_transform(BS_ML_collection_df['category'])

In [15]:
# Class balance: 'business software' is encoded as 0 and 'machine learning' as 1.
Counter(y)

Counter({0: 3020, 1: 1104})

In [16]:
final_model_results = []
target_names = ['class BS', 'class ML']

# Hold out 20% of the pages for testing; the fixed seed keeps the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=42)

# Fit the Logistic Regression model. `fit` returns the estimator itself, so
# `lr` and `model` refer to the same fitted object.
model = LogisticRegression()
lr = model.fit(Xtrain, ytrain)

# Mean accuracy on each split, computed once. (The cell previously scored both
# splits twice and threw the first pair of results away.)
train_score = lr.score(Xtrain, ytrain)
test_score = lr.score(Xtest, ytest)
y_pred = lr.predict(Xtest)

# Precision and recall on the held-out pages for the positive label
# (1, i.e. 'machine learning').
recall = recall_score(ytest, y_pred)
precision = precision_score(ytest, y_pred)

output = {'train_acc_score': train_score,
          'test_acc_score': test_score,
          'model_name': 'Logistic Regression'}

final_model_results.append(output)

# Per-class precision / recall / F1 on the held-out pages.
class_report_LogReg_str = classification_report(ytest, y_pred, target_names=target_names)
print("********* Logistic Regression ********\n\n", class_report_LogReg_str)

********* Logistic Regression ********

              precision    recall  f1-score   support

   class BS       0.94      1.00      0.97       586
   class ML       0.99      0.86      0.92       239

avg / total       0.96      0.96      0.96       825



In [17]:
# Show the recorded accuracy scores as a table, one row per model.
pd.DataFrame(final_model_results)

Unnamed: 0,model_name,test_acc_score,train_acc_score
0,Logistic Regression,0.956364,0.959079


### From the precision, recall and accuracy metrics, we see that the logistic regression model performed very well in predicting the wiki category from the input text. 

## **Now let's take any wiki text from the corpus and see if it accurately predicts its category.**

In [19]:
# Load the pickled TF-IDF transformer that was fit on the wiki corpus;
# predict_category uses it to vectorize new query text.
tfidf_vectorizer = gu.read_pickle_obj('data/tfidf_vectorizer_fit_transformed.obj')

In [20]:
# Load the pickled SVD transformer that was fit on the wiki corpus;
# predict_category applies it after the TF-IDF step.
SVD = gu.read_pickle_obj('data/SVD_500_fit_transformed.obj')

In [2]:
# This method predicts the category of the input text, with the input model. 
def predict_category(model, text):
    """Predict the wiki category of a piece of text with the given classifier.

    The text is passed through the notebook-level ``tfidf_vectorizer`` and
    ``SVD`` transformers (TF-IDF followed by SVD, i.e. LSA) and the result is
    handed to ``model.predict``.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted classifier that returns the encoded labels 0 or 1.
    text : str
        Raw text to classify.

    Returns
    -------
    str
        ``"business software"`` for label 0, ``"machine learning"`` for label 1.
    """
    # Encoded label -> category name (category ML is 1 and category BS is 0).
    categories = {0: "business software", 1: "machine learning"}

    # LSA the text by TF-IDF + SVD transforming it. The vectorizer expects a
    # collection of documents, hence the one-element list.
    tf_text = tfidf_vectorizer.transform([text])
    svd_text = SVD.transform(tf_text)

    # `predict` returns one label per input row; a single document was passed
    # in, so take the only entry. Use the `model` argument rather than the
    # global `lr`, so the caller's classifier is the one actually evaluated.
    pred = model.predict(svd_text)[0]

    return categories[pred]
    

In [97]:
# Accumulator for the spot-check results: the loop below appends one dict
# (true category and predicted category) per sampled page.
predict_results = []

In [98]:
# Spot-check the model on every 50th page collected from Wikipedia: predict the
# category from the page text and record it next to the true category.
num_rows = BS_ML_collection_df.category.shape[0]

# Rebuild the list from scratch in this cell so that re-running it does not
# append duplicate rows to results left over from an earlier execution.
predict_results = []

for ind in range(0, num_rows, 50):
    # `ind` is a row position, so index positionally with .iloc rather than by label.
    query_text = BS_ML_collection_df.text.iloc[ind]
    predict_results.append({
        'category': BS_ML_collection_df.category.iloc[ind],
        'predicted_category': predict_category(lr, query_text),
    })

In [101]:
# Tabulate the spot-check results and show the first ten rows.
pred_df = pd.DataFrame(predict_results)
pred_df.head(10)

Unnamed: 0,category,predicted_category
0,machine learning,machine learning
1,machine learning,machine learning
2,machine learning,machine learning
3,machine learning,business software
4,machine learning,business software
5,machine learning,machine learning
6,machine learning,business software
7,machine learning,business software
8,machine learning,machine learning
9,machine learning,machine learning


In [99]:
# Accuracy of the spot check: the share of sampled pages whose predicted
# category equals the true category.
matches = pred_df['category'] == pred_df['predicted_category']
accuracy = matches.sum() / len(pred_df)
accuracy

0.9285714285714286

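### The Logistic Regression model predicts the category of the sampled pages with about 93% accuracy. Note that the sample is drawn from the full collection, most of which was used to train the model, so this is a spot check rather than an independent estimate.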

### When a new article from wikipedia comes along, <br>we would like to be able to predict what category the article should fall into.

In [None]:
# I randomly copied and pasted pieces of text directly from wikipedia, 
# from very embedded subcategory pages, for the Business and Machine
# Learning categories. 

In [None]:
# query text from wikipedia business software nested subcategory to see what my model predicts.

In [108]:
# Query text: a passage about Broadcast Markup Language, used as a
# business-software example.
query_text = '''Broadcast Markup Language, or BML, is an XML-based standard developed by Japan's Association of 
Radio Industries and Businesses as a data broadcasting specification for digital television broadcasting. It is a 
data-transmission service allowing text to be displayed on a 1seg TV screen.

The text contains news, sports, weather forecasts, emergency warnings such as Earthquake Early Warning, etc.
free of charge. It was finalized in 1999, becoming ARIB STD-B24 Data Coding and Transmission Specification for 
Digital Broadcasting.

The STD-B24 specification is derived from an early draft of XHTML 1.0 strict, which it extends and alters. 
Some subset of CSS 1 and 2 is supported, as well as ECMAScript.'''

In [109]:
# Expected: 'business software' (the passage was taken from a business-software subcategory page).
predict_category(lr, query_text)

'business software'

In [116]:
# Query text: a passage about the Cuneiform workflow language.
query_text='''Cuneiform is an open-source workflow language for large-scale scientific data analysis.[1][2] It is
a workflow DSL in the form of a functional programming language promoting parallelizable algorithmic skeletons.
External tools and libraries, in, e.g., R or Python, can be integrated via a foreign function interface. 
Cuneiform's data-driven evaluation model and integration of external software originate in scientific workflow 
languages like Taverna, KNIME, or Galaxy while its algorithmic skeletons (second-order functions) for parallel
execution originate in data-parallel programming models like MapReduce or Pig Latin. Cuneiform is implemented in 
Erlang, and therefore must run on an Erlang Virtual Machine (BEAM) similar to the way Java must run on a JVM 
(Java Virtual Machine). Cuneiform scripts can be executed on top of Hadoop.[3][4][5][6][7]'''

In [117]:
# Classify the Cuneiform passage with the fitted logistic regression model.
predict_category(lr, query_text)

'business software'

In [None]:
# query text from wikipedia machine learning nested subcategory to see what my model predicts.

In [118]:
# Query text: a passage about latent growth modeling, used as a
# machine-learning example.
query_text ='''Latent growth modeling is a statistical technique used in the structural equation modeling (SEM) 
framework to estimate growth trajectory. It is a longitudinal analysis technique to estimate growth over a period 
of time. It is widely used in the field of behavioral science, education and social science. It is also called 
latent growth curve analysis. The latent growth model was derived from theories of SEM. General purpose SEM 
software, such as OpenMx, lavaan (both open source packages based in R), AMOS, Mplus, LISREL, or EQS among others
may be used to estimate the trajectory of growth.

Latent Growth Models [1] [2] [3] [4] represent repeated measures of dependent variables as a function of time and 
other measures. Such longitudinal data share the features that the same subjects are observed repeatedly over time,
and on the same tests (or parallel versions), and at known times. In latent growth modeling, the relative standing 
of an individual at each time is modeled as a function of an underlying growth process, with the best parameter 
values for that growth process being fitted to each individual.'''

In [119]:
# Expected: 'machine learning' (the passage was taken from a machine-learning subcategory page).
predict_category(lr, query_text)

'machine learning'

### As we can see above, the model predicts well for the most part on text that I randomly took from deeply nested Wikipedia subcategory pages.<br> <br> <br>However, as seen below, it mispredicts when I select a smaller amount of text with more general wording from the machine learning category.

In [None]:
# I chose text from a very nested machine learning subcategory. It mispredicted "business software".

In [22]:
# Interactive check: paste any passage and print the predicted category.
# NOTE(review): `input()` blocks until someone types, so this cell cannot run
# unattended (e.g. Restart & Run All) and its result is not reproducible from
# the code alone.
query_text = input("Enter wiki text: ")
print("\n\nPredicted Wiki category: ", predict_category(lr, query_text))

Enter wiki text:  is a biologically constrained theory of machine intelligence originally described in the 2004 book On Intelligence[1] by Jeff Hawkins with Sandra Blakeslee. HTM is based on neuroscience and the physiology and interaction of pyramidal neurons in the neocortex of the human brain. The technology has been tested and implemented in software through example applications from Numenta and commercial applications from Numenta’s partners.


Predicted Wiki category:  business software


### For the business category, it seems to predict accurately. That may be because there was more training data available for this category and it seems to cover a wider spread of topics. 

In [24]:
# Interactive check: paste any passage and print the predicted category.
# NOTE(review): `input()` blocks until someone types, so this cell cannot run
# unattended (e.g. Restart & Run All) and its result is not reproducible from
# the code alone.
query_text = input("Enter wiki text: ")
print("\n\nPredicted Wiki category: ", predict_category(lr, query_text))

Enter wiki text: From 1985 onward, ADP’s annual revenues exceeded the $1 billion mark, with paychecks processed for about 20% of the U.S. workforce. In the 1990s, ADP began acting as a professional employer organization (PEO). Around this time, the company acquired Autonom, a German company and the payroll and human resource services company, GSI, headquartered in Paris.[6] In September 1998, ADP acquired UK-based Chessington Computer Centre that supplied administration services to the UK Government.[7]


Predicted Wiki category:  business software
