# IMPORTING THE REQUIRED PACKAGES

In [5]:
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest

# READING THE DATASET

In [7]:
# Load the ICMLA papers CSV into `df`.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell will fail
# on any machine that lacks this exact file location - consider a configurable data dir.
# NOTE(review): the 'unicode_escape' codec also interprets backslash escape sequences
# found in the file contents - confirm that is intended for this dataset.
df = pd.read_csv('E:\\paper_dataset\\ICMLA_2014_2015_2016_2017.csv',encoding= 'unicode_escape')

In [8]:
# Summarise the loaded frame: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 448 entries, 0 to 447
Data columns (total 6 columns):
paper_id    448 non-null int64
title       448 non-null object
keywords    448 non-null object
abstract    448 non-null object
session     448 non-null object
year        448 non-null int64
dtypes: int64(2), object(4)
memory usage: 21.1+ KB


In [9]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,paper_id,title,keywords,abstract,session,year
0,1,Ensemble Statistical and Heuristic Models for ...,"statistical word alignment, ensemble learning,...",Statistical word alignment models need large a...,Ensemble Methods,2014
1,2,Improving Spectral Learning by Using Multiple ...,"representation, spectral learning, discrete fo...",Spectral learning algorithms learn an unknown ...,Ensemble Methods,2014
2,3,Applying Swarm Ensemble Clustering Technique f...,"software defect prediction, particle swarm opt...",Number of defects remaining in a system provid...,Ensemble Methods,2014
3,4,Reducing the Effects of Detrimental Instances,"filtering, label noise, instance weighting",Not all instances in a data set are equally be...,Ensemble Methods,2014
4,5,Concept Drift Awareness in Twitter Streams,"twitter, adaptation models, time-frequency ana...",Learning in non-stationary environments is not...,Ensemble Methods,2014
5,6,High Precision Screening for Android Malware w...,"dimensionality reduction, mobile security, and...",This work presents a new method of classifying...,Applications in Security,2014
6,7,Reducing the Cost of Breaking Audio CAPTCHAs b...,"active learning, semi-supervised learning, aud...",CAPTCHAs are challenge-response tests that are...,Applications in Security,2014
7,8,Q-Learning: From Computer Network Security To ...,"software architecture, machine learning, q-lea...",Reinforcement learning techniques become more ...,Applications in Security,2014
8,9,On-line Signature Verification using Symbolic ...,"signature,discretize,unordered,classification,...",Signatures are the single most widely used met...,Applications in Security,2014
9,10,Detection of abnormal human behavior using a m...,"abnormal event detection, low-rank approximati...",Automatic detection of abnormal events is one ...,Applications in Security,2014


# PREPROCESSING

# 1. Removing punctuation
# 2. Converting all the words into lower case 
# 3. Tokenizing the words

In [10]:
def preprocess(text):
    """Turn raw text into a list of lower-case word tokens.

    Every character that is neither a word character nor whitespace is
    removed (so hyphenated words are fused, e.g. 'small-size' -> 'smallsize'),
    the remainder is lower-cased and then split on runs of whitespace.

    Parameters
    ----------
    text : str
        The text to tokenise.

    Returns
    -------
    list of str
        The tokens in their original order; empty if no word characters remain.
    """
    stripped = re.sub(r'[^\w\s]', '', text)
    return stripped.lower().split()

# EXAMPLE OF HOW SHINGLES ARE FORMED AFTER PREPROCESSING

In [11]:
# Demonstrate `preprocess` on one abstract. The "shingles" used in this notebook are
# single lower-cased words, because `preprocess` splits the cleaned text on whitespace.
sample_text = 'Statistical word alignment models need large amount of training data while they are weak in small-size corpora. This paper proposes a new approach of unsupervised hybrid word alignment technique using ensemble learning method. This algorithm uses three base alignment models in several rounds to generate alignments. The ensemble algorithm uses a weighed scheme for resampling training data and a voting score to consider aggregated alignments. The underlying alignment algorithms used in this study include IBM Model 1, 2 and a heuristic method based on Dice measurement. Our experimental results show that by this approach, the alignment error rate could be improved by at least %15 for the base alignment models.'
print('The shingles (tokens) are:', preprocess(sample_text))

The shingles (tokens) are: ['statistical', 'word', 'alignment', 'models', 'need', 'large', 'amount', 'of', 'training', 'data', 'while', 'they', 'are', 'weak', 'in', 'smallsize', 'corpora', 'this', 'paper', 'proposes', 'a', 'new', 'approach', 'of', 'unsupervised', 'hybrid', 'word', 'alignment', 'technique', 'using', 'ensemble', 'learning', 'method', 'this', 'algorithm', 'uses', 'three', 'base', 'alignment', 'models', 'in', 'several', 'rounds', 'to', 'generate', 'alignments', 'the', 'ensemble', 'algorithm', 'uses', 'a', 'weighed', 'scheme', 'for', 'resampling', 'training', 'data', 'and', 'a', 'voting', 'score', 'to', 'consider', 'aggregated', 'alignments', 'the', 'underlying', 'alignment', 'algorithms', 'used', 'in', 'this', 'study', 'include', 'ibm', 'model', '1', '2', 'and', 'a', 'heuristic', 'method', 'based', 'on', 'dice', 'measurement', 'our', 'experimental', 'results', 'show', 'that', 'by', 'this', 'approach', 'the', 'alignment', 'error', 'rate', 'could', 'be', 'improved', 'by', 'a

# COMBINING THE TITLE, ABSTRACT AND KEYWORDS OF PAPERS

In [12]:
# Build the text that gets hashed for each paper: title, abstract and keywords joined
# by single spaces. This adds the `input_text` column to `df` in place.
df['input_text'] = df['title'] + ' ' + df['abstract'] +  ' ' + df['keywords']

In [26]:
# Show the frame again, now with the combined `input_text` column.
df

Unnamed: 0,paper_id,title,keywords,abstract,session,year,input_text
0,1,Ensemble Statistical and Heuristic Models for ...,"statistical word alignment, ensemble learning,...",Statistical word alignment models need large a...,Ensemble Methods,2014,Ensemble Statistical and Heuristic Models for ...
1,2,Improving Spectral Learning by Using Multiple ...,"representation, spectral learning, discrete fo...",Spectral learning algorithms learn an unknown ...,Ensemble Methods,2014,Improving Spectral Learning by Using Multiple ...
2,3,Applying Swarm Ensemble Clustering Technique f...,"software defect prediction, particle swarm opt...",Number of defects remaining in a system provid...,Ensemble Methods,2014,Applying Swarm Ensemble Clustering Technique f...
3,4,Reducing the Effects of Detrimental Instances,"filtering, label noise, instance weighting",Not all instances in a data set are equally be...,Ensemble Methods,2014,Reducing the Effects of Detrimental Instances ...
4,5,Concept Drift Awareness in Twitter Streams,"twitter, adaptation models, time-frequency ana...",Learning in non-stationary environments is not...,Ensemble Methods,2014,Concept Drift Awareness in Twitter Streams Lea...
5,6,High Precision Screening for Android Malware w...,"dimensionality reduction, mobile security, and...",This work presents a new method of classifying...,Applications in Security,2014,High Precision Screening for Android Malware w...
6,7,Reducing the Cost of Breaking Audio CAPTCHAs b...,"active learning, semi-supervised learning, aud...",CAPTCHAs are challenge-response tests that are...,Applications in Security,2014,Reducing the Cost of Breaking Audio CAPTCHAs b...
7,8,Q-Learning: From Computer Network Security To ...,"software architecture, machine learning, q-lea...",Reinforcement learning techniques become more ...,Applications in Security,2014,Q-Learning: From Computer Network Security To ...
8,9,On-line Signature Verification using Symbolic ...,"signature,discretize,unordered,classification,...",Signatures are the single most widely used met...,Applications in Security,2014,On-line Signature Verification using Symbolic ...
9,10,Detection of abnormal human behavior using a m...,"abnormal event detection, low-rank approximati...",Automatic detection of abnormal events is one ...,Applications in Security,2014,Detection of abnormal human behavior using a m...


# CREATION OF MINHASH_LSH_FOREST IN MODEL

In [14]:
def create_MinHash_LSH_forest(df,permutation):
    """Build and index a MinHash LSH forest over the rows of ``df``.

    Each value of ``df['input_text']`` is tokenised with ``preprocess`` and
    hashed into a MinHash signature. Signatures are added to the forest under
    their 0-based position in the column, so a key returned by a query is a row
    position in ``df``. The elapsed build time is printed before returning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing an ``input_text`` column of strings.
    permutation : int
        ``num_perm`` used for every MinHash and for the forest.

    Returns
    -------
    MinHashLSHForest
        The forest, already indexed and ready to be queried.
    """
    started_at = time.time()

    def signature_of(document):
        # One MinHash per document, fed with the utf-8 bytes of each token.
        sketch = MinHash(num_perm=permutation)
        for token in preprocess(document):
            sketch.update(token.encode('utf8'))
        return sketch

    signatures = [signature_of(document) for document in df['input_text']]

    lsh_forest = MinHashLSHForest(num_perm=permutation, l=10)
    for position, sketch in enumerate(signatures):
        lsh_forest.add(position, sketch)

    # The forest must be indexed after the last add before it can be queried.
    lsh_forest.index()

    print('It took %s seconds to build forest.' %(time.time()-started_at))

    return lsh_forest

# SETTING UP THE RECOMMENDATION (RESULT) 

In [15]:
def recommendation(text, df, permutation, num_results, forest):
    """Return the titles of the papers the forest considers closest to ``text``.

    The query is tokenised with ``preprocess``, hashed into a MinHash signature
    and used to query ``forest``. The keys that come back are treated as row
    positions in ``df`` (``create_MinHash_LSH_forest`` adds each row under its
    enumerate index). The elapsed query time is printed when results are found.

    Parameters
    ----------
    text : str
        Free-text query, e.g. a paper title.
    df : pandas.DataFrame
        Frame with a ``title`` column, in the same row order used to build
        ``forest``.
    permutation : int
        ``num_perm`` for the query MinHash; should match the value the forest
        was built with.
    num_results : int
        Number of results requested from the forest.
    forest : MinHashLSHForest
        An indexed forest to query.

    Returns
    -------
    pandas.Series or None
        The ``title`` values of the matched rows, or ``None`` when the forest
        returns no keys.
    """
    start_time = time.time()

    tokens = preprocess(text)
    # Use the `permutation` argument rather than a notebook-level global, so the
    # function honours what the caller passes and works without that global.
    m = MinHash(num_perm=permutation)
    for s in tokens:
        m.update(s.encode('utf8'))

    id_res = np.array(forest.query(m, num_results))
    if len(id_res) == 0:
        return None # if your query is empty, return none

    # Forest keys are positional row indices, hence iloc rather than loc.
    result = df.iloc[id_res]['title']

    print('It took %s seconds to query forest.' %(time.time()-start_time))

    return result

# MODEL

In [24]:
# Number of MinHash permutations; passed both when building the forest and when querying it.
permutations = 128

In [17]:
# Build the LSH forest over every row's `input_text` (the build time is printed).
forest = create_MinHash_LSH_forest(df, permutations)

It took 1.4979946613311768 seconds to build forest.


# RESULT PREDICTION

In [21]:
# How many results to request from the forest per query.
num_recommendations = 6

In [25]:
# Query the forest with one paper title and print the titles it returns.
# `recommendation` returns None when the forest yields no keys, in which case None is printed.
title = 'Activity Recognition Using Graphical Features'
result = recommendation(title, df, permutations, num_recommendations, forest)
print('\n The Recommendations are \n\n', result)

It took 0.13624930381774902 seconds to query forest.

 The Recommendations are 

 100    Performance Comparison of Major Classical Face...
359    Machine Learning in Appearance-based Robot Sel...
108    Adaptive Fuzzy Prediction for Automotive Appli...
342    Sequential Pattern Based Temporal Contour Repr...
345    DeepPositioning:  Intelligent Fusion of Pervas...
125    Medical Image Classification via SVM Using LBP...
Name: title, dtype: object
