# Similarity search

In [35]:
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

### Read preprocessed text

In [157]:
df = pd.read_csv("cleaned_df.csv")
df = df[['text_clean', "text_no_point", 'Unnamed: 0_x', 'Unnamed: 0', 'article_name', 'count']]
df.dropna(inplace=True)
df = df.sort_values(by=['count'], ascending=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 915 entries, 0 to 914
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text_clean     915 non-null    object
 1   text_no_point  915 non-null    object
 2   Unnamed: 0_x   915 non-null    int64 
 3   Unnamed: 0     915 non-null    int64 
 4   article_name   915 non-null    object
 5   count          915 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 50.0+ KB


In [176]:
# Read labelled test df
#df.to_csv("merge.csv")
df = pd.read_csv("merge.csv", sep=";")

In [177]:
train = df[:60].copy()
test =  df[61:].copy()

In [178]:
train.tail(5)

Unnamed: 0.1,index,text_clean,text_no_point,Unnamed: 0_x,Unnamed: 0,article_name,count,label
55,74,STEENWIJK. Mei. Coöp. Veilingsver. ..Steenwij...,STEENWIJK Mei Co Veilingsver Steenwijk en Bone...,102357,52,DDD_010537383_0070_articletext.xml,3,False
56,75,N.SCHARWOUDE. Nov. — Groente kg ien — grote...,SCHARWOUDE Nov Groente kg ien grote drielingen...,27757,53,DDD_110585212_0127_articletext.xml,3,False
57,76,NAALDWIJK. Dcc — Alicanten f .—.. Handappelen...,NAALDWIJK Dcc Alicanten Handappelen Stoofperen...,42450,54,DDD_110585221_0265_articletext.xml,3,False
58,77,Welk percentage van de steenkool in gas wordt ...,Welk percentage van de steenkool in gas wordt ...,123980,55,DDD_010612554_0068_articletext.xml,3,True
59,85,f Stel voor dat een stk Steenkool van een ki...,Stel voor dat een stk Steenkool van een kilo i...,126729,62,DDD_010862579_0048_articletext.xml,3,True


### TF-IDF similarity

In [36]:
stop_words = nltk.corpus.stopwords.words('dutch')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    #filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

norm_corpus = normalize_corpus(list(df['text_clean']))
len(norm_corpus)

915

In [210]:
# Spliting into X & y
#X = train.iloc[:, 2].values # to get th
X = train["text_no_point"].values # to get th

# Convert label to numeric
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.label.values)

In [211]:
# Building a TF IDF matrix out of the corpus of reviews
from sklearn.feature_extraction.text import TfidfVectorizer
td = TfidfVectorizer(max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 1),  use_idf=1, smooth_idf=1, sublinear_tf=1)
X = td.fit_transform(X).toarray()

# Splitting into training & test subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,
                                                    random_state = 0)

# Training the classifier & predicting on test data
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict
y_pred = classifier.predict(X_test)

In [212]:
# Classification metrics
from sklearn.metrics import accuracy_score, classification_report
classification_report = classification_report(y_test, y_pred)

print('\n Accuracy: ', accuracy_score(y_test, y_pred))
print('\nClassification Report')
print('======================================================')
print('\n', classification_report)


 Accuracy:  1.0

Classification Report

               precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         6

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18



In [213]:
T = test.iloc[:,1].values
X_pred_test = td.transform(T).toarray()
y_test = classifier.predict(X_pred_test)

y_test

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,

## Fasttext

In [218]:
df_ft = train.copy()

In [220]:
df_ft.head()
# Convert label to numeric
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.label.values)
df_ft["label"] = y

In [228]:
df_ft.head()

Unnamed: 0.1,index,text_clean,text_no_point,Unnamed: 0_x,Unnamed: 0,article_name,count,label
0,0,AMSTERDAM. Nov. — Coörj. Tir.dersveilingverg....,AMSTERDAM Nov Co rj Tir dersveilingverg Amster...,40995,0,DDD_110585201_0106_articletext.xml,22,__label__0
1,1,AMSTERDAM Nov. — Groente. tCoöp. Tindersveili...,AMSTERDAM Nov Groente tCo Tindersveilingverg A...,27756,1,DDD_110585212_0127_articletext.xml,19,__label__0
2,2,GROENTENVEILING LEEUWARDEN Jni. Andijvie B—l ...,GROENTENVEILING LEEUWARDEN Jni Andijvie et per...,122267,2,DDD_010612675_0107_articletext.xml,13,__label__0
3,3,Aoiang er mijnen bestaan is het mijngas de gro...,Aoiang er mijnen bestaan is het mijngas de gro...,36288,3,DDD_010417712_0100_articletext.xml,13,__label__1
4,4,NAALDWIJK Dec. — Groente Alleanten .—.. handa...,NAALDWIJK Dec Groente Alleanten handappelen an...,90947,4,DDD_110585219_0073_articletext.xml,12,__label__0


In [230]:
df_ft['label']=['__label__'+str(s) for s in df_ft['label']]

Unnamed: 0.1,index,text_clean,text_no_point,Unnamed: 0_x,Unnamed: 0,article_name,count,label
0,0,AMSTERDAM. Nov. — Coörj. Tir.dersveilingverg....,AMSTERDAM Nov Co rj Tir dersveilingverg Amster...,40995,0,DDD_110585201_0106_articletext.xml,22,__label____label__0
1,1,AMSTERDAM Nov. — Groente. tCoöp. Tindersveili...,AMSTERDAM Nov Groente tCo Tindersveilingverg A...,27756,1,DDD_110585212_0127_articletext.xml,19,__label____label__0
2,2,GROENTENVEILING LEEUWARDEN Jni. Andijvie B—l ...,GROENTENVEILING LEEUWARDEN Jni Andijvie et per...,122267,2,DDD_010612675_0107_articletext.xml,13,__label____label__0
3,3,Aoiang er mijnen bestaan is het mijngas de gro...,Aoiang er mijnen bestaan is het mijngas de gro...,36288,3,DDD_010417712_0100_articletext.xml,13,__label____label__1
4,4,NAALDWIJK Dec. — Groente Alleanten .—.. handa...,NAALDWIJK Dec Groente Alleanten handappelen an...,90947,4,DDD_110585219_0073_articletext.xml,12,__label____label__0


In [236]:
dff = df_ft[["label", "text_no_point"]]

In [240]:
import csv
dff.to_csv(r'merge.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

In [261]:
import fasttext

model = fasttext.train_supervised(input="merge.txt", lr=0.001, epoch=500, wordNgrams=5, bucket=200000, dim=200, loss='hs')

In [298]:
#model.predict(test.iloc[:,1].values)
test.iloc[7,1]#.values

'Ernstig ongelk bij frites bakken te Gelle. — Bij de bakker H. Bollen te Gelle was men dezer dagen bezig op een Petrolemstel frites te bakken. Het petrolemstel kantelde en de petrolem verlengde zich met het vet. Een felle vlam sloeg hoog op jist toen de vrow van de heer Bollen en haar schoonzster toeschoten om het gevaar te bezweren. De bieren van beide vrowen vatten vlam. Mevrow Bollen had de tegenwoordigheid van geest onmiddellijk in een biten de honing geplaats vat met water te springen. De schoonzster bekwam ernstiger brandwonden. De twee vrowen zijn naar het ziekenhis Calvariënberg te Maastricht overgebracht. Hn toestand is thans bevredigend.'

In [299]:
t=[test.iloc[7,1]]
tt = td.transform(t).toarray()
classifier.predict(tt)

array([1])