
    Performs small-scale tests on the dataset using different representations for the vectorizations and 
    different implementation of the Jaccard distances.
    1. Default CountVectorizer() and default "metric='jaccard'" in knn classifier.
    2. One-hot CountVectorizer(binary=True) and custom jaccard metric acting on one-hot vectors.
    3. Vocabulary indexing and custom jaccard metric acting on those vectors.
    4. TFIDF vectorizer and default "metric='jaccard'".
    


In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tensorflow import keras

In [22]:
len_train_test = 1000
split = 0.1

df = pd.read_csv(r"/home/vkalekis/Documents/bigdata/dataframes/df1/train_concat.csv", encoding="utf-8", nrows=len_train_test)
df.head

<bound method NDFrame.head of          Id                                        Content_Pre  Target
0    227464  netflix come cabl box amazon groceri overlord ...       0
1    244074  pharrel iranian presid react tehran happi arre...       0
2     60707  wildlif servic seek comment fish wildlif servi...       1
3     27883  facebook team story launch fb newswir natur so...       1
4    169596  caesar plan us mln new york casino caesar plan...       2
..      ...                                                ...     ...
995  253918  miley cyru maserati quattroport stolen miley c...       0
996  261459  charg paul simon edi brickel drop two month ch...       0
997  148745  rpt lenovo aim sell mln smartphon tablet come ...       2
998  177032  settlement big bank send right messag settleme...       2
999  134656  wall street stock close mostli higher earn app...       2

[1000 rows x 3 columns]>

In [23]:
X = df.iloc[:, -2].values
y = df.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(
                                    df.iloc[:, -2].values, 
                                    df.iloc[:, -1].values, 
                                    test_size=split,
                                    random_state=100)

In [24]:
print(f"Shapes: {X_train.shape} {y_train.shape} {X_test.shape} {y_test.shape}")

Shapes: (900,) (900,) (100,) (100,)


In [25]:
# 1-------------------------------------------------------------
# Count Vectorizer - Generates vocabulary of words in train set.
# Creates vectors for each entry in train+test set 
# which contain the count of each word in the vocabulary in the entry.

vectorizer = CountVectorizer()

X_train_cv1 = vectorizer.fit_transform(X_train).toarray()
X_test_cv1 = vectorizer.transform(X_test).toarray()

print(X_train_cv1.shape)
print(X_test_cv1.shape)

print(X_train_cv1[0])

(900, 17219)
(100, 17219)
[0 0 0 ... 0 0 0]


In [26]:
# 2-------------------------------------------------------------
# One-hot Count Vectorizer - Generates vocabulary of words in train set.
# Creates one hot representation of each vector in train+test set.

vectorizer = CountVectorizer(binary=True)

X_train_cv2 = vectorizer.fit_transform(X_train).toarray()
X_test_cv2 = vectorizer.transform(X_test).toarray()

print(X_train_cv2.shape)
print(X_test_cv2.shape)

print(X_train_cv2[0])

(900, 17219)
(100, 17219)
[0 0 0 ... 0 0 0]


In [27]:
# 3-----------------------------------------------------------
# Vocabulary indexing 
# Each word in every document the train+test set is given
# its index in the vocabulary of the train set.
# We use the keras vectorizer in order to fit only on the train set and 
# not in both, which would be the case if we used the sklearn one.

tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_temp = tokenizer.texts_to_sequences(X_train)
X_test_temp = tokenizer.texts_to_sequences(X_test)

pad1 = len(max(X_train_temp, key=len))
pad2 = len(max(X_test_temp, key=len))

maxpad = max(pad1, pad2)

X_train_voc = keras.preprocessing.sequence.pad_sequences(X_train_temp, padding='post', maxlen=maxpad)
X_test_voc = keras.preprocessing.sequence.pad_sequences(X_test_temp, padding='post', maxlen=maxpad)


print(X_train_voc.shape)
print(len(X_train_voc))
print(len(X_test_voc))

print(X_train_voc[0])

(900, 2084)
900
100
[2590   20   14 ...    0    0    0]


In [28]:
# 4-------------------------------------------------
# Tfidf Vectorizer
# Generates term frequency inverse document frequency for each word 
# in every entry in train and test set.

vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = vectorizer.transform(X_test).toarray()

print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

print(X_train_tfidf[0])

(900, 17219)
(100, 17219)
[0. 0. 0. ... 0. 0. 0.]


In [29]:
def jaccard_binary(x,y):
    """
        Jaccard Similarity between two binary vectors (used in one-hot vectorization)
    """
    intersection = np.logical_and(x, y)
    union = np.logical_or(x, y)
    similarity = intersection.sum() / float(union.sum())
    return similarity

In [30]:
def jaccard_similarity(list1, list2):
    """
        Jaccard similarity between two vectors.
    """
    set1 = set(list1)
    set2 = set(list2)
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)

In [31]:
def jaccard_distance_binary(x, y):
    """
        Jaccard distance using binary Jaccard similarity.
    """
    return 1-jaccard_binary(x,y)

In [32]:
def jaccard_distance(x, y):
    """
        Jaccard distance using Jaccard similarity.
    """
    return 1-jaccard_similarity(x, y)

In [33]:
def runKnn(choice, X_train, y_train, X_test, y_test):
    error = []

    t1 = time.time()

    if choice == 0:
        # Default Jaccard distance.
        knn = KNeighborsClassifier(n_jobs=1, n_neighbors=5, metric='jaccard')
    elif choice == 1:
        # Custom Jaccard distance for binary / one-hot vectors.
        knn = KNeighborsClassifier(n_jobs=1, n_neighbors=5, metric=jaccard_distance_binary)
    elif choice == 2:
        # Custom Jaccard distance.
        knn = KNeighborsClassifier(n_jobs=1, n_neighbors=5, metric=jaccard_distance)

    knn.fit(X_train, y_train)
    predicted = knn.predict(X_test)
    acc = metrics.accuracy_score(y_test, predicted)

    t2 = time.time()
    
    return acc, (t2-t1)/60.0, predicted

In [34]:
# Test 1 - Vectorization using CountVectorizer and default Jaccard distance.
acc_1, time_1, pred_1 = runKnn(0, X_train_cv1, y_train, X_test_cv1, y_test)




In [35]:
# Test 2 - One-hot Vectorization using CountVectorizer and custom Jaccard binary distance.
acc_2, time_2, pred_2 = runKnn(1, X_train_cv2, y_train, X_test_cv2, y_test)


In [36]:
# Test 3 - Vocabulary indexing and custom Jaccard distance.
acc_3, time_3, pred_3 = runKnn(2, X_train_voc, y_train, X_test_voc, y_test)

In [37]:
# Test 4 - Tfidf representation and default Jaccard distance.
acc_4, time_4, pred_4 = runKnn(0, X_train_tfidf, y_train, 
                                   X_test_tfidf, y_test)



In [38]:
print(f"{acc_1}   {time_1:.6f}")
print(f"{acc_2}   {time_2:.6f}")
print(f"{acc_3}   {time_3:.6f}")
print(f"{acc_4}   {time_4:.6f}")

0.81   0.008332
0.81   0.102978
0.82   0.510911
0.81   0.008144
