In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the raw dataset from a CSV file in the working directory.
data = pd.read_csv("lsh_data.csv")
# Preview the first three rows as the cell output.
data.head(3)

In [None]:
# Summary statistics of the loaded frame.
data.describe()

In [None]:
# How many rows carry each value of the 'category' column.
data['category'].value_counts()

- The data is a collection of news items across various categories
- This is a supervised learning problem
- X -> text
- Y -> category

In [None]:
# Display every row that has at least one missing value.
data[data.isna().any(axis="columns")]

 - The rows with missing values (the last 10 data points) are held out for testing

In [None]:
# Test set: the rows that have a missing value in at least one column.
test_data = data[data.isna().any(axis="columns")]

In [None]:
# Training set: only the rows with no missing values. The original row index
# is kept, so it may not be a contiguous 0..n-1 range.
train_data = data.dropna()

In [None]:
# Report the size of each split and preview its first two rows.
# The label and the count are separated by ": " so the number is readable.
print("Train Data")
print("Number of data points: "+str(len(train_data)))
print(train_data.head(2))
print("\n")
print("Test Data")
print("Number of data points: "+str(len(test_data)))
print(test_data.head(2))

## Vectorizing Data

In [None]:
# Features are the raw text; the target is the category label.
x_train = train_data['text']
y_train = train_data['category']
# The test rows contribute text only.
x_test = test_data['text']

In [None]:
# TF-IDF over 2- and 3-grams, ignoring n-grams seen in fewer than 10 documents
# and keeping at most 4000 features.
vectorizer = TfidfVectorizer(ngram_range=(2,3),min_df=10,max_features=4000)
# Fit on the training text and vectorise it in a single pass, so X is only
# ever bound to the TF-IDF matrix (never to the vectorizer object itself).
X = vectorizer.fit_transform(x_train)
# Encode the test text with the vocabulary learned from the training text.
test = vectorizer.transform(x_test)

In [None]:
# Sanity check: both matrices must have the same number of columns (features).
print(X.shape)
print(test.shape)

In [None]:
# Fixed seed so the same random hyperplanes are drawn on every run.
np.random.seed(0)
# Five random hyperplanes, one per bit of the hash signature. Their width is
# taken from the TF-IDF matrix rather than hard-coded: the vectorizer may yield
# fewer features than its max_features cap, and a width mismatch would be
# silently truncated by the zip-based dot product.
hyperplanes = np.random.normal(0,1,(5,X.shape[1]))

In [None]:
# Shared LSH state, filled by to_hashtable() and LSH():
#   hash_combo - the distinct hash signatures seen so far
#   hash_value - for each signature (same position), the row ids in its bucket
#   hash_table - [signature, row ids] pairs
hash_combo, hash_value, hash_table = [], [], []

def compute_dot(x,y):
    """Return the dot product of two sequences.

    Elements are paired with ``zip``, so when the inputs differ in length the
    trailing elements of the longer one are ignored. Two empty inputs give 0.
    """
    return sum(a*b for a,b in zip(x,y))

def to_hashtable(hashh,X_i):
    """Record row id ``X_i`` in the bucket belonging to signature ``hashh``.

    Works on the module-level lists ``hash_combo`` (signatures) and
    ``hash_value`` (row-id lists, aligned by position). A signature that has
    not been seen before gets a new, empty bucket first.
    """
    already_known = hashh in hash_combo
    if not already_known:
        hash_combo.append(hashh)
        hash_value.append([])
    # Append the row id to every bucket whose signature equals hashh.
    for slot,combo in enumerate(hash_combo):
        if combo == hashh:
            hash_value[slot].append(X_i)

def compute_len(x):
    """Return the Euclidean length of ``x``: the square root of the sum of
    its squared elements."""
    squared_total = sum(v*v for v in x)
    return np.sqrt(squared_total)

In [None]:
def LSH(X):
    """Build the LSH index for every row of ``X``.

    Each row gets a signature with one entry per hyperplane: 1 when the row's
    dot product with the plane is >= 0, otherwise -1. Rows with the same
    signature are stored in the same bucket.

    The module-level lists ``hash_combo``, ``hash_value`` and ``hash_table``
    are emptied and refilled in place, so calling this again (for example by
    re-running the cell) rebuilds the index instead of appending duplicate
    row ids and duplicate buckets.

    Parameters
    ----------
    X : sparse matrix
        Must expose ``shape`` and row indexing whose result has ``toarray()``.

    Returns
    -------
    None
        The result is left in ``hash_table`` as ``[signature, row ids]`` pairs.
    """
    # Reset in place so existing references to these lists stay valid.
    hash_combo.clear()
    hash_value.clear()
    hash_table.clear()

    # shape[0] gives the row count without densifying the whole matrix.
    for X_i in range(X.shape[0]):
        row = X[X_i].toarray()[0]  # densify once per row, not once per plane
        hashh = [1 if compute_dot(row,plane)>=0 else -1 for plane in hyperplanes]
        to_hashtable(hashh,X_i)

    for combo,bucket in zip(hash_combo,hash_value):
        hash_table.append([combo,bucket])

In [None]:
# Build the hash table from the training TF-IDF matrix.
LSH(X)

In [None]:
def compute_knn(query,k):
    """Return the ``k`` training rows most similar to ``query`` in its bucket.

    The query is hashed with the module-level ``hyperplanes``; only the rows
    stored under the same signature in ``hash_table`` are compared, using
    cosine similarity against the module-level matrix ``X``.

    Parameters
    ----------
    query : sequence of numbers
        Dense feature vector of the query document.
    k : int
        Maximum number of neighbours to return.

    Returns
    -------
    list of (row index, cosine similarity) tuples
        Sorted by decreasing similarity, at most ``k`` long. Empty when no
        bucket exists for the query's signature.
    """
    hashh = [1 if compute_dot(query,plane)>=0 else -1 for plane in hyperplanes]

    # Default to no candidates: a signature that never occurred while
    # indexing must not leave pointsInHash unbound.
    pointsInHash = []
    for signature,bucket in hash_table:
        if signature == hashh:
            pointsInHash = bucket
            break

    x_ = compute_len(query)  # the query norm does not change per candidate
    dist = {}
    for n in pointsInHash:
        # Densify only the candidate row instead of the entire matrix.
        row = X[n].toarray()[0]
        denom = x_*compute_len(row)
        # An all-zero vector has no direction; score it 0 so a NaN cannot
        # scramble the sort below.
        dist[n] = compute_dot(query,row)/denom if denom else 0.0
    dist = sorted(dist.items(), key=lambda x: x[1], reverse=True)
    return dist[0:k]

In [None]:
def maj_vote(knn, labels=None):
    """Return the most frequent label among the given neighbours.

    Parameters
    ----------
    knn : list of (row index, similarity) tuples
        Neighbours as returned by ``compute_knn``; only the row index is used.
    labels : pandas Series or sequence, optional
        Labels aligned by position with the training rows. Defaults to the
        module-level ``y_train``.

    Returns
    -------
    tuple
        ``(label, count)`` of the winning label. On a tie the label met first
        in ``knn`` wins. ``(None, 0)`` when ``knn`` is empty.
    """
    if labels is None:
        labels = y_train
    # Neighbour ids are row positions in the training matrix. Plain [] on a
    # Series looks up by index label, which is wrong (or a KeyError) once rows
    # have been dropped, so use positional access when it is available.
    by_position = getattr(labels, "iloc", labels)

    frequency = {}
    for neighbour in knn:
        cat = by_position[neighbour[0]]
        frequency[cat] = frequency.get(cat, 0) + 1

    if not frequency:
        return (None, 0)
    # max() returns the first maximal item, i.e. the earliest-seen label.
    return max(frequency.items(), key=lambda x: x[1])

In [None]:
# Number of neighbours that take part in each majority vote.
k=10
# Kept so the name exists before the prediction cell runs; predict() no longer
# appends to it and returns a fresh list instead.
classification=[]
def predict(test):
    """Classify every row of ``test`` by majority vote over its neighbours.

    Parameters
    ----------
    test : sparse matrix
        Query vectors, one per row; must expose ``toarray()``.

    Returns
    -------
    list
        One ``maj_vote`` result per row, in row order. A new list is built on
        every call, so repeated calls (for example re-running the cell) do not
        accumulate earlier predictions.

    The neighbour count is read from the module-level ``k`` at call time.
    """
    # Densify the test matrix once rather than on every loop iteration.
    return [maj_vote(compute_knn(row,k)) for row in test.toarray()]

In [None]:
# Classify every test document and print the result returned by predict().
classification = predict(test)
print(classification)