In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the dataset from a CSV file in the working directory and preview
# the first three rows.
data = pd.read_csv("lsh_data.csv")
data.head(3)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...


In [4]:
# Per-column summary statistics.
data.describe()

Unnamed: 0,category,text
count,2215,2225
unique,5,2126
top,sport,singer s film to show at festival a documentar...
freq,509,2


In [5]:
# Number of rows in each category (rows with a missing category are not counted).
data['category'].value_counts()

sport            509
business         508
politics         415
tech             399
entertainment    384
Name: category, dtype: int64

- The data is a collection of news articles spread across five categories.
- This is a supervised learning problem:
- X -> text
- Y -> category

In [6]:
# Show every row that has a missing value in at least one column.
data[pd.isnull(data).any(axis=1)]

Unnamed: 0,category,text
2215,,junk e-mails on relentless rise spam traffic i...
2216,,top stars join us tsunami tv show brad pitt r...
2217,,rings of steel combat net attacks gambling is ...
2218,,davies favours gloucester future wales hooker ...
2219,,beijingers fume over parking fees choking traf...
2220,,cars pull down us retail figures us retail sal...
2221,,kilroy unveils immigration policy ex-chatshow ...
2222,,rem announce new glasgow concert us band rem h...
2223,,how political squabbles snowball it s become c...
2224,,souness delight at euro progress boss graeme s...


 - The last 10 rows have no category label, so they are held out as the test set.

In [7]:
# Held-out rows: those with a missing value in any column.
test_data = data.loc[data.isnull().any(axis=1)]

In [8]:
# Training rows: drop every row that has a missing value in any column.
train_data = data.dropna(axis=0, how='any')

In [9]:
# Report the size of each split followed by a two-row preview.
# A ": " separator keeps the label and the count from running together.
print("Train Data")
print("Number of data points: "+str(len(train_data)))
print(train_data.head(2))
print("\n")
print("Test Data")
print("Number of data points: "+str(len(test_data)))
print(test_data.head(2))

Train Data
Number of data points2215
   category                                               text
0      tech  tv future in the hands of viewers with home th...
1  business  worldcom boss  left books alone  former worldc...


Test Data
Number of data points10
     category                                               text
2215      NaN  junk e-mails on relentless rise spam traffic i...
2216      NaN  top stars join us tsunami tv show brad pitt  r...


## Vectorizing Data

In [42]:
# Features are the raw text; labels come from the training rows only.
x_train, y_train = train_data['text'], train_data['category']
x_test = test_data['text']

In [43]:
# TF-IDF over 2- and 3-word phrases (no unigrams) that occur in at least
# 10 training documents, keeping at most 4000 features.
vectorizer = TfidfVectorizer(ngram_range=(2,3),min_df=10,max_features=4000)
# Learn the vocabulary from the training text and vectorize it in one step.
# fit() returns the vectorizer itself, so binding its result to X was
# misleading even though the next line overwrote it.
X = vectorizer.fit_transform(x_train)
# Held-out rows are transformed with the vocabulary learned from training.
test = vectorizer.transform(x_test)

In [44]:
# Shapes of the training and test matrices, one per line.
for matrix in (X, test):
    print(matrix.shape)

(2215, 4000)
(10, 4000)


In [49]:
# Fixed seed so the same hyperplanes, and therefore the same buckets, are
# produced on every run.
np.random.seed(0)
# One random hyperplane per signature entry.
N_HYPERPLANES = 5
# The width is read from X rather than hard-coded: max_features is only an
# upper bound on the vocabulary size, and compute_dot zips its arguments, so
# a width mismatch would be truncated silently instead of raising.
hyperplanes = np.random.normal(0,1,(N_HYPERPLANES,X.shape[1]))

In [50]:
# Shared LSH state, filled in by to_hashtable() and LSH():
#   hash_combo[i] - the i-th distinct signature seen so far
#   hash_value[i] - row indices stored under hash_combo[i]
#   hash_table    - [signature, row_indices] pairs assembled by LSH()
hash_combo, hash_value, hash_table = [], [], []

def compute_dot(x,y):
    """Return the dot product of two numeric sequences.

    Elements are paired with ``zip``, so when the lengths differ the extra
    trailing elements of the longer sequence are ignored.
    """
    return sum(a * b for a, b in zip(x, y))

def to_hashtable(hashh,X_i):
    """Store row index ``X_i`` under the bucket whose signature is ``hashh``.

    ``hash_combo`` holds the distinct signatures seen so far and
    ``hash_value`` holds, at the same position, the row indices filed under
    that signature. Both module-level lists are mutated in place; nothing
    is returned.
    """
    # First time this signature shows up: open a new, empty bucket for it.
    if hashh not in hash_combo:
        hash_combo.append(hashh)
        hash_value.append([])
    # File the row under every stored signature equal to hashh.
    for position, signature in enumerate(hash_combo):
        if signature == hashh:
            hash_value[position].append(X_i)

def compute_len(x):
    """Return the Euclidean (L2) norm of the numeric sequence ``x``."""
    squared_total = sum(component * component for component in x)
    return np.sqrt(squared_total)

In [51]:
def LSH(X):
    """Bucket every row of ``X`` by its random-hyperplane signature.

    For each row, the sign of its dot product with each row of the
    module-level ``hyperplanes`` gives one signature entry (1 when the
    product is >= 0, otherwise -1). Row indices are grouped by signature
    through ``to_hashtable``, and the module-level ``hash_table`` is then
    filled with ``[signature, row_indices]`` pairs.

    Parameters
    ----------
    X : sparse matrix
        Must support ``X.shape`` and ``X[i].toarray()``.

    Returns
    -------
    None
        Results are written to ``hash_combo``, ``hash_value`` and
        ``hash_table``.
    """
    # Start from empty tables so that re-running the cell rebuilds the index
    # instead of filing every row (and every bucket) a second time.
    del hash_combo[:]
    del hash_value[:]
    del hash_table[:]

    for X_i in range(X.shape[0]):
        # Densify the row once rather than once per hyperplane.
        row = X[X_i].toarray()[0]
        hashh = [1 if compute_dot(row, plane) >= 0 else -1 for plane in hyperplanes]
        to_hashtable(hashh, X_i)

    # Pair each signature with the rows that share it.
    for signature, members in zip(hash_combo, hash_value):
        hash_table.append([signature, members])

In [52]:
# Build the LSH index over the training vectors; this fills the module-level
# hash_combo, hash_value and hash_table lists.
LSH(X)

In [53]:
def compute_knn(query,k):
    """Return the ``k`` indexed rows most cosine-similar to ``query``.

    The query is hashed against the module-level ``hyperplanes`` with the
    same sign rule as ``LSH``; only rows stored in the matching
    ``hash_table`` bucket are scored against it.

    Parameters
    ----------
    query : sequence of numbers
        Dense feature vector for one document.
    k : int
        Maximum number of neighbours to return.

    Returns
    -------
    list of (int, float)
        ``(row_index, cosine_similarity)`` pairs, most similar first. The
        list is empty when no indexed row shares the query's signature.
    """
    hashh = [1 if compute_dot(query, plane) >= 0 else -1 for plane in hyperplanes]

    # Default to an empty bucket: a signature that no indexed row produced
    # previously left this name unbound and crashed the loop below.
    pointsInHash = []
    for signature, members in hash_table:
        if hashh == signature:
            pointsInHash = members

    # The query norm does not depend on the candidate, so compute it once.
    query_len = compute_len(query)
    similarity = {}
    for n in pointsInHash:
        # Densify only the candidate row, not the whole matrix.
        candidate = X[n].toarray()[0]
        denom = query_len * compute_len(candidate)
        if denom == 0:
            # An all-zero vector has no direction; score it as dissimilar
            # instead of dividing by zero.
            similarity[n] = 0.0
        else:
            similarity[n] = compute_dot(query, candidate) / denom
    ranked = sorted(similarity.items(), key=lambda item: item[1], reverse=True)
    return ranked[0:k]

In [54]:
def maj_vote(knn):
    """Return the most common training label among the neighbours in ``knn``.

    Parameters
    ----------
    knn : list of (int, float)
        ``(row_position, similarity)`` pairs as returned by ``compute_knn``.

    Returns
    -------
    tuple
        ``(label, votes)`` for the winning label. Ties go to the label that
        appears first in ``knn``. ``(None, 0)`` when ``knn`` is empty.
    """
    if not knn:
        return (None, 0)
    frequency = {}
    for neighbour in knn:
        # Neighbour ids are row positions in the training matrix, so the
        # label is looked up by position (.iloc) rather than by index label.
        label = y_train.iloc[neighbour[0]]
        frequency[label] = frequency.get(label, 0) + 1
    return sorted(frequency.items(), key=lambda x: x[1], reverse=True)[0]

In [59]:
# Number of neighbours consulted for each query.
k=10
# Kept for backward compatibility; predict() no longer appends to it.
classification=[]
def predict(test):
    """Classify every row of ``test`` by majority vote among its neighbours.

    Parameters
    ----------
    test : sparse matrix
        Must support ``test.shape`` and ``test[i].toarray()``.

    Returns
    -------
    list
        One ``maj_vote`` result per row of ``test``, in row order.
    """
    # Build a fresh list on every call: appending to the module-level
    # ``classification`` made results pile up each time the cell was re-run.
    results = []
    for row in range(test.shape[0]):
        # Densify only the current row, not the whole matrix.
        query = test[row].toarray()[0]
        results.append(maj_vote(compute_knn(query, k)))
    return results

In [60]:
# Classify the held-out rows; each entry is the (label, votes) pair returned
# by maj_vote for that row.
classification = predict(test)
print(classification)

[('tech', 7), ('entertainment', 6), ('tech', 9), ('tech', 6), ('business', 4), ('business', 8), ('politics', 4), ('entertainment', 6), ('politics', 6), ('sport', 7)]
