# Reading the data from the CSV file

In [None]:
# Mount Google Drive at the relative path "gdrive/" so the CSV stored on Drive can be read below.
# The google.colab package is supplied by the Colab runtime; skip this cell when the data is local.
from google.colab import drive
drive.mount("gdrive/")

Drive already mounted at gdrive/; to attempt to forcibly remount, call drive.mount("gdrive/", force_remount=True).


In [None]:
# Load the assignment CSV from the mounted Drive folder into a DataFrame and display it.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like a saved
# row index being read back as data — confirm before relying on it.
import pandas as pd
data_path = "gdrive/My Drive/MY_DB/"
df = pd.read_csv(data_path + "lsh_assignment_data.csv")
df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,,cars pull down us retail figures us retail sal...
2221,,kilroy unveils immigration policy ex-chatshow ...
2222,,rem announce new glasgow concert us band rem h...
2223,,how political squabbles snowball it s become c...


In [None]:
# Data overview: number of documents per category.
# value_counts() returns a Series sorted by descending frequency, so the first
# entry is the most common category. NaN is excluded by default, which means the
# unlabeled query rows are not counted here.
df["category"].value_counts() 


sport            509
business         508
politics         415
tech             399
entertainment    384
Name: category, dtype: int64

# Creating Train and Test Datasets
Note that the labels for the test data are not present in the dataset, so they appear as NaN.

In [None]:
# The final 10 rows of the CSV are the unlabeled query points; every row before
# them carries a label. Split positionally: the labeled rows become train_data
# and the last 10 rows become test_data.
train_data, test_data = df.iloc[:-10], df.iloc[-10:]

In [None]:
# Labeled training rows: the "category" column holds the class label of each document.
train_data

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2210,politics,teens know little of politics teenagers ques...
2211,entertainment,lopez misses uk charity premiere jennifer lope...
2212,business,christmas shoppers flock to tills shops all ov...
2213,tech,progress on new internet domains by early 2005...


In [None]:
# Unlabeled query rows: their "category" values are NaN and are what predictLabels() must predict.
test_data

Unnamed: 0,category,text
2215,,junk e-mails on relentless rise spam traffic i...
2216,,top stars join us tsunami tv show brad pitt r...
2217,,rings of steel combat net attacks gambling is ...
2218,,davies favours gloucester future wales hooker ...
2219,,beijingers fume over parking fees choking traf...
2220,,cars pull down us retail figures us retail sal...
2221,,kilroy unveils immigration policy ex-chatshow ...
2222,,rem announce new glasgow concert us band rem h...
2223,,how political squabbles snowball it s become c...
2224,,souness delight at euro progress boss graeme s...


# Custom Implementation

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import defaultdict
from numpy import linalg as la

def predictLabels(test_data, train_df=None, num_hyperplanes=5, k=11, seed=0):
    """Predict a category for every query document with LSH-bucketed k-NN.

    Documents are vectorized with TF-IDF (bigrams and trigrams, at most 4000
    features, minimum document frequency 10). Each vector is hashed into a
    bucket by the side of `num_hyperplanes` random hyperplanes it falls on.
    A query is compared, by cosine similarity, only against the training
    documents in its own bucket, and the majority label among the `k` most
    similar ones is returned.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Query documents; must have a "text" column.
    train_df : pandas.DataFrame, optional
        Labeled documents with "text" and "category" columns. Defaults to the
        notebook-level `train_data` frame.
    num_hyperplanes : int, default 5
        Number of random hyperplanes, i.e. bits in the bucket id.
    k : int, default 11
        Number of nearest neighbours that vote.
    seed : int, default 0
        Seed for the hyperplane generator.

    Returns
    -------
    list
        One predicted label per row of `test_data`, in row order.
    """
    if train_df is None:
        train_df = train_data                                   # notebook-level training frame

    tfidf = TfidfVectorizer(ngram_range=(2, 3), max_features=4000, min_df=10)
    train_vectors = tfidf.fit_transform(train_df["text"])
    # Read labels positionally: neighbour ids are row positions in train_vectors,
    # not index labels of the frame.
    train_labels = train_df["category"].to_numpy()
    train_norms = _row_norms(train_vectors)

    # A local RandomState yields the same numbers as np.random.seed(seed) followed
    # by np.random.normal(...), without reseeding the global generator.
    rng = np.random.RandomState(seed)
    hyperplanes = rng.normal(0, 1, (num_hyperplanes, train_vectors.shape[1]))
    power_of_two = 1 << np.arange(num_hyperplanes - 1, -1, step=-1)   # bit weights, MSB first

    # Bucket id -> positions of the training rows hashed into that bucket.
    table = defaultdict(list)
    train_buckets = np.asarray(find_dist(train_vectors, hyperplanes, power_of_two)).ravel()
    for row, bucket_id in enumerate(train_buckets):
        table[int(bucket_id)].append(row)

    Xq = tfidf.transform(test_data["text"])
    query_buckets = np.asarray(find_dist(Xq, hyperplanes, power_of_two)).ravel()
    query_norms = _row_norms(Xq)
    all_rows = np.arange(train_vectors.shape[0])

    predicted_labels = []
    for i, bucket_id in enumerate(query_buckets):
        # .get() avoids inserting empty buckets into the table as a side effect.
        locality = np.asarray(table.get(int(bucket_id), ()), dtype=int)
        if locality.size == 0:
            # No training point shares this bucket: search every training row
            # instead of failing on an empty vote.
            locality = all_rows

        # Cosine similarity of the query against every candidate in one product.
        dots = train_vectors[locality].dot(Xq[i].T).toarray().ravel()
        denom = train_norms[locality] * query_norms[i]
        # All-zero vectors (no vocabulary n-gram present) get similarity 0 rather
        # than NaN; NaN would sort to the front after the reversal below.
        cos_sim = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)

        order = np.argsort(cos_sim)[::-1]                       # most similar first
        neighbors = locality[order[:k]]
        predictions = list(train_labels[neighbors])
        # Iterate the ordered list (not a set) so ties go to the label that
        # appears earliest, i.e. the closest neighbour, and are reproducible.
        label = max(predictions, key=predictions.count)
        predicted_labels.append(label)
    return predicted_labels


def _row_norms(matrix):
    """Return the Euclidean norm of each row of a scipy sparse matrix as a 1-D array."""
    return np.sqrt(np.asarray(matrix.multiply(matrix).sum(axis=1)).ravel())

def find_dist(vector, hyperplanes, power_of_two):
    """Hash each row of `vector` to an integer bucket id.

    Despite the name, no distance is returned. Each row is projected onto every
    hyperplane normal; a bit is 1 when the projection is <= 0 (on or behind the
    hyperplane) and 0 when it is positive. The bits are then combined into one
    integer per row by a dot product with the weights in `power_of_two`.
    """
    projections = vector.dot(hyperplanes.T)
    on_non_positive_side = projections <= 0
    return on_non_positive_side.dot(power_of_two)

In [83]:
###########################################
## GRADER CELL: Do NOT Change this.
# This cell prints "Success" if predictLabels() labels at least 8 of the
# 10 query points correctly. Otherwise it prints "Failed" with both label arrays.
###########################################
import numpy as np

# Predict labels for the query points with predictLabels()
Y_custom = np.array(predictLabels(test_data))

# Reference grader array - DO NOT MODIFY IT
Y_grader = np.array(['tech', 'entertainment', 'tech', 'sport', 'business', 'business', 'politics', 'entertainment', 'politics', 'sport'])

# Accuracy in percent: each matching label is worth 10 points, which assumes exactly 10 query points.
accuracy=np.sum(Y_grader==Y_custom)*10
# Pass when the accuracy is at least 80%.
# NOTE(review): "\Y" in the strings below is not a valid escape sequence, so the
# backslash is printed literally and newer Python versions warn about it. "\n" was
# probably intended; left as-is because this cell must not be modified.
if accuracy>=80:
  print("******** Success ********","Accuracy Achieved",accuracy,'%')
else:
  print("####### Failed #######")
  print("\Y_grader = \n\n", Y_grader)
  print("\n","*"*50)
  print("\Y_custom = \n\n", Y_custom)


******** Success ******** Accuracy Achieved 90 %
