In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import math
import nltk
import string
import collections
import numpy as np
import pandas as pd
from textblob import TextBlob
from natsort import natsorted

### Reading and Preprocessing Dataset:

In [None]:
import zipfile
with zipfile.ZipFile('/content/drive/MyDrive/Humor,Hist,Media,Food.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

In [None]:
file_names = natsorted(os.listdir('/content/Humor,Hist,Media,Food'))
data = []

for file_name in file_names:
    path = '/content/Humor,Hist,Media,Food/'+file_name
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        temp = {}
        temp['name'] = file_name
        temp['text'] = f.read()
        data.append(temp)

df = pd.DataFrame(data)
df.head()

Unnamed: 0,name,text
0,1st_aid.txt,HERBALHERB1ST AIDCALENDULACOMFREYREMEDIESSICKM...
1,a-team,From uunet!cs.utexas.edu!usc!ucsd!ucbvax!CAE.W...
2,a_fish_c.apo,From: murph@buscard.fidonet.org (Brian Murphy)...
3,a_tv_t-p.com,____________________________________________\n...
4,abbott.txt,\n Abbott & Coste...


In [None]:
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import re
import numpy as np
from tqdm import tqdm
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

cachedStopWords = stopwords.words("english")

In [None]:
# References: https://williamscott701.medium.com/information-retrieval-unigram-postings-and-positional-postings-a28b907c4e8
def convert_lower_case(data):
    return np.char.lower(data)

def remove_punctuation(data):
    symbols = """˛şË›ÃºÅŸ§ż±ŕőíä°üß!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_stop_words(data):
    words = word_tokenize(str(data))
    res = ' '.join([word for word in words if word not in cachedStopWords])
    return np.char.strip(res)

def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + lemmatizer.lemmatize(w)
    return np.char.strip(new_text)

def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_stop_words(data)
    data = lemmatization(data)
    return data

In [None]:
def clean_text(data_df):
    for index, row in tqdm(data_df.iterrows(), total=data_df.shape[0]):
        sample = row['text']
        data_df.loc[index, 'text'] = str(preprocess(sample))
    return data_df

df = clean_text(df.copy())
df.head()

100%|██████████| 1133/1133 [00:30<00:00, 37.67it/s]


Unnamed: 0,name,text
0,1st_aid.txt,herbalherb1st aidcalendulacomfreyremediessickm...
1,a-team,uunet c utexas edu usc ucsd ucbvax cae wisc ed...
2,a_fish_c.apo,murph buscard fidonet org brian murphy subject...
3,a_tv_t-p.com,survey result computer use fan alt tv twin pea...
4,abbott.txt,abbott costello first abbott well costello goi...


## First Part:

In [None]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

def get_top_5(query, df):
    all_docs = [TextBlob(i) for i in df['text']]

    cleaned_query = TextBlob(str(preprocess(query)))

    scores = []
    for i in tqdm(all_docs, total=len(all_docs)):
        score = jaccard(list(i.words), list(cleaned_query.words))
        scores.append(score)
    
    temp = pd.DataFrame()
    temp['doc'] = df['name']
    temp['score'] = scores

    return temp.sort_values(by=['score'], ascending=False).reset_index(drop=True).head(5)

In [None]:
query = 'Effective against some common internal parasites if anyone has any additions or corrections'

get_top_5(query, df)

100%|██████████| 1133/1133 [00:09<00:00, 113.70it/s]


Unnamed: 0,doc,score
0,flowchrt,0.034483
1,flowchrt.txt,0.017857
2,1st_aid.txt,0.017467
3,popmach,0.010204
4,odd_to.obs,0.009967


## Second Part:

In [None]:
def tf(word, counter):
    return counter[word] / len(counter)

def tf_binary(word, counter):
    if counter[word]>0:
        return 1
    return 0

def tf_rawcount(word, counter):
    return counter[word]

def tf_lognorm(word, counter):
    return math.log(1+counter[word])

def tf_doublenorm(word, counter, max_count):
    return 0.5+(0.5*counter[word])/max_count

def idf(word, postings):
    return math.log(len(postings) / (1 + postings[word]))

def tfidf(word, counter, postings, operation, max_count=None):
    if operation=='binary':
        return tf_binary(word, counter) * idf(word, postings)
    elif operation=='raw count':
        return tf_rawcount(word, counter) * idf(word, postings)
    elif operation=='term frequency':
        return tf(word, counter) * idf(word, postings)
    elif operation=='log normalization':
        return tf_lognorm(word, counter) * idf(word, postings)
    elif operation=='double normalization':
        return tf_doublenorm(word, counter, max_count) * idf(word, postings)
    return 0.0

def get_corpus(df):
    all_text = TextBlob(' '.join(df['text']))
    counter=collections.Counter(list(all_text.words))
    
    index = 0
    for key, value in counter.items():
        counter[key] = index
        index+=1
    
    return counter

def get_posting(df):
    postings = {}

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        tokens = word_tokenize(str(row['text']))
        for token in tokens:
            if token in postings:
                postings[token] += 1
            else:
                postings[token] = 0
    
    return postings

In [None]:
all_text = [TextBlob(i) for i in df['text']]

print('Generating Posting List:')
postings = get_posting(df) 

print('Building Corpus:')
corpus = get_corpus(df)

Generating Posting List:


100%|██████████| 1133/1133 [00:06<00:00, 163.77it/s]


Building Corpus:


In [None]:
vectors1 = []
print('Generating Vectors:')
operation = 'binary'
for i, blob in tqdm(enumerate(all_text), total=len(all_text)):
    vec = np.zeros((len(corpus),))
    counter=collections.Counter(list(blob.words))

    for word in blob.words:
        vec[corpus[word]] = tfidf(word, counter, postings, operation) 

    vectors1.append(vec)

Generating Vectors:


100%|██████████| 1133/1133 [00:14<00:00, 76.71it/s] 


In [None]:
vectors2 = []
print('Generating Vectors:')
operation = 'raw count'
for i, blob in tqdm(enumerate(all_text), total=len(all_text)):
    vec = np.zeros((len(corpus),))
    counter=collections.Counter(list(blob.words))

    for word in blob.words:
        vec[corpus[word]] = tfidf(word, counter, postings, operation) 

    vectors2.append(vec)

Generating Vectors:


100%|██████████| 1133/1133 [00:04<00:00, 260.87it/s]


In [None]:
vectors3 = []
print('Generating Vectors:')
operation = 'term frequency'
for i, blob in tqdm(enumerate(all_text), total=len(all_text)):
    vec = np.zeros((len(corpus),))
    counter=collections.Counter(list(blob.words))

    for word in blob.words:
        vec[corpus[word]] = tfidf(word, counter, postings, operation) 

    vectors3.append(vec)

Generating Vectors:


100%|██████████| 1133/1133 [00:04<00:00, 238.29it/s]


In [None]:
vectors4 = []
print('Generating Vectors:')
operation = 'log normalization'
for i, blob in tqdm(enumerate(all_text), total=len(all_text)):
    vec = np.zeros((len(corpus),))
    counter=collections.Counter(list(blob.words))

    for word in blob.words:
        vec[corpus[word]] = tfidf(word, counter, postings, operation) 

    vectors4.append(vec)

Generating Vectors:


100%|██████████| 1133/1133 [00:04<00:00, 227.74it/s]


In [None]:
vectors5 = []
print('Generating Vectors:')
operation = 'double normalization'
for i, blob in tqdm(enumerate(all_text), total=len(all_text)):
    vec = np.zeros((len(corpus),))
    
    counter=collections.Counter(list(blob.words))
    max_count = sorted(counter.items(), key=lambda x: x[1], reverse=True)[0][1]

    for word in blob.words:
        vec[corpus[word]] = tfidf(word, counter, postings, operation, max_count) 

    vectors5.append(vec)

Generating Vectors:


100%|██████████| 1133/1133 [00:08<00:00, 126.39it/s]


In [None]:
query = 'Effective against some common internal parasites if anyone has any additions or corrections'
cleaned_query = TextBlob(str(preprocess(query)))

result = pd.DataFrame()
result['file'] = df['name']
result['binary'] = 0
result['raw count'] = 0
result['term frequency'] = 0
result['log normalization'] = 0
result['double normalization'] = 0


for i in list(cleaned_query.words):
    result['binary']+=(np.array(vectors1)[:, corpus[i]])

for i in list(cleaned_query.words):
    result['raw count']+=(np.array(vectors2)[:, corpus[i]])

for i in list(cleaned_query.words):
    result['term frequency']+=(np.array(vectors3)[:, corpus[i]])

for i in list(cleaned_query.words):
    result['log normalization']+=(np.array(vectors4)[:, corpus[i]])

for i in list(cleaned_query.words):
    result['double normalization']+=(np.array(vectors5)[:, corpus[i]])

In [None]:
result.head()

Unnamed: 0,file,binary,raw count,term frequency,log normalization,double normalization
0,1st_aid.txt,27.156528,65.518911,0.431045,31.906132,19.038173
1,a-team,18.082741,18.082741,0.009297,12.534001,9.161922
2,a_fish_c.apo,0.0,0.0,0.0,0.0,0.0
3,a_tv_t-p.com,12.00074,12.00074,0.015749,8.318279,6.100376
4,abbott.txt,0.0,0.0,0.0,0.0,0.0


In [None]:
top_k = 10
scheme = 'binary'
result.sort_values(scheme, ascending=False)[['file', scheme]].head(top_k)

Unnamed: 0,file,binary
501,humor9.txt,31.606085
663,mlverb.hum,31.587736
0,1st_aid.txt,27.156528
349,epikarat.txt,25.633924
9,acronym.lis,25.16392
174,byfb.txt,25.001579
228,coffee.faq,25.001579
252,consp.txt,23.839139
173,bw.txt,23.839139
449,hack7.txt,23.369135


In [None]:
scheme = 'raw count'
result.sort_values(scheme, ascending=False)[['file', scheme]].head(top_k)

Unnamed: 0,file,raw count
663,mlverb.hum,141.212007
11,acronyms.txt,124.772955
817,practica.txt,113.339979
630,manners.txt,113.017486
175,c0dez.txt,104.795726
173,bw.txt,85.101122
483,hop.faq,79.578766
450,hackingcracking.txt,75.459351
39,anime.lif,67.439132
37,anim_lif.txt,67.439132


In [None]:
scheme = 'term frequency'
result.sort_values(scheme, ascending=False)[['file', scheme]].head(top_k)

Unnamed: 0,file,term frequency
0,1st_aid.txt,0.431045
389,flowchrt,0.273309
390,flowchrt.txt,0.149078
413,gas.txt,0.10615
36,aniherb.txt,0.095148
171,bw-phwan.hat,0.081993
723,nukewar.jok,0.077815
1006,temphell.jok,0.075313
591,lifeinfo.hum,0.07499
646,memo.hum,0.074663


In [None]:
scheme = 'log normalization'
result.sort_values(scheme, ascending=False)[['file', scheme]].head(top_k)

Unnamed: 0,file,log normalization
663,mlverb.hum,39.010196
817,practica.txt,34.105551
173,bw.txt,33.011395
501,humor9.txt,32.779866
0,1st_aid.txt,31.906132
175,c0dez.txt,30.643494
174,byfb.txt,30.61026
630,manners.txt,29.302768
449,hack7.txt,28.849454
450,hackingcracking.txt,28.048332


In [None]:
scheme = 'double normalization'
result.sort_values(scheme, ascending=False)[['file', scheme]].head(top_k)

Unnamed: 0,file,double normalization
0,1st_aid.txt,19.038173
663,mlverb.hum,16.152274
501,humor9.txt,16.027808
9,acronym.lis,13.949391
349,epikarat.txt,12.847406
173,bw.txt,12.806039
174,byfb.txt,12.793575
228,coffee.faq,12.634303
252,consp.txt,12.121596
273,cultmov.faq,11.96335
