In [None]:
# Mount Google Drive at /content/drive so files stored under MyDrive can be read
# (google.colab is only importable inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile
# Unpack the 20_newsgroups archive from Drive into /content/; the `with` block
# closes the archive handle once extraction finishes.
with zipfile.ZipFile('/content/drive/MyDrive/20_newsgroups.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

In [None]:
import glob
import os


def load_newsgroup_posts(root_dir, categories):
    """Read every post stored under the selected newsgroup folders.

    Args:
        root_dir: Directory whose immediate sub-folders are named after newsgroups.
        categories: Iterable of folder names to keep; other folders are skipped.

    Returns:
        A list of ``{'class': <folder name>, 'text': <file contents>}`` dicts,
        ordered by folder name and then by file path.
    """
    wanted = set(categories)
    records = []
    # Sorted listings make the row order (and any seeded split built on top of
    # it) reproducible; plain glob order depends on the filesystem.
    for group_dir in sorted(glob.glob(os.path.join(root_dir, '*'))):
        # Derive the class from the folder name instead of slicing the path at a
        # fixed character offset, which silently breaks if root_dir changes.
        group = os.path.basename(group_dir)
        if group not in wanted:
            continue
        for post_path in sorted(glob.glob(os.path.join(group_dir, '*'))):
            # Undecodable bytes are dropped rather than raising.
            with open(post_path, 'r', encoding='utf-8', errors='ignore') as f:
                records.append({'class': group, 'text': f.read()})
    return records


req = ['comp.graphics', 'sci.med', 'talk.politics.misc', 'rec.sport.hockey', 'sci.space']
data = load_newsgroup_posts('/content/20_newsgroups', req)

len(data)

5000

In [None]:
import pandas as pd

# One row per loaded post; the dict keys 'class' and 'text' become the columns.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,class,text
0,talk.politics.misc,Xref: cantaloupe.srv.cs.cmu.edu alt.conspiracy...
1,talk.politics.misc,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....
2,talk.politics.misc,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....
3,talk.politics.misc,Xref: cantaloupe.srv.cs.cmu.edu alt.sex:111580...
4,talk.politics.misc,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....


In [None]:
# Sanity check: number of posts per newsgroup.
df['class'].value_counts()

talk.politics.misc    1000
sci.space             1000
rec.sport.hockey      1000
comp.graphics         1000
sci.med               1000
Name: class, dtype: int64

In [None]:
import nltk

# Fetch the NLTK resources the preprocessing helpers rely on: the stop-word
# list, WordNet (for the lemmatizer) and punkt (for word_tokenize).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import re
import math
import collections
import numpy as np
from tqdm import tqdm
from nltk import pos_tag
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Keep the stop words in a frozenset: they are only used for `in` membership
# tests, which are a linear scan on the list that stopwords.words() returns.
cachedStopWords = frozenset(stopwords.words("english"))

In [None]:
# References: https://williamscott701.medium.com/information-retrieval-unigram-postings-and-positional-postings-a28b907c4e8
def convert_lower_case(data):
    """Lower-case ``data`` with ``np.char.lower``.

    Returns the NumPy string array produced by ``np.char.lower`` (a 0-d array
    when a single string is passed).
    """
    lowered = np.char.lower(data)
    return lowered

def remove_punctuation(data):
    """Replace every character listed in ``symbols`` with a single space.

    The symbols are processed one at a time, in order; after each one, a single
    pass collapses double spaces into one. The result is a NumPy string array,
    as returned by ``np.char.replace``.
    """
    symbols = """˛şË›ÃºÅŸ§ż±ŕőíä°üß!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
    cleaned = data
    for symbol in symbols:
        cleaned = np.char.replace(cleaned, symbol, ' ')
        cleaned = np.char.replace(cleaned, "  ", " ")
    # ',' is already part of `symbols`, so this final pass finds nothing to
    # remove; it is kept so the function performs the same steps as before.
    return np.char.replace(cleaned, ',', '')

def remove_stop_words(data):
    """Tokenize ``data`` and drop every token found in ``cachedStopWords``.

    The surviving tokens are re-joined with single spaces and returned through
    ``np.char.strip`` (i.e. as a NumPy string array).
    """
    kept = []
    for token in word_tokenize(str(data)):
        if token not in cachedStopWords:
            kept.append(token)
    return np.char.strip(' '.join(kept))

def lemmatization(data):
    """Lemmatize each token of ``data`` with NLTK's ``WordNetLemmatizer``.

    Tokens come from ``word_tokenize``; the lemmas are joined with single
    spaces and the stripped result is returned as a NumPy string array.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(str(data))]
    return np.char.strip(" ".join(lemmas))

def preprocess(data):
    """Apply the text-cleaning steps in a fixed order.

    Order: lower-casing, punctuation removal, stop-word removal, lemmatization.
    Each step receives the previous step's output.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,
        remove_stop_words,
        lemmatization,
    )
    for step in pipeline:
        data = step(data)
    return data

In [None]:
def clean_text(data_df):
    """Run ``preprocess`` over every entry of the ``text`` column.

    Args:
        data_df: DataFrame with a ``text`` column. It is modified in place, so
            pass a copy if the original must be preserved.

    Returns:
        The same DataFrame object, with ``text`` replaced by the cleaned strings.
    """
    raw_texts = data_df['text']
    # Build the cleaned column in one pass and assign it positionally. Writing
    # row by row through ``.loc[index, 'text']`` is label-based, so a frame with
    # repeated index labels would have several rows overwritten by one result,
    # and it is also much slower than a single column assignment.
    data_df['text'] = [
        str(preprocess(sample))
        for sample in tqdm(raw_texts, total=data_df.shape[0])
    ]
    return data_df

# clean_text writes into the frame it receives, so hand it a copy; `df` is then
# rebound to the cleaned copy.
df = clean_text(df.copy())
df.head()

100%|██████████| 5000/5000 [00:38<00:00, 130.79it/s]


Unnamed: 0,class,text
0,talk.politics.misc,xref cantaloupe srv c cmu edu alt conspiracy 2...
1,talk.politics.misc,xref cantaloupe srv c cmu edu talk politics mi...
2,talk.politics.misc,xref cantaloupe srv c cmu edu talk politics mi...
3,talk.politics.misc,xref cantaloupe srv c cmu edu alt sex 111580 s...
4,talk.politics.misc,xref cantaloupe srv c cmu edu talk politics mi...


In [None]:
def tf(word, counter):
    """Return the count of ``word`` divided by the number of keys in ``counter``.

    NOTE(review): the denominator is the number of *distinct* terms
    (``len(counter)``), not the total number of tokens - confirm this is the
    intended normalisation.
    """
    occurrences = counter[word]
    distinct_terms = len(counter)
    return occurrences / distinct_terms

def icf(word, postings):
    """Return ``log(len(postings) / (1 + postings[word]))``.

    Raises ``KeyError`` when ``word`` is not a key of ``postings``.

    NOTE(review): the numerator is the number of keys in ``postings`` (the
    vocabulary size), not the number of classes - confirm against the intended
    inverse-class-frequency definition.
    """
    vocabulary_size = len(postings)
    smoothed_count = 1 + postings[word]
    return math.log(vocabulary_size / smoothed_count)

def tficf(word, blob, postings):
    """Return the product ``tf(word, blob) * icf(word, postings)``.

    ``blob`` is forwarded to ``tf`` as its counter argument and ``postings`` is
    forwarded to ``icf``.
    """
    term_weight = tf(word, blob)
    class_weight = icf(word, postings)
    return term_weight * class_weight

def get_corpus(df):
    """Map every distinct word in ``df['text']`` to a running integer index.

    All texts are joined with spaces and split into words by ``TextBlob``.
    Indices follow the order in which words are first seen, starting at 0.

    The returned object is the ``collections.Counter`` whose counts were
    overwritten with the indices, so looking up a missing word yields 0 rather
    than raising ``KeyError``.
    """
    joined = ' '.join(df['text'])
    counter = collections.Counter(list(TextBlob(joined).words))

    # Only existing keys are reassigned, so the mapping does not change size
    # while it is being iterated.
    for position, term in enumerate(counter):
        counter[term] = position

    return counter

def get_posting(df):
    """Count token occurrences across all rows of ``df['text']``.

    A token's entry starts at 0 on first sight and grows by one for each later
    occurrence, so the stored value is (total occurrences - 1).

    NOTE(review): repeated tokens inside a single row are all counted, so the
    value is a per-row presence count only when each row's tokens are unique -
    confirm which of the two the callers expect.
    """
    postings = {}

    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        for token in word_tokenize(str(row['text'])):
            # -1 default: the first occurrence lands on 0, later ones add 1.
            postings[token] = postings.get(token, -1) + 1

    return postings

In [None]:
# Concatenate the cleaned posts of each class into one space-separated string,
# giving a frame with one row per class.
a = df.groupby(['class'])['text'].apply(' '.join).reset_index()
a

Unnamed: 0,class,text
0,comp.graphics,xref cantaloupe srv c cmu edu comp graphic 389...
1,rec.sport.hockey,newsgroups rec sport hockey path cantaloupe sr...
2,sci.med,path cantaloupe srv c cmu edu crabapple srv c ...
3,sci.space,newsgroups sci space path cantaloupe srv c cmu...
4,talk.politics.misc,xref cantaloupe srv c cmu edu alt conspiracy 2...


In [None]:
print('Generating Posting List:')
# Token counts computed over the class-level frame `a`.
postings = get_posting(a)

Generating Posting List:


100%|██████████| 5/5 [00:05<00:00,  1.10s/it]


In [None]:
classwise_text = [TextBlob(i) for i in a['text']]
k = 25  # number of top-scoring words kept per class

# Union of the k highest-scoring words of every class.
corpus = set()

for blob in tqdm(classwise_text, total=len(classwise_text)):
    counter = collections.Counter(blob.words)
    # Score each distinct word once. Iterating the Counter visits words in
    # first-occurrence order, which matches the insertion order obtained by
    # walking every token, without recomputing the score for each repeat.
    scores = {word: tficf(word, counter, postings) for word in counter}

    # sorted() is stable, so ties keep first-occurrence order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    corpus.update(word for word, _ in ranked[:k])

100%|██████████| 5/5 [00:14<00:00,  2.81s/it]


In [None]:
# Size of the selected vocabulary (at most k words per class, minus overlaps).
len(corpus)

76

In [None]:
vectors = []


def _keep_corpus_words(text):
    """Return the distinct words of ``text`` that are in ``corpus``, space-joined.

    The words are sorted before joining: iterating a set of strings follows
    hash order, which changes between interpreter runs, and that would make the
    word-to-index assignment derived from these strings differ from run to run.
    """
    return ' '.join(sorted(set(TextBlob(text).words).intersection(corpus)))


# NOTE: `classwise_text` is rebound here from TextBlob objects to plain strings.
classwise_text = [_keep_corpus_words(i) for i in a['text']]
all_text = [_keep_corpus_words(i) for i in df['text']]

In [None]:
# get_corpus reads a 'text' column, so wrap the per-document strings in a
# one-column frame before indexing the vocabulary.
temp = pd.DataFrame({'text': all_text})

new_corpus = get_corpus(temp)

In [None]:
# get_posting reads a 'text' column, so wrap the per-class strings in a
# one-column frame before counting tokens.
temp = pd.DataFrame({'text': classwise_text})

new_posting = get_posting(temp)

100%|██████████| 5/5 [00:00<00:00, 793.53it/s]


In [None]:
# Reset the accumulator inside this cell: appending to a list created in an
# earlier cell means re-running this cell alone would duplicate every row.
vectors = []

for doc_text in tqdm(all_text, total=len(all_text)):
    blob = TextBlob(doc_text)
    counter = collections.Counter(blob.words)
    # One slot per word of the indexed vocabulary; words absent from the
    # document keep a weight of 0.
    vec = np.zeros((len(new_corpus),))
    # Each distinct word is scored once; repeats would write the same value.
    for word in counter:
        vec[new_corpus[word]] = tficf(word, counter, new_posting)

    vectors.append(vec)

100%|██████████| 5000/5000 [00:01<00:00, 2716.68it/s]


In [None]:
encode = {'class': {'comp.graphics':0, 'sci.med':1, 'talk.politics.misc':2, 'rec.sport.hockey':3, 'sci.space':4}}
class_ids = encode['class']

# Map names to integer ids and cast explicitly. DataFrame.replace only yields an
# integer column through implicit object->int downcasting, which pandas has
# deprecated. Values missing from the mapping pass through unchanged, so
# re-running the cell on already-encoded labels is harmless, while an unknown
# class name fails loudly in astype instead of lingering as a string.
df = df.assign(**{
    'class': df['class'].map(lambda label: class_ids.get(label, label)).astype('int64')
})
df.head()

Unnamed: 0,class,text
0,2,xref cantaloupe srv c cmu edu alt conspiracy 2...
1,2,xref cantaloupe srv c cmu edu talk politics mi...
2,2,xref cantaloupe srv c cmu edu talk politics mi...
3,2,xref cantaloupe srv c cmu edu alt sex 111580 s...
4,2,xref cantaloupe srv c cmu edu talk politics mi...


In [None]:
def train(X, y, e):
    """Estimate per-class, per-feature Gaussian parameters.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y: Labels aligned with the rows of ``X``.
        e: Constant added to every variance (smoothing that also keeps the
            variances away from zero).

    Returns:
        ``[prob_x, u_mean, y_labels]`` where ``prob_x`` holds the smoothed
        variances and ``u_mean`` the means, both with one row per class in the
        order of ``y_labels`` (the sorted unique labels). No class priors are
        returned.
    """
    # The former per-column np.unique(...) array and the class-prior vector
    # were never used. The per-column one also built a ragged array, which
    # raises ValueError on current NumPy releases, so both were removed.
    y_labels = np.unique(y)
    u_mean = np.array([X[y == label].mean(axis=0) for label in y_labels])
    prob_x = np.array([X[y == label].var(axis=0) + e for label in y_labels])
    return [prob_x, u_mean, y_labels]

def predict(X, prob_x, u_mean, y_labels):
    """Classify every row of ``X`` by calling ``compute_probs`` on it.

    Returns the array produced by ``np.apply_along_axis`` over axis 1, i.e. one
    predicted label per row.
    """
    def classify_row(row):
        return compute_probs(row, prob_x, u_mean, y_labels)

    return np.apply_along_axis(classify_row, 1, X)

def compute_probs(x, prob_x, u_mean, y_labels):
    """Return the label whose Gaussian model gives ``x`` the highest likelihood.

    Classes are compared on log-likelihoods. The raw likelihood is a product of
    one density per feature and underflows to 0.0 once there are many features,
    at which point every class ties and argmax always picks the first one.

    Args:
        x: 1-D feature vector.
        prob_x: Per-class, per-feature variances (one row per class).
        u_mean: Per-class, per-feature means (one row per class).
        y_labels: Labels, ordered like the rows of ``prob_x`` / ``u_mean``.
    """
    log_likelihoods = np.array([
        get_weight(x, class_idx, prob_x, u_mean, log=True)
        for class_idx in range(len(y_labels))
    ])
    return y_labels[np.argmax(log_likelihoods)]

def get_weight(x, y, prob_x, u_mean, log=False):
    """Likelihood of ``x`` under class ``y``'s independent Gaussian features.

    Args:
        x: 1-D feature vector.
        y: Row index of the class inside ``prob_x`` / ``u_mean``.
        prob_x: Per-class, per-feature variances.
        u_mean: Per-class, per-feature means.
        log: When True, return the sum of per-feature log-densities instead of
            the product of densities. Defaults to False, which keeps the
            original return value.

    Returns:
        A scalar: the product of densities, or their log-sum when ``log=True``.
    """
    variance = prob_x[y]
    squared_dev = np.square(x - u_mean[y])
    if log:
        # log N(x; mu, var) = -0.5 * log(2*pi*var) - (x - mu)^2 / (2*var)
        return np.sum(-0.5 * np.log(2.0 * np.pi * variance) - squared_dev / (2.0 * variance))
    c = 1.0 / np.sqrt(2.0 * np.pi * variance)
    return np.prod(c * np.exp(-1.0 * squared_dev / (2.0 * variance)))

### Split - 0.2

In [None]:
# Hold out 20% of the documents, stratified by class, with a fixed seed.
split = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    np.array(vectors),
    df['class'],
    test_size=split,
    random_state=42,
    stratify=df['class'],
)

# train returns [variances, means, labels]; 1e5 is the constant added to every
# variance. The list is unpacked straight into predict.
weights = train(X_train, y_train, 1e5)
preds = predict(X_test, *weights)

print(f'Accuracy: {accuracy_score(y_test, preds)}')
print(classification_report(y_test, preds))

Accuracy: 0.986
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       200
           1       1.00      0.98      0.99       200
           2       0.95      1.00      0.98       200
           3       1.00      1.00      1.00       200
           4       0.99      0.98      0.98       200

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000



  This is separate from the ipykernel package so we can avoid doing imports until


### Split - 0.3

In [None]:
# Hold out 30% of the documents, stratified by class, with a fixed seed.
split = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    np.array(vectors),
    df['class'],
    test_size=split,
    random_state=42,
    stratify=df['class'],
)

# train returns [variances, means, labels]; 1e5 is the constant added to every
# variance. The list is unpacked straight into predict.
weights = train(X_train, y_train, 1e5)
preds = predict(X_test, *weights)

print(f'Accuracy: {accuracy_score(y_test, preds)}')
print(classification_report(y_test, preds))

  This is separate from the ipykernel package so we can avoid doing imports until


Accuracy: 0.9853333333333333
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       300
           1       1.00      0.99      0.99       300
           2       0.96      1.00      0.98       300
           3       1.00      1.00      1.00       300
           4       0.98      0.98      0.98       300

    accuracy                           0.99      1500
   macro avg       0.99      0.99      0.99      1500
weighted avg       0.99      0.99      0.99      1500



### Split - 0.5

In [None]:
# Hold out 50% of the documents, stratified by class, with a fixed seed.
split = 0.5
X_train, X_test, y_train, y_test = train_test_split(
    np.array(vectors),
    df['class'],
    test_size=split,
    random_state=42,
    stratify=df['class'],
)

# train returns [variances, means, labels]; 1e5 is the constant added to every
# variance. The list is unpacked straight into predict.
weights = train(X_train, y_train, 1e5)
preds = predict(X_test, *weights)

print(f'Accuracy: {accuracy_score(y_test, preds)}')
print(classification_report(y_test, preds))

  This is separate from the ipykernel package so we can avoid doing imports until


Accuracy: 0.9848
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       500
           1       1.00      0.98      0.99       500
           2       0.96      1.00      0.98       500
           3       1.00      1.00      1.00       500
           4       0.97      0.97      0.97       500

    accuracy                           0.98      2500
   macro avg       0.99      0.98      0.98      2500
weighted avg       0.99      0.98      0.98      2500

