# Creating CTFIDFVectorizer class

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer


class CTFIDFVectorizer(TfidfTransformer):
    """Class-based TF-IDF (c-TF-IDF) transformer.

    ``fit`` learns one weight per term from a count matrix and ``transform``
    applies those weights and L1-normalizes every row.

    The constructor is inherited unchanged from ``TfidfTransformer``: a
    pass-through ``__init__(*args, **kwargs)`` adds nothing, and a varargs
    signature breaks scikit-learn's parameter introspection
    (``get_params`` / ``repr`` / ``clone``).
    """

    def fit(self, X: sp.csr_matrix, n_samples: int):
        """Learn the idf vector (global term weights).

        Parameters
        ----------
        X : sparse count matrix of shape (n_rows, n_features)
            Term counts; column sums are used as the per-term frequency.
        n_samples : int
            Numerator of the idf ratio (the notebook passes the number of
            individual training documents).

        Returns
        -------
        self
        """
        _, n_features = X.shape
        # ravel() keeps the vector 1-D even for a one-term vocabulary,
        # where squeeze() would collapse it to a 0-d array and break sp.diags.
        df = np.asarray(X.sum(axis=0), dtype=np.float64).ravel()

        # Terms with a zero total count would give log(n / 0) = inf and poison
        # transform() with inf/NaN; give them zero weight instead.
        idf = np.zeros(n_features, dtype=np.float64)
        seen = df > 0
        # NOTE(review): a term whose total count exceeds n_samples gets a
        # negative weight here (log of a ratio below 1) -- confirm that is intended.
        idf[seen] = np.log(n_samples / df[seen])

        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=np.float64)
        return self

    def transform(self, X: sp.csr_matrix) -> sp.csr_matrix:
        """Transform a count-based matrix to c-TF-IDF.

        Each column of ``X`` is scaled by the weight learned in ``fit`` and
        every row is then L1-normalized.
        """
        # `@` is always a matrix product; `*` is elementwise for scipy sparse arrays.
        X = X @ self._idf_diag
        # copy=False is safe: the product above is already a fresh matrix.
        X = normalize(X, axis=1, norm='l1', copy=False)
        return X

# Getting data

In [2]:

# Read the labelled training file: each line is "<label>\t<text>".
with open("./train.dat", "r") as f:
    lines = f.readlines()

labels = []
texts = []
for line in lines:
    if not line.strip():
        # Skip blank lines (e.g. a trailing empty line) instead of crashing on int('').
        continue
    # maxsplit=1: any further tab belongs to the document text and must not be dropped.
    raw_label, text = line.split('\t', 1)
    labels.append(int(raw_label) - 1)  # shift the file's labels down by one
    texts.append(text)

# Splitting Data

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled data for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.25,
    random_state=42,
)

# Build one row per class by joining all of that class's training documents with spaces.
docs = pd.DataFrame({'Document': X_train, 'Class': y_train})
docs_per_class = (
    docs
    .groupby(['Class'], as_index=True)
    .agg({'Document': ' '.join})
)

In [4]:
# Display the per-class frame: one row per class label, with that class's documents joined into one string.
docs_per_class

Unnamed: 0_level_0,Document
Class,Unnamed: 1_level_1
0,Endothelin-like immunoreactivity in human brea...
1,Prevention of a false diagnosis of sexually ac...
2,Glucocorticoid-induced muscle atrophy preventi...
3,Threat of unemployment and cardiovascular risk...
4,Standards for analysis of ventricular late pot...


# Transforming data

In [5]:
# Learn the vocabulary from the per-class documents and count terms per class.
class_documents = docs_per_class["Document"]
count_vectorizer = CountVectorizer()
count_vectorizer.fit(class_documents)
count = count_vectorizer.transform(class_documents)

# Fit c-TF-IDF on the class-level counts; n_samples is the number of individual training documents.
ctfidf_vectorizer = CTFIDFVectorizer()
ctfidf_vectorizer.fit(count, n_samples=len(docs))
ctfidf = ctfidf_vectorizer.transform(count)

# Testing the model

In [6]:
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity

# Project the held-out documents into the fitted c-TF-IDF space.
count_test = count_vectorizer.transform(X_test)
vector = ctfidf_vectorizer.transform(count_test)

# Despite its name, `distances` holds cosine similarities (one row per document,
# one column per class); the prediction is the most similar class for each row.
distances = cosine_similarity(vector, ctfidf)
prediction = distances.argmax(axis=1)

print(metrics.classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.66      0.67      0.67       805
           1       0.43      0.53      0.48       381
           2       0.40      0.61      0.48       459
           3       0.62      0.63      0.62       774
           4       0.49      0.35      0.41      1191

    accuracy                           0.53      3610
   macro avg       0.52      0.56      0.53      3610
weighted avg       0.54      0.53      0.53      3610



# Labeling unlabeled data

In [7]:
# Load the unlabeled documents, one per line (trailing newlines are kept).
with open("./test.dat", "r") as test_file:
    txt_test = list(test_file)

In [8]:
# Same pipeline as the evaluation step, applied to the unlabeled documents.
# Note: this rebinds count_test / vector / distances / prediction.
count_test = count_vectorizer.transform(txt_test)
vector = ctfidf_vectorizer.transform(count_test)

# Pick, for every document, the class whose c-TF-IDF vector is most similar.
distances = cosine_similarity(vector, ctfidf)
prediction = distances.argmax(axis=1)

In [9]:
# Number of c-TF-IDF predictions produced for the unlabeled documents.
len(prediction)

14442

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Reference classifier whose predictions are later compared with the c-TF-IDF ones.
N_NEIGHBORS = 37
clf_knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Term counts over every labelled document (the full set, not just the training split).
train_text_count_model = CountVectorizer()
train_text_count_model.fit(texts)
train_text_count = train_text_count_model.transform(texts)

# Standard TF-IDF weighting on top of the raw counts.
train_text_tfidf_model = TfidfTransformer()
train_text_tfidf_model.fit(train_text_count)
train_text_tfidf = train_text_tfidf_model.transform(train_text_count)

# Kept as the final expression so the fitted estimator is displayed as the cell output.
clf_knn.fit(train_text_tfidf, labels)

KNeighborsClassifier(n_neighbors=37)

In [12]:
# Run the unlabeled documents through the fitted count -> TF-IDF models, then classify with KNN.
test_text_count = train_text_count_model.transform(txt_test)
test_text_tfidf = train_text_tfidf_model.transform(test_text_count)
learningmodel_p = clf_knn.predict(test_text_tfidf)

# Comparing the learning model (KNN) and c-TF-IDF

In [13]:
# Neither input is ground truth: the KNN predictions are passed first and the
# c-TF-IDF predictions second, so the report measures agreement between the two models.
agreement_report = metrics.classification_report(learningmodel_p, prediction)
print(agreement_report)

              precision    recall  f1-score   support

           0       0.82      0.68      0.75      3833
           1       0.54      0.80      0.65      1371
           2       0.39      0.84      0.53      1343
           3       0.84      0.65      0.73      4031
           4       0.53      0.43      0.48      3864

    accuracy                           0.63     14442
   macro avg       0.62      0.68      0.63     14442
weighted avg       0.68      0.63      0.64     14442

