In [1]:
import pandas as pd
import math
import operator

In [2]:
class TextClassifier:
    """k-nearest-neighbour text classifier based on cosine similarity of bags of words.

    Typical use: build the classifier for one document and a dataset, call
    ``create_similarity_dic()`` to score every other document, then call
    ``classify()`` to vote among the most similar documents.

    Attributes:
        document: the document to classify; must expose ``doc_id``,
            ``bag_of_words`` and ``create_bag_of_words(dataset)``.
        dataset: object exposing ``df`` (term occurrences) and ``class_df``
            (columns ``doc_id`` and ``class``).
        similarity: dict mapping doc_id -> cosine similarity with ``document``.
        count_classes: dict mapping class label -> votes of the latest vote.
    """

    # Class-wide training set shared by all classifiers; set via define_training_set.
    training_set = None

    def __init__(self, document, dataset):
        self.document = document
        self.dataset = dataset
        self.similarity = {}
        self.count_classes = {'business': 0, 'politics': 0, 'sport': 0, 'technology': 0}

    @classmethod
    def define_training_set(cls, dataset):
        """Store ``dataset`` as the class-wide training set."""
        # Assign on the class; a bare local assignment would be discarded.
        cls.training_set = dataset

    def look_up_cat(self, doc_id):
        """Return the class label of ``doc_id`` from ``dataset.class_df``.

        Raises:
            IndexError: if ``doc_id`` has no row in ``class_df``.
        """
        labels = self.dataset.class_df
        matches = labels.loc[labels['doc_id'] == doc_id, 'class']
        return matches.iloc[0]

    @staticmethod
    def _prompt_for_k():
        """Read k from standard input, re-prompting until it is an integer >= 1."""
        while True:
            try:
                k = int(input())
            except ValueError:
                print("Please enter an integer >= 1")
                continue
            if k >= 1:
                return k
            print("Please enter an integer >= 1")

    def classify(self, weighted=False, k=None):
        """Classify ``self.document`` by voting among its k nearest neighbours.

        Args:
            weighted: reserved for a similarity-weighted vote (not implemented).
            k: number of neighbours; when None it is read from standard input.

        Returns:
            The winning class label.

        Raises:
            NotImplementedError: if ``weighted`` is true.
            ValueError: if ``k`` is below 1 or no similarities were computed.
        """
        if weighted:
            # TODO: add weighted method. Fail loudly instead of returning None.
            raise NotImplementedError("weighted classification is not implemented yet")
        if k is None:
            k = self._prompt_for_k()
        return self.classify_noweight(k)

    def classify_noweight(self, k):
        """Majority vote among the ``k`` most similar documents.

        On a tie the vote is repeated with one neighbour fewer until a single
        class wins; with one neighbour there is always a single winner.
        ``self.count_classes`` holds the counts of the final vote.

        Raises:
            ValueError: if ``k`` < 1 or ``self.similarity`` is empty.
        """
        if k < 1:
            raise ValueError("k must be an integer >= 1")
        ranked = sorted(self.similarity.items(), key=operator.itemgetter(1), reverse=True)
        if not ranked:
            raise ValueError("no similarities available; call create_similarity_dic() first")
        # Never ask for more neighbours than there are scored documents.
        k = min(k, len(ranked))
        neighbour_cats = [self.look_up_cat(doc_id) for doc_id, _ in ranked[:k]]
        while True:
            # Start every vote from zero so earlier calls/attempts do not leak in.
            votes = dict.fromkeys(self.count_classes, 0)
            for cat in neighbour_cats[:k]:
                votes[cat] = votes.get(cat, 0) + 1
            self.count_classes = votes
            highest = max(votes.values())
            winners = [label for label, count in votes.items() if count == highest]
            if len(winners) == 1:
                return winners[0]
            k -= 1  # tie: retry with one neighbour fewer

    def create_similarity_dic(self):
        """Compute the cosine similarity between ``self.document`` and every other document.

        Returns:
            ``self.similarity``, keyed by ``int(doc_id)``.
        """
        self.document.create_bag_of_words(self.dataset)
        for doc_id in self.dataset.class_df['doc_id']:
            if doc_id == self.document.doc_id:
                continue  # ignore entry if it is the same document
            curr_doc = Document(doc_id)
            curr_doc.create_bag_of_words(self.dataset)
            self.similarity[int(doc_id)] = self.calculate_cosine(curr_doc)
        return self.similarity

    def calculate_cosine(self, other_doc):
        """Return the cosine similarity between ``self.document`` and ``other_doc``.

        Both bags of words map term -> occurrence count. Returns 0.0 when
        either bag has zero norm (e.g. is empty).
        """
        own_bag = self.document.bag_of_words
        other_bag = other_doc.bag_of_words
        # Only terms present in both documents contribute to the dot product.
        numerator = sum(count * other_bag[term]
                        for term, count in own_bag.items()
                        if term in other_bag)
        # Norms are taken over the occurrence counts (dict values), not the term ids.
        own_norm = math.sqrt(sum(count ** 2 for count in own_bag.values()))
        other_norm = math.sqrt(sum(count ** 2 for count in other_bag.values()))
        if own_norm == 0 or other_norm == 0:
            return 0.0
        return float(numerator / (own_norm * other_norm))

In [3]:
class TextData:
    """Pair of tables loaded from disk: term occurrences and document labels.

    Attributes:
        df: frame with columns ``doc_id``, ``term_id``, ``nb_occurences``.
        class_df: frame with columns ``doc_id``, ``class``.
    """

    def __init__(self, data_file, class_file):
        """Read both files with pandas.

        Args:
            data_file: space-separated file; its first two lines are skipped
                (presumably a header block -- verify against the data file).
            class_file: comma-separated file of (doc_id, class) rows.
        """
        occurrence_columns = ['doc_id', 'term_id', 'nb_occurences']
        label_columns = ['doc_id', 'class']
        self.df = pd.read_csv(data_file, sep=" ", skiprows=2, names=occurrence_columns)
        self.class_df = pd.read_csv(class_file, names=label_columns)

In [4]:
class Document(TextData):
    """A single document, identified by ``doc_id``, with its bag of words.

    ``TextData.__init__`` is not called: a Document loads no files itself and
    reads its rows from a dataset passed to its methods.

    Attributes:
        doc_id: identifier of the document in the dataset tables.
        bag_of_words: dict mapping term_id -> number of occurrences.
        dataset: last dataset passed to ``create_bag_of_words`` (None until then).
    """

    def __init__(self, doc_id):
        self.doc_id = doc_id
        self.bag_of_words = {}
        self.dataset = None

    def create_bag_of_words(self, dataset):
        '''returns a dictionary of all (term_id, occurrences) of the terms present in the document'''
        # Remember the dataset so get_category() can be called without arguments.
        self.dataset = dataset
        rows = dataset.df.loc[dataset.df['doc_id'] == self.doc_id]
        for term_id, occurrences in zip(rows['term_id'], rows['nb_occurences']):
            self.bag_of_words[term_id] = occurrences
        return self.bag_of_words

    def get_category(self, dataset=None):
        """Return the class label of this document.

        Args:
            dataset: object exposing ``class_df``; defaults to the dataset last
                passed to ``create_bag_of_words``.

        Raises:
            ValueError: if no dataset is given and none was remembered.
            IndexError: if ``doc_id`` has no row in ``class_df``.
        """
        if dataset is None:
            dataset = self.dataset
        if dataset is None:
            raise ValueError("no dataset available; pass one or call create_bag_of_words() first")
        labels = dataset.class_df
        matches = labels.loc[labels['doc_id'] == self.doc_id, 'class']
        return matches.iloc[0]

In [5]:
# Load the term-occurrence table and the label table from the data/ directory.
my_dataset = TextData('data/news_articles.mtx', 'data/news_articles.labels')

In [6]:
# Create a Document for doc_id 500; its bag of words stays empty until
# create_bag_of_words() is called with a dataset.
my_document = Document(500)

In [7]:
# Display the dataset object; TextData defines no __repr__, so the default object repr is shown.
my_dataset

<__main__.TextData at 0x7f94549ad320>

In [10]:
# Shortcut to the term-occurrence frame (columns: doc_id, term_id, nb_occurences).
df = my_dataset.df

In [13]:
def split_in_k_folds(df, k):
    """Split ``df`` into ``k`` contiguous row-wise folds.

    Every fold has ``len(df) // k`` rows, except the last one, which also takes
    the ``len(df) % k`` leftover rows so that no row is dropped.

    NOTE(review): the split is by row position. If ``df`` holds several rows
    per document, one document's rows can straddle two folds -- confirm that
    this is intended.

    Args:
        df: pandas DataFrame to split.
        k: number of folds, an integer >= 1.

    Returns:
        A list of ``k`` DataFrames that together cover ``df`` in order.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be an integer >= 1")
    n_rows = df.shape[0]
    fold_size = n_rows // k
    all_folds = []
    for i in range(k):
        start = fold_size * i
        # The last fold runs to the end so the remainder rows are kept.
        stop = n_rows if i == k - 1 else start + fold_size
        all_folds.append(df.iloc[start:stop])
    return all_folds

In [14]:
# Preview the first rows of the term-occurrence frame.
df.head()

Unnamed: 0,doc_id,term_id,nb_occurences
0,1,4649,1
1,1,2638,1
2,1,3785,1
3,1,621,1
4,1,3621,1


In [15]:
# Split the term-occurrence rows into 10 contiguous folds.
folds = split_in_k_folds(df, 10)

In [17]:
# Sanity check: one entry per requested fold.
len(folds)

10

In [19]:
# (rows, columns) of the second fold.
folds[1].shape

(22102, 3)

In [22]:
# Expected rows per fold: total row count floor-divided by the 10 folds.
df.shape[0] // 10

22102

In [21]:
# (rows, columns) of the third fold; should match the per-fold size above.
folds[2].shape

(22102, 3)

In [25]:
# Stack the first two folds back into a single frame, e.g. to rebuild a training split.
pd.concat([folds[0], folds[1]])

Unnamed: 0,doc_id,term_id,nb_occurences
0,1,4649,1
1,1,2638,1
2,1,3785,1
3,1,621,1
4,1,3621,1
5,1,1593,2
6,1,4126,3
7,1,3429,1
8,1,3420,1
9,1,2875,1


In [24]:
# Display the first fold.
folds[0]

Unnamed: 0,doc_id,term_id,nb_occurences
0,1,4649,1
1,1,2638,1
2,1,3785,1
3,1,621,1
4,1,3621,1
5,1,1593,2
6,1,4126,3
7,1,3429,1
8,1,3420,1
9,1,2875,1
