In [1]:
import pandas as pd
import numpy as np

from matplotlib import pylab as plt
import networkx as nx

In [53]:
#visualize graph
def save_graph(graph,file_name):
    #initialze Figure
    plt.figure(num=None, figsize=(20, 20), dpi=80)
    plt.axis('off')
    fig = plt.figure(1)
    pos = nx.spring_layout(graph)
    nx.draw_networkx_nodes(graph,pos)
    nx.draw_networkx_edges(graph,pos)

    cut = 1.00
    xmax = cut * max(xx for xx, yy in pos.values())
    ymax = cut * max(yy for xx, yy in pos.values())
    plt.xlim(0, xmax)
    plt.ylim(0, ymax)

    plt.savefig(file_name,bbox_inches="tight")
    plt.close()
    del fig

In [4]:
#create directed graph from hierarchy
def create_digraph(file):
    graph = nx.DiGraph()

    file = open(file)

    lines = file.readlines()

    for line in lines:
        edge = line.rstrip().split(' ')
        graph.add_edge(int(edge[0]), int(edge[1]))
    file.close()
    return graph

graph = create_digraph('hierarchy.txt')
all_nodes = nx.nodes(graph)


In [5]:
print(len(all_nodes))
nodes_to_remove = all_nodes[0:250000]
print(len(nodes_to_remove))
graph.remove_nodes_from(nodes_to_remove)
print(graph.number_of_nodes())

478020
250000
228020


In [6]:
#find a connected component of graph with maximum number of nodes
def single_connected_graph(graph):
    new_graphs = nx.weakly_connected_component_subgraphs(graph)

    the_graphs = []

    for g in new_graphs:
        the_graphs.append(g)
    max_nodes = 0
    max_nodes_index = 0
    index = 0
    for g in the_graphs:
        num_nodes = g.number_of_nodes()
        if(num_nodes > max_nodes):
            max_nodes = num_nodes
            max_nodes_index = index
        index += 1
    final_graph = the_graphs[max_nodes_index]
    return final_graph

final_graph = single_connected_graph(graph)
final_nodes = nx.nodes(final_graph)
print(len(final_nodes))
# nx.write_edgelist(final_graph, "new_hierarchy.txt")

147739


In [7]:
discarded_nodes = []
map_final_nodes = {}

for node in final_nodes:
    map_final_nodes[node] = 1

for node in all_nodes:
    if(node not in map_final_nodes):
        discarded_nodes.append(node)
    
map_discarded_nodes = {}

for node in discarded_nodes:
    map_discarded_nodes[node] = 1
print(len(discarded_nodes))
print(discarded_nodes[0:10])

330281
[1048576, 1048577, 2, 174763, 4, 2097157, 6, 1, 8, 2097161]


In [8]:
data = open('train-remapped.csv', 'r')
output_file = open('final_training.csv', 'w')
lines = data.readlines()
output_file.write(lines[0])
for i in range(1,len(lines)):
#     print(lines[i])
    classes = lines[i].rstrip().split(":")[0]
    true_classes = []
    for j in range(len(classes)):
        index = len(classes) -j - 1
        if(classes[index] == ' '):
            true_classes = classes[0:index].split(',')
            break
    flag = False
    for c in true_classes:
        if int(c) in map_discarded_nodes:
            flag = True
            break
    if(not flag):
        output_file.write(lines[i])
    
    
data.close()
output_file.close()

In [9]:
print(len(lines))

output = open('final_training.csv', 'r')
print(len(output.readlines()))

2365437
216313


In [10]:
nx.is_directed_acyclic_graph(final_graph)

False

In [7]:
#count number of examples for each class
import operator
def count_concepts(file_name):
    file = open(file_name, 'r')
    lines = file.readlines()
    class_map ={}
    for i in range(1, len(lines)):
        classes = lines[i].rstrip().split(":")[0]
        true_classes = []
        for j in range(len(classes)):
            index = len(classes) -j - 1
            if(classes[index] == ' '):
                true_classes = classes[0:index].split(',')
                break
        for c in true_classes:
            if int(c) in class_map:
                class_map[int(c)] = class_map[int(c)] + 1
            else:
                class_map[int(c)] = 1
    file.close()
    return class_map

In [8]:
class_map = count_concepts('train-remapped.csv')

sorted_list = sorted(class_map.items(), key=operator.itemgetter(1), reverse=True)

print(sorted_list[0:10])

[(24177, 387168), (285613, 41104), (98808, 14838), (264962, 12556), (167593, 11400), (242532, 10435), (52954, 10026), (300558, 9473), (444502, 9217), (78249, 9161)]


In [41]:
#Find all classes with atleast 1000 examples in dataset
relevant_classes = list(map(lambda x: x[0], list(filter(lambda x: x[1] >= 1000, sorted_list))))
print(relevant_classes)
print(len(relevant_classes))

relevant_classes_map = {}

for c in relevant_classes:
    relevant_classes_map[c] = 1
# print(relevant_classes_map.keys())

[24177, 285613, 98808, 264962, 167593, 242532, 52954, 300558, 444502, 78249, 237290, 220514, 10721, 337728, 174545, 73518, 24016, 327590, 154064, 374771, 366417, 87241, 73092, 115838, 334220, 169902, 59758, 347803, 364106, 178462, 287120, 14843, 260304, 73462, 23611, 322170, 174425, 167844, 29462, 158599, 299629, 34161, 390974, 228232, 150636, 341276, 36224, 289559, 418360, 323972, 352578, 284433, 383600, 300073, 231746, 60639, 251484, 2830, 183203, 234578, 283823, 161537, 286264, 304661, 93718, 348488, 139391, 397350, 244711, 186125, 419276, 1508, 398319, 428719, 290537, 403132, 395447, 351111, 324660, 13252, 131804, 430081, 24052, 244616, 86836, 393137, 374859, 111772, 206933, 109127, 96443, 228238, 269785, 2903, 272741, 213350, 225356, 174595, 414726, 429208, 151184, 20627, 259458, 97284, 143799, 316670, 1859, 89192, 93043, 165833, 198076, 198336, 363791, 118798, 396560, 402991, 439461, 145157, 171670, 24513, 372228, 234240, 212458, 98638, 251951, 411448, 258850, 65676, 390846, 3547

In [80]:
hfile = open('hierarchy.txt', 'r')
newhfile = open('updated_hierarchy.txt', 'w')
for line in hfile.readlines():
    x = line.rstrip().split(' ')
    
    if(int(x[0]) in relevant_classes_map or int(x[1]) in relevant_classes_map):
        newhfile.write(line)
hfile.close()
newhfile.close()
        

In [4]:
# train_file = open('train-remapped.csv', 'r')
# update_train_file = open('train-updated.csv', 'w')
# lines = train_file.readlines()
# # update_train_file.write(lines[0])
# for i in range(1, len(lines)):
#     classes = lines[i].rstrip().split(":")[0]
#     true_classes = []
#     for j in range(len(classes)):
#         index = len(classes) -j - 1
#         if(classes[index] == ' '):
#             true_classes = classes[0:index].split(',')
#             break
# #     for c in true_classes:
# #         if int(c) in relevant_classes_map:
#     new_classes = get_new_classes(true_classese)
#     if(len(true_classes) > 0):
#         update_train_file.write(lines[i])
# #     break
# train_file.close()
# update_train_file.close()

In [24]:
update_train_file = open('train-updated.csv', 'r')
print(len(update_train_file.readlines()))

723085


In [21]:
def get_new_classes(classes):
    new_classes = []
    for c in classes:
        if c in relevant_classes_map:
            new_classes.append(c)
    return list(map(str, new_classes))

In [42]:
train_file = open('train-remapped.csv', 'r')
update_train_file = open('train-updated.csv', 'w')
lines = train_file.readlines()
# update_train_file.write(lines[0])
for i in range(1, len(lines)):
    classes = lines[i].rstrip().split(":")[0]
    features = ''
    true_classes = []
    for j in range(len(classes)):
        index = len(classes) -j - 1
        if(classes[index] == ' '):
            true_classes = list(map(int, classes[0:index].split(',')))
            features = lines[i].split(classes[0:index])[1]
            break
#     for c in true_classes:
#         if int(c) in relevant_classes_map:

    new_classes = get_new_classes(true_classes)
#     print(new_classes)
#     print(features)
    if(len(new_classes) > 0):
        update_train_file.write(str(",".join(new_classes)) + features)
#     break
train_file.close()
update_train_file.close()

In [43]:
train_file = open('train-remapped.csv', 'r')
update_train_file = open('train-updated-single-label.csv', 'w')
lines = train_file.readlines()
# update_train_file.write(lines[0])
for i in range(1, len(lines)):
    classes = lines[i].rstrip().split(":")[0]
    features = ''
    true_classes = []
    for j in range(len(classes)):
        index = len(classes) -j - 1
        if(classes[index] == ' '):
            true_classes = list(map(int, classes[0:index].split(',')))
            features = lines[i].split(classes[0:index])[1]
            break
#     for c in true_classes:
#         if int(c) in relevant_classes_map:

    new_classes = get_new_classes(true_classes)
#     print(new_classes)
#     print(features)
    for c in new_classes:
        update_train_file.write(c + features)
#     break
train_file.close()
update_train_file.close()

In [48]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.datasets import load_svmlight_file


In [49]:
dataf = load_svmlight_file("train-updated-single-label.csv")

ValueError: need more than 1 value to unpack