In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import random as rm

In [2]:
def csr_read(fname, ftype="csr", nidx=1):
    r""" 
        Read CSR matrix from a text file. 
        
        \param fname File name for CSR/CLU matrix
        \param ftype Input format. Acceptable formats are:
            - csr - Compressed sparse row
            - clu - Cluto format, i.e., CSR + header row with "nrows ncols nnz"
        \param nidx Indexing type in CSR file. What does numbering of feature IDs start with?
    """
    
    with open(fname) as f:
        lines = f.readlines()
    
    if ftype == "clu":
        p = lines[0].split()
        nrows = int(p[0])
        ncols = int(p[1])
        nnz = long(p[2])
        lines = lines[1:]
        assert(len(lines) == nrows)
    elif ftype == "csr":
        nrows = len(lines)
        ncols = 0 
        nnz = 0 
        for i in xrange(nrows):
            p = lines[i].split()
            if len(p) % 2 != 0:
                raise ValueError("Invalid CSR matrix. Row %d contains %d numbers." % (i, len(p)))
            nnz += len(p)/2
            for j in xrange(0, len(p), 2): 
                cid = int(p[j]) - nidx
                if cid+1 > ncols:
                    ncols = cid+1
    else:
        raise ValueError("Invalid sparse matrix ftype '%s'." % ftype)
    val = np.zeros(nnz, dtype=np.float)
    ind = np.zeros(nnz, dtype=np.int)
    ptr = np.zeros(nrows+1, dtype=np.long)
    n = 0 
    for i in xrange(nrows):
        p = lines[i].split()
        for j in xrange(0, len(p), 2): 
            ind[n] = int(p[j]) - nidx
            val[n] = float(p[j+1])
            n += 1
        ptr[i+1] = n 
    
    assert(n == nnz)
    
    return csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.float)

In [3]:
filename = "train.dat"
text_csr = csr_read(filename)

In [4]:
#dense_matrix = csr_matrix.todense(text_csr)

In [5]:
#print dense_matrix.shape


In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
idft = TfidfTransformer(norm=None)
idfmatrix = idft.fit_transform(text_csr)

In [7]:
denseidf = csr_matrix.todense(idfmatrix)

In [8]:
print denseidf.shape

(8580, 126355)


In [9]:
#from sklearn.decomposition import TruncatedSVD
#svd = TruncatedSVD(n_components=4000)
#reducedm = svd.fit_transform(idfmatrix)

In [10]:
#print(svd.explained_variance_ratio_)
#print(svd.explained_variance_ratio_.sum())

In [11]:
#from sklearn.externals import joblib
#fileObject = open('../pickle/idfmatrix.pickle','wb')
#joblib.dump(idfmatrix, fileObject)
#fileObject.close()

In [12]:
#read code
#fileObject = open('../pickle/idfmatrix.pickle','rb')
#idfmatrix = joblib.load(fileObject)
#fileObject.close()

In [13]:
#from sklearn.externals import joblib
#fileObject = open('../pickle/actualreducedMwithoutnormalisation.pickle','wb')
#joblib.dump(reducedm, fileObject)
#fileObject.close()

In [14]:
#readcode
#fileObject = open('../pickle/aactualreducedMwithoutnormalisation.pickle','rb')
#reducedm = joblib.load(fileObject)
#fileObject.close()

In [15]:
from sklearn.preprocessing import normalize
#normalizedReduced=normalize(reducedm, norm='l2')

In [16]:
normalizedwithoutReducing = normalize(idfmatrix, norm='l2')

In [17]:
densenormalizedwithoutReducing = csr_matrix.todense(normalizedwithoutReducing)

In [18]:
print densenormalizedwithoutReducing.shape

(8580, 126355)


In [19]:
densenormalizedwithoutReducing = np.asarray(densenormalizedwithoutReducing)
denseidf = np.asarray(denseidf)

In [20]:
print type (densenormalizedwithoutReducing)
print type (denseidf)

<type 'numpy.ndarray'>
<type 'numpy.ndarray'>


In [21]:
print densenormalizedwithoutReducing[0]
print denseidf[0]

[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]


In [22]:
def k2_means(denseidf,densenormalizedwithoutReducing,centroid1=None,centroid2=None,iter=20):
    if type(centroid1)!=np.ndarray or type(centroid2)!=np.ndarray:
        #print "here"
        cent1=0
        cent2=0
        num = denseidf.shape[0]-1
        while (cent1==cent2):
            cent1 = rm.randint(0, num)
            cent2 = rm.randint(0, num)
            #print normalizedwithoutReducing[cent1],cent2
        centroid1 = np.array(densenormalizedwithoutReducing[cent1])
        centroid2 = np.array(densenormalizedwithoutReducing[cent2])
        centroidarray = np.append([centroid1],[centroid2], axis=0)
        #print centroid1.shape
        
    else:
        centroidarray = np.append([centroid1],[centroid2], axis=0)
        #print centroidarray.shape
        
    centroidcosineArray = densenormalizedwithoutReducing.dot(centroidarray.T)
    
    i=0
    cluster=[]
    newcentroid1_points = 0
    newcentroid2_points = 0
    newcentroid1_sum = np.zeros(shape=[1,126355])
    newcentroid2_sum = np.zeros(shape=[1,126355])
    newcentroid1_mean = np.zeros(shape=[1,126355])
    newcentroid2_mean = np.zeros(shape=[1,126355])
    newcentroid1_mean_norm = np.zeros(shape=[1,126355])
    newcentroid2_mean_norm = np.zeros(shape=[1,126355])
    
    for item in centroidcosineArray:
        if item[0]>item[1]:
            newcentroid1_points+=1
            newcentroid1_sum = newcentroid1_sum+denseidf[i]
            cluster.append(1)
        else:
            newcentroid2_points+=1
            newcentroid2_sum = newcentroid2_sum+denseidf[i]
            cluster.append(2)
        i+=1
    #print newcentroid1_points,newcentroid2_points
    
    newcentroid1_mean = newcentroid1_sum/newcentroid1_points
    newcentroid2_mean = newcentroid2_sum/newcentroid2_points
    
    newcentroid1_mean_norm=normalize(newcentroid1_mean, norm='l2')
    newcentroid2_mean_norm=normalize(newcentroid2_mean, norm='l2')
    
    #print newcentroid1_mean[0],centroid1
    
    comp1 = newcentroid1_mean_norm[0].dot(centroid1.T)
    comp2 = newcentroid2_mean_norm[0].dot(centroid2.T)
    
    if (comp1>=0.99 and comp2>=0.99) or iter==0:
        #print cluster
        print iter,comp1,comp2,newcentroid1_points,newcentroid2_points
        return cluster,newcentroid1_mean_norm,newcentroid2_mean_norm
    else:
        print iter,comp1,comp2,newcentroid1_points,newcentroid2_points
        iter-=1
        return k2_means(denseidf,densenormalizedwithoutReducing,newcentroid1_mean_norm[0],newcentroid2_mean_norm[0],iter)
     

In [103]:
clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(denseidf,densenormalizedwithoutReducing)

20 0.193426225197 0.210376433635 3597 4983
19 0.975277707096 0.967001464175 4526 4054
18 0.993039186333 0.991459435976 4627 3953


In [147]:
print densenormalizedwithoutReducing[0]

[ 0.  0.  0. ...,  0.  0.  0.]


In [104]:
from collections import defaultdict
memoryrowsdict={}
scores=[]
flag=0
odd=1
even=0
sent0=0
sentlist=[]
sentlist1 = defaultdict(list)
positionofsent=0
for kbisect in range(7):
    print "k",kbisect
    list1_for_memlist=[]
    list2_for_memlist=[]
    list2=[]
    list22=[]
    list1=[]
    list11=[]
    sum1=0
    count1=0
    sum2=0
    count2=0
    i=0
    for items in clusterresult:
            if items==1:
                count1+=1
                sum1 = sum1+(densenormalizedwithoutReducing[i].dot(newcentroid1_mean_norm.T))
                list1.append(densenormalizedwithoutReducing[i])
                list11.append(denseidf[i])
                if kbisect==0:
                    list1_for_memlist.append(i)
                else:
                    if flag==1:
                        list1_for_memlist.append(memoryrowsdict[positionofsent][0][i])
                    else:
                        list1_for_memlist.append(memoryrowsdict[positionofsent][3][i])

            else:
                count2+=1
                sum2 = sum2+densenormalizedwithoutReducing[i].dot(newcentroid2_mean_norm.T)
                list2.append(densenormalizedwithoutReducing[i])
                list22.append(denseidf[i])
                if kbisect==0:
                    list2_for_memlist.append(i)
                else:
                    if flag==1:
                        list2_for_memlist.append(memoryrowsdict[positionofsent][0][i])
                    else:
                        list2_for_memlist.append(memoryrowsdict[positionofsent][3][i])
            i+=1
    print len(list1_for_memlist),len(list2_for_memlist)
    avg1 = sum1/count1
    avg2 = sum2/count2
    print avg1,avg2
    memoryrowsdict[kbisect]=[list2_for_memlist,list22,list2,list1_for_memlist,list11,list1]
    #print memoryrowsdict[0][0][0],memoryrowsdict[0][1][0],memoryrowsdict[0][2][0],memoryrowsdict[0][3][0],memoryrowsdict[0][4][0],memoryrowsdict[0][5][0]
    if avg1<avg2:
        scores.append(avg2)
        index_min = min(xrange(len(scores)), key=scores.__getitem__)
        print scores
        print len(scores)
        print "i",index_min
        positionofsent=index_min
        if(index_min+1)==len(scores):
            flag=0
            if kbisect!=0:
                scores[index_min]=1
            else:
                sent0=0
            sentlist1[positionofsent].append(flag)
            sentlist.append(odd)
            a=np.asarray(memoryrowsdict[index_min][4])
            b=np.asarray(memoryrowsdict[index_min][5])
            #print a.shape,b.shape
            #print a
            print "here1"
            clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(a,b)
            #print clusterresult
            odd+=2
                        
        else:
            scores[index_min]=1
            sentlist.append(even)
            flag=1
            sentlist1[positionofsent].append(flag)
            if sentlist1[positionofsent][0]==0:
                flag=1
                sentlist1[positionofsent].append(flag)
                a=np.asarray(memoryrowsdict[index_min][1])
                b=np.asarray(memoryrowsdict[index_min][2])
            else:
                flag=0
                sentlist1[positionofsent].append(flag)
                a=np.asarray(memoryrowsdict[index_min][4])
                b=np.asarray(memoryrowsdict[index_min][5])
            #print a.shape,b.shape
            #print a,b
            print "here2"
            clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(a,b)
            #print clusterresult
            even+=2
                        
    else:
        scores.append(avg1)
        index_min = min(xrange(len(scores)), key=scores.__getitem__)
        positionofsent=index_min
        print scores
        print len(scores)
        print "i",index_min
        if(index_min+1)==len(scores):
            flag=1
            if kbisect!=0:
                scores[index_min]=1
            else:
                 sent0 = 1
            sentlist1[positionofsent].append(flag)
            sentlist.append(even)
            a=np.asarray(memoryrowsdict[index_min][1])
            b=np.asarray(memoryrowsdict[index_min][2])
            #print a.shape,b.shape
            #print a,b
            print "here3"
            clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(a,b)
            #print clusterresult
            even+=2
                        
        else:
            flag=0
            sentlist1[positionofsent].append(flag)
            scores[index_min]=1
            if sentlist1[positionofsent][0]==0:
                flag=1
                sentlist1[positionofsent].append(flag)
                a=np.asarray(memoryrowsdict[index_min][1])
                b=np.asarray(memoryrowsdict[index_min][2])
            else:
                flag=0
                sentlist1[positionofsent].append(flag)
                a=np.asarray(memoryrowsdict[index_min][4])
                b=np.asarray(memoryrowsdict[index_min][5])
            #print a.shape,b.shape
            #print a,b
            print "here4"
            clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(a,b)
            #print clusterresult
            odd+=2
                        

        

    

k 0
4627 3953
[ 0.18776543] [ 0.23814188]
[array([ 0.23814188])]
1
i 0
here1
20 0.385899655657 0.21512182905 1281 3346
19 0.96595103494 0.99747315556 973 3654
18 0.996940544196 0.999716303783 889 3738
k 1
889 3738
[ 0.11548834] [ 0.14785132]
[array([ 0.23814188]), array([ 0.14785132])]
2
i 1
here1
20 0.167220677231 0.116669785473 488 401
19 0.965859908228 0.975244724093 428 461
18 0.971816923176 0.974509693366 367 522
17 0.989422920136 0.993911392666 304 585
16 0.991791312022 0.996054284978 249 640
k 2
249 640
[ 0.07522827] [ 0.19810195]
[array([ 0.23814188]), 1, array([ 0.19810195])]
3
i 2
here1
20 0.443536150841 0.347488868495 153 96
19 0.981147643248 0.969092746985 110 139
18 0.99463363129 0.993707621489 94 155
k 3
94 155
[ 0.15945624] [ 0.19472829]
[array([ 0.23814188]), 1, 1, array([ 0.19472829])]
4
i 3
here1
20 0.279694101535 0.27810881164 51 43
19 0.985119461547 0.990579980286 53 41
18 1.0 1.0 53 41
k 4
53 41
[ 0.29298126] [ 0.08430413]
[array([ 0.23814188]), 1, 1, 1, array([ 0.

In [105]:
print len(memoryrowsdict)

7


In [106]:
print sentlist1

defaultdict(<type 'list'>, {0: [0, 0, 1], 1: [0], 2: [0], 3: [0], 5: [0], 6: [0]})


In [107]:
len((memoryrowsdict[1][3]))

889

In [108]:
clusters_7=[]
count_7 = 1
for clus in range(1,6):
    if clus in sentlist1.keys():
        #if clus!=0:
        #print sentlist1[clus][0]
        if sentlist1[clus][0]==1:
            clusters_7.append(memoryrowsdict[clus][3])
            print len((memoryrowsdict[clus][3]))
        else:
            clusters_7.append(memoryrowsdict[clus][0])
            print len((memoryrowsdict[clus][0]))
        #count_7+=1
            #if count_7==8:
                #break
    else:
        '''if clus==0 and sent==1:
            clusters_7.append(memoryrowsdict[clus][3])
            print len((memoryrowsdict[clus][3]))
            count_7+=1
            if count_7==8:
                break
        elif clus==0 and sent==0: 
            clusters_7.append(memoryrowsdict[clus][0])
            print len((memoryrowsdict[clus][0]))
            count_7+=1
            if count_7==8:
                break
        else:'''
        clusters_7.append(memoryrowsdict[clus][0])
        print len((memoryrowsdict[clus][0]))
        #count_7+=1
        #if count_7==8:
            #break
        clusters_7.append(memoryrowsdict[clus][3])
        print len((memoryrowsdict[clus][3]))
            #count_7+=1
            #if count_7==8:
                #break
merged = memoryrowsdict[6][3]+memoryrowsdict[6][0]                
clusters_7.append(merged)
print len(merged)


3738
640
155
41
53
2398
1555


In [109]:
print len(clusters_7)

7


In [110]:
clusterlabel=1
final_list=[]
for item in clusters_7:
    for i in item:
          final_list.append((clusterlabel,i))  
    clusterlabel+=1

    

In [111]:
print len(final_list)

8580


In [112]:
sorted_by_second = sorted(final_list, key=lambda tup: tup[1])

In [113]:
print sorted_by_second

[(1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (6, 7), (6, 8), (1, 9), (1, 10), (1, 11), (1, 12), (2, 13), (1, 14), (1, 15), (1, 16), (1, 17), (1, 18), (1, 19), (1, 20), (6, 21), (1, 22), (6, 23), (1, 24), (1, 25), (1, 26), (2, 27), (6, 28), (6, 29), (1, 30), (1, 31), (6, 32), (1, 33), (1, 34), (1, 35), (1, 36), (1, 37), (1, 38), (6, 39), (1, 40), (6, 41), (6, 42), (1, 43), (1, 44), (6, 45), (1, 46), (1, 47), (1, 48), (6, 49), (1, 50), (1, 51), (1, 52), (1, 53), (1, 54), (1, 55), (1, 56), (2, 57), (1, 58), (1, 59), (1, 60), (1, 61), (2, 62), (1, 63), (1, 64), (2, 65), (1, 66), (1, 67), (1, 68), (1, 69), (1, 70), (1, 71), (1, 72), (1, 73), (6, 74), (1, 75), (1, 76), (6, 77), (6, 78), (1, 79), (6, 80), (6, 81), (1, 82), (1, 83), (6, 84), (1, 85), (1, 86), (1, 87), (1, 88), (1, 89), (1, 90), (1, 91), (6, 92), (6, 93), (1, 94), (6, 95), (2, 96), (1, 97), (1, 98), (1, 99), (6, 100), (1, 101), (2, 102), (1, 103), (1, 104), (1, 105), (2, 106), (1, 107), (1, 108), (1, 109), (1, 110),

In [128]:
f = open('format.dat', 'w')
for item in sorted_by_second:
    f.write(str(item[0])+'\n')
f.close()