In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import random as rm
from sklearn.preprocessing import normalize

In [2]:
def csr_read(fname, ftype="csr", nidx=1):
    r""" 
        Read CSR matrix from a text file. 
        
        \param fname File name for CSR/CLU matrix
        \param ftype Input format. Acceptable formats are:
            - csr - Compressed sparse row
            - clu - Cluto format, i.e., CSR + header row with "nrows ncols nnz"
        \param nidx Indexing type in CSR file. What does numbering of feature IDs start with?
    """
    
    with open(fname) as f:
        lines = f.readlines()
    
    if ftype == "clu":
        p = lines[0].split()
        nrows = int(p[0])
        ncols = int(p[1])
        nnz = long(p[2])
        lines = lines[1:]
        assert(len(lines) == nrows)
    elif ftype == "csr":
        nrows = len(lines)
        ncols = 0 
        nnz = 0 
        for i in xrange(nrows):
            p = lines[i].split()
            if len(p) % 2 != 0:
                raise ValueError("Invalid CSR matrix. Row %d contains %d numbers." % (i, len(p)))
            nnz += len(p)/2
            for j in xrange(0, len(p), 2): 
                cid = int(p[j]) - nidx
                if cid+1 > ncols:
                    ncols = cid+1
    else:
        raise ValueError("Invalid sparse matrix ftype '%s'." % ftype)
    val = np.zeros(nnz, dtype=np.float)
    ind = np.zeros(nnz, dtype=np.int)
    ptr = np.zeros(nrows+1, dtype=np.long)
    n = 0 
    for i in xrange(nrows):
        p = lines[i].split()
        for j in xrange(0, len(p), 2): 
            ind[n] = int(p[j]) - nidx
            val[n] = float(p[j+1])
            n += 1
        ptr[i+1] = n 
    
    assert(n == nnz)
    
    return csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.float)

In [3]:
filename = "train.dat"
text_csr = csr_read(filename)
from sklearn.feature_extraction.text import TfidfTransformer
idft = TfidfTransformer(norm=None)
idfmatrix = idft.fit_transform(text_csr)
denseidf = csr_matrix.todense(idfmatrix)
normalizedwithoutReducing = normalize(idfmatrix, norm='l2')
densenormalizedwithoutReducing = csr_matrix.todense(normalizedwithoutReducing)
densenormalizedwithoutReducing = np.asarray(densenormalizedwithoutReducing)
denseidf = np.asarray(denseidf)

In [5]:
def k2_means(denseidf,densenormalizedwithoutReducing,centroid1=None,centroid2=None,iter=20):
    if type(centroid1)!=np.ndarray or type(centroid2)!=np.ndarray:
        #print "here"
        cent1=0
        cent2=0
        num = denseidf.shape[0]-1
        while (cent1==cent2):
            cent1 = rm.randint(0, num)
            cent2 = rm.randint(0, num)
            #print normalizedwithoutReducing[cent1],cent2
        centroid1 = np.array(densenormalizedwithoutReducing[cent1])
        centroid2 = np.array(densenormalizedwithoutReducing[cent2])
        centroidarray = np.append([centroid1],[centroid2], axis=0)
        #print centroid1.shape
        
    else:
        centroidarray = np.append([centroid1],[centroid2], axis=0)
        #print centroidarray.shape
        
    centroidcosineArray = densenormalizedwithoutReducing.dot(centroidarray.T)
    
    i=0
    cluster=[]
    newcentroid1_points = 0
    newcentroid2_points = 0
    newcentroid1_sum = np.zeros(shape=[1,126355])
    newcentroid2_sum = np.zeros(shape=[1,126355])
    newcentroid1_mean = np.zeros(shape=[1,126355])
    newcentroid2_mean = np.zeros(shape=[1,126355])
    newcentroid1_mean_norm = np.zeros(shape=[1,126355])
    newcentroid2_mean_norm = np.zeros(shape=[1,126355])
    
    for item in centroidcosineArray:
        if item[0]>item[1]:
            newcentroid1_points+=1
            newcentroid1_sum = newcentroid1_sum+denseidf[i]
            cluster.append(1)
        else:
            newcentroid2_points+=1
            newcentroid2_sum = newcentroid2_sum+denseidf[i]
            cluster.append(2)
        i+=1
    #print newcentroid1_points,newcentroid2_points
    
    newcentroid1_mean = newcentroid1_sum/newcentroid1_points
    newcentroid2_mean = newcentroid2_sum/newcentroid2_points
    
    newcentroid1_mean_norm=normalize(newcentroid1_mean, norm='l2')
    newcentroid2_mean_norm=normalize(newcentroid2_mean, norm='l2')
    
    #print newcentroid1_mean[0],centroid1
    
    comp1 = newcentroid1_mean_norm[0].dot(centroid1.T)
    comp2 = newcentroid2_mean_norm[0].dot(centroid2.T)
    
    if (comp1>=0.99 and comp2>=0.99) or iter==0:
        #print cluster
        print iter,comp1,comp2,newcentroid1_points,newcentroid2_points
        return cluster,newcentroid1_mean_norm,newcentroid2_mean_norm
    else:
        print iter,comp1,comp2,newcentroid1_points,newcentroid2_points
        iter-=1
        return k2_means(denseidf,densenormalizedwithoutReducing,newcentroid1_mean_norm[0],newcentroid2_mean_norm[0],iter)

In [6]:
clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(denseidf,densenormalizedwithoutReducing)

20 0.261358955569 0.301517934122 4447 4133
19 0.991694263163 0.98656004728 4806 3774
18 0.999413181198 0.999166869705 4897 3683


In [7]:
from collections import defaultdict
memoryrowsdict={}
scores=[]
flag=0
odd=1
even=0
sent0=0
sentlist=[]
sentlist1 = defaultdict(list)
positionofsent=0
for kbisect in range(7):
    print "k",kbisect
    list1_for_memlist=[]
    list2_for_memlist=[]
    list2=[]
    list22=[]
    list1=[]
    list11=[]
    sum1=0
    count1=0
    sum2=0
    count2=0
    i=0
    for items in clusterresult:
            if items==1:
                count1+=1
                sum1 = sum1+(densenormalizedwithoutReducing[i].dot(newcentroid1_mean_norm.T))
                list1.append(densenormalizedwithoutReducing[i])
                list11.append(denseidf[i])
                if kbisect==0:
                    list1_for_memlist.append(i)
                else:
                    if flag==1:
                        list1_for_memlist.append(memoryrowsdict[positionofsent][3][i])
                    else:
                        list1_for_memlist.append(memoryrowsdict[positionofsent][0][i])

            else:
                count2+=1
                sum2 = sum2+densenormalizedwithoutReducing[i].dot(newcentroid2_mean_norm.T)
                list2.append(densenormalizedwithoutReducing[i])
                list22.append(denseidf[i])
                if kbisect==0:
                    list2_for_memlist.append(i)
                else:
                    if flag==1:
                        list2_for_memlist.append(memoryrowsdict[positionofsent][3][i])
                    else:
                        list2_for_memlist.append(memoryrowsdict[positionofsent][0][i])
            i+=1
    print len(list1_for_memlist),len(list2_for_memlist)
    avg1 = sum1/count1
    avg2 = sum2/count2
    print avg1,avg2
    if avg1<avg2:
        scores.append(avg2)
        scores.append(avg1)
        memoryrowsdict[kbisect]=[list2_for_memlist,list22,list2,list1_for_memlist,list11,list1]
    else:
        scores.append(avg1)
        scores.append(avg2)
        memoryrowsdict[kbisect]=[list1_for_memlist,list11,list1,list2_for_memlist,list22,list2]
    index_min = min(xrange(len(scores)), key=scores.__getitem__)
    print scores
    print len(scores)
    print "i",index_min
    QR = divmod(index_min,2)
    positionofsent=QR[0]
    flag=QR[1]
    scores[index_min]=1
    sentlist1[positionofsent].append(flag)
    if flag==1:
        a=np.asarray(memoryrowsdict[positionofsent][4])
        b=np.asarray(memoryrowsdict[positionofsent][5])
    else:
        a=np.asarray(memoryrowsdict[positionofsent][1])
        b=np.asarray(memoryrowsdict[positionofsent][2])
    print "here1"
    clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(a,b)
    

k 0
4897 3683
[ 0.20962075] [ 0.2084737]
[array([ 0.20962075]), array([ 0.2084737])]
2
i 1
here1
20 0.154811250034 0.325837621049 1791 1892
19 0.988481567384 0.961220708286 2368 1315
18 0.999098302583 0.998634128341 2369 1314
k 1
2369 1314
[ 0.1796831] [ 0.09005118]
[array([ 0.20962075]), 1, array([ 0.1796831]), array([ 0.09005118])]
4
i 3
here1
20 0.214502095325 0.213621051563 303 1011
19 0.975357830169 0.995460704135 392 922
18 0.994123187203 0.998827366071 447 867
k 2
447 867
[ 0.18769376] [ 0.22950258]
[array([ 0.20962075]), 1, array([ 0.1796831]), 1, array([ 0.22950258]), array([ 0.18769376])]
6
i 2
here1
20 0.287871435544 0.26862717984 1177 1192
19 0.990792832095 0.972881656417 1600 769
18 0.99399725954 0.989406575995 1687 682
17 0.999063761316 0.998426826345 1712 657
k 3
1712 657
[ 0.23502152] [ 0.22755771]
[array([ 0.20962075]), 1, 1, 1, array([ 0.22950258]), array([ 0.18769376]), array([ 0.23502152]), array([ 0.22755771])]
8
i 5
here1
20 0.302771022651 0.293965111346 213 234
1

In [8]:
print len(memoryrowsdict)
print sentlist1

7
defaultdict(<type 'list'>, {0: [1], 1: [1, 0], 2: [1], 4: [1], 5: [1], 6: [1]})


In [9]:
clusters_7=[]
count_7 = 1
for clus in range(7):
    if clus in sentlist1.keys():
        if len(sentlist1[clus])==1:
                if sentlist1[clus][0]==1:
                    clusters_7.append(memoryrowsdict[clus][0])
                else:
                    clusters_7.append(memoryrowsdict[clus][3])
    else:
        clusters_7.append(memoryrowsdict[clus][0])
        clusters_7.append(memoryrowsdict[clus][3])


SyntaxError: invalid syntax (<ipython-input-9-d51236311209>, line 5)

In [None]:
print len(clusters_7)
clusterlabel=1
final_list=[]
for item in clusters_7:
    for i in item:
          final_list.append((clusterlabel,i))  
    clusterlabel+=1

In [None]:
print len(final_list)

In [None]:
sorted_by_second = sorted(final_list, key=lambda tup: tup[1])
print sorted_by_second

In [None]:
f = open('format.dat', 'w')
for item in sorted_by_second:
    f.write(str(item[0])+'\n')
f.close()