In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import random as rm

In [2]:
def csr_read(fname, ftype="csr", nidx=1):
    r""" 
        Read CSR matrix from a text file. 
        
        \param fname File name for CSR/CLU matrix
        \param ftype Input format. Acceptable formats are:
            - csr - Compressed sparse row
            - clu - Cluto format, i.e., CSR + header row with "nrows ncols nnz"
        \param nidx Indexing type in CSR file. What does numbering of feature IDs start with?
    """
    
    with open(fname) as f:
        lines = f.readlines()
    
    if ftype == "clu":
        p = lines[0].split()
        nrows = int(p[0])
        ncols = int(p[1])
        nnz = long(p[2])
        lines = lines[1:]
        assert(len(lines) == nrows)
    elif ftype == "csr":
        nrows = len(lines)
        ncols = 0 
        nnz = 0 
        for i in xrange(nrows):
            p = lines[i].split()
            if len(p) % 2 != 0:
                raise ValueError("Invalid CSR matrix. Row %d contains %d numbers." % (i, len(p)))
            nnz += len(p)/2
            for j in xrange(0, len(p), 2): 
                cid = int(p[j]) - nidx
                if cid+1 > ncols:
                    ncols = cid+1
    else:
        raise ValueError("Invalid sparse matrix ftype '%s'." % ftype)
    val = np.zeros(nnz, dtype=np.float)
    ind = np.zeros(nnz, dtype=np.int)
    ptr = np.zeros(nrows+1, dtype=np.long)
    n = 0 
    for i in xrange(nrows):
        p = lines[i].split()
        for j in xrange(0, len(p), 2): 
            ind[n] = int(p[j]) - nidx
            val[n] = float(p[j+1])
            n += 1
        ptr[i+1] = n 
    
    assert(n == nnz)
    
    return csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.float)

In [3]:
filename = "train.dat"
text_csr = csr_read(filename)

In [4]:
#dense_matrix = csr_matrix.todense(text_csr)

In [5]:
#print dense_matrix.shape


In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
idft = TfidfTransformer(norm=None)
idfmatrix = idft.fit_transform(text_csr)

In [7]:
denseidf = csr_matrix.todense(idfmatrix)

In [8]:
print denseidf.shape

(8580, 126355)


In [9]:
#from sklearn.decomposition import TruncatedSVD
#svd = TruncatedSVD(n_components=4000)
#reducedm = svd.fit_transform(idfmatrix)

In [10]:
#print(svd.explained_variance_ratio_)
#print(svd.explained_variance_ratio_.sum())

In [11]:
#from sklearn.externals import joblib
#fileObject = open('../pickle/idfmatrix.pickle','wb')
#joblib.dump(idfmatrix, fileObject)
#fileObject.close()

In [12]:
#read code
#fileObject = open('../pickle/idfmatrix.pickle','rb')
#idfmatrix = joblib.load(fileObject)
#fileObject.close()

In [13]:
#from sklearn.externals import joblib
#fileObject = open('../pickle/actualreducedMwithoutnormalisation.pickle','wb')
#joblib.dump(reducedm, fileObject)
#fileObject.close()

In [14]:
#readcode
#fileObject = open('../pickle/aactualreducedMwithoutnormalisation.pickle','rb')
#reducedm = joblib.load(fileObject)
#fileObject.close()

In [15]:
from sklearn.preprocessing import normalize
#normalizedReduced=normalize(reducedm, norm='l2')

In [16]:
normalizedwithoutReducing = normalize(idfmatrix, norm='l2')

In [17]:
densenormalizedwithoutReducing = csr_matrix.todense(normalizedwithoutReducing)

In [18]:
print densenormalizedwithoutReducing.shape

(8580, 126355)


In [19]:
densenormalizedwithoutReducing = np.asarray(densenormalizedwithoutReducing)
denseidf = np.asarray(denseidf)

In [20]:
print type (densenormalizedwithoutReducing)
print type (denseidf)

<type 'numpy.ndarray'>
<type 'numpy.ndarray'>


In [21]:
print densenormalizedwithoutReducing[0]
print denseidf[0]

[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]


In [22]:
def k2_means(denseidf,densenormalizedwithoutReducing,centroid1=None,centroid2=None,iter=20):
    if type(centroid1)!=np.ndarray or type(centroid2)!=np.ndarray:
        #print "here"
        cent1=0
        cent2=0
        num = denseidf.shape[0]-1
        while (cent1==cent2):
            cent1 = rm.randint(0, num)
            cent2 = rm.randint(0, num)
            #print normalizedwithoutReducing[cent1],cent2
        centroid1 = np.array(densenormalizedwithoutReducing[cent1])
        centroid2 = np.array(densenormalizedwithoutReducing[cent2])
        centroidarray = np.append([centroid1],[centroid2], axis=0)
        #print centroid1.shape
        
    else:
        centroidarray = np.append([centroid1],[centroid2], axis=0)
        #print centroidarray.shape
        
    centroidcosineArray = densenormalizedwithoutReducing.dot(centroidarray.T)
    
    i=0
    cluster=[]
    newcentroid1_points = 0
    newcentroid2_points = 0
    newcentroid1_sum = np.zeros(shape=[1,126355])
    newcentroid2_sum = np.zeros(shape=[1,126355])
    newcentroid1_mean = np.zeros(shape=[1,126355])
    newcentroid2_mean = np.zeros(shape=[1,126355])
    newcentroid1_mean_norm = np.zeros(shape=[1,126355])
    newcentroid2_mean_norm = np.zeros(shape=[1,126355])
    
    for item in centroidcosineArray:
        if item[0]>item[1]:
            newcentroid1_points+=1
            newcentroid1_sum = newcentroid1_sum+denseidf[i]
            cluster.append(1)
        else:
            newcentroid2_points+=1
            newcentroid2_sum = newcentroid2_sum+denseidf[i]
            cluster.append(2)
        i+=1
    #print newcentroid1_points,newcentroid2_points
    
    newcentroid1_mean = newcentroid1_sum/newcentroid1_points
    newcentroid2_mean = newcentroid2_sum/newcentroid2_points
    
    newcentroid1_mean_norm=normalize(newcentroid1_mean, norm='l2')
    newcentroid2_mean_norm=normalize(newcentroid2_mean, norm='l2')
    
    #print newcentroid1_mean[0],centroid1
    
    comp1 = newcentroid1_mean_norm[0].dot(centroid1.T)
    comp2 = newcentroid2_mean_norm[0].dot(centroid2.T)
    
    if (comp1>=0.99 and comp2>=0.99) or iter==0:
        #print cluster
        print iter,comp1,comp2,newcentroid1_points,newcentroid2_points
        return cluster,newcentroid1_mean_norm,newcentroid2_mean_norm
    else:
        print iter,comp1,comp2,newcentroid1_points,newcentroid2_points
        iter-=1
        return k2_means(denseidf,densenormalizedwithoutReducing,newcentroid1_mean_norm[0],newcentroid2_mean_norm[0],iter)
     

In [81]:
clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(denseidf,densenormalizedwithoutReducing)

20 0.184268344012 0.330019064707 4012 4568
19 0.976912398121 0.98243339514 3990 4590
18 0.995952095506 0.997984931352 3614 4966


In [147]:
print densenormalizedwithoutReducing[0]

[ 0.  0.  0. ...,  0.  0.  0.]


In [82]:
from collections import defaultdict
memoryrowsdict={}
scores=[]
flag=0
odd=1
even=0
sent0=0
sentlist=[]
sentlist1 = defaultdict(list)
positionofsent=0
for kbisect in range(7):
    print "k",kbisect
    list1_for_memlist=[]
    list2_for_memlist=[]
    list2=[]
    list22=[]
    list1=[]
    list11=[]
    sum1=0
    count1=0
    sum2=0
    count2=0
    i=0
    for items in clusterresult:
            if items==1:
                count1+=1
                sum1 = sum1+(densenormalizedwithoutReducing[i].dot(newcentroid1_mean_norm.T))
                list1.append(densenormalizedwithoutReducing[i])
                list11.append(denseidf[i])
                if kbisect==0:
                    list1_for_memlist.append(i)
                else:
                    if flag==1:
                        list1_for_memlist.append(memoryrowsdict[positionofsent][0][i])
                    else:
                        list1_for_memlist.append(memoryrowsdict[positionofsent][3][i])

            else:
                count2+=1
                sum2 = sum2+densenormalizedwithoutReducing[i].dot(newcentroid2_mean_norm.T)
                list2.append(densenormalizedwithoutReducing[i])
                list22.append(denseidf[i])
                if kbisect==0:
                    list2_for_memlist.append(i)
                else:
                    if flag==1:
                        list2_for_memlist.append(memoryrowsdict[positionofsent][0][i])
                    else:
                        list2_for_memlist.append(memoryrowsdict[positionofsent][3][i])
            i+=1
    print len(list1_for_memlist),len(list2_for_memlist)
    avg1 = sum1/count1
    avg2 = sum2/count2
    print avg1,avg2
    memoryrowsdict[kbisect]=[list2_for_memlist,list22,list2,list1_for_memlist,list11,list1]
    #print memoryrowsdict[0][0][0],memoryrowsdict[0][1][0],memoryrowsdict[0][2][0],memoryrowsdict[0][3][0],memoryrowsdict[0][4][0],memoryrowsdict[0][5][0]
    if avg1<avg2:
        scores.append(avg2)
        index_min = min(xrange(len(scores)), key=scores.__getitem__)
        print scores
        print len(scores)
        print "i",index_min
        positionofsent=index_min
        if(index_min+1)==len(scores):
            flag=0
            if kbisect!=0:
                scores[index_min]=1
            else:
                sent0=0
            sentlist1[positionofsent].append(flag)
            sentlist.append(odd)
            a=np.asarray(memoryrowsdict[index_min][4])
            b=np.asarray(memoryrowsdict[index_min][5])
            #print a.shape,b.shape
            #print a
            print "here1"
            clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(a,b)
            #print clusterresult
            odd+=2
                        
        else:
            scores[index_min]=1
            sentlist.append(even)
            flag=1
            sentlist1[positionofsent].append(flag)
            if sentlist1[positionofsent][0]==0:
                flag=1
                sentlist1[positionofsent].append(flag)
                a=np.asarray(memoryrowsdict[index_min][1])
                b=np.asarray(memoryrowsdict[index_min][2])
            else:
                flag=0
                sentlist1[positionofsent].append(flag)
                a=np.asarray(memoryrowsdict[index_min][4])
                b=np.asarray(memoryrowsdict[index_min][5])
            #print a.shape,b.shape
            #print a,b
            print "here2"
            clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(a,b)
            #print clusterresult
            even+=2
                        
    else:
        scores.append(avg1)
        index_min = min(xrange(len(scores)), key=scores.__getitem__)
        positionofsent=index_min
        print scores
        print len(scores)
        print "i",index_min
        if(index_min+1)==len(scores):
            flag=1
            if kbisect!=0:
                scores[index_min]=1
            else:
                 sent0 = 1
            sentlist1[positionofsent].append(flag)
            sentlist.append(even)
            a=np.asarray(memoryrowsdict[index_min][1])
            b=np.asarray(memoryrowsdict[index_min][2])
            #print a.shape,b.shape
            #print a,b
            print "here3"
            clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(a,b)
            #print clusterresult
            even+=2
                        
        else:
            flag=0
            sentlist1[positionofsent].append(flag)
            scores[index_min]=1
            if sentlist1[positionofsent][0]==0:
                flag=1
                sentlist1[positionofsent].append(flag)
                a=np.asarray(memoryrowsdict[index_min][1])
                b=np.asarray(memoryrowsdict[index_min][2])
            else:
                flag=0
                sentlist1[positionofsent].append(flag)
                a=np.asarray(memoryrowsdict[index_min][4])
                b=np.asarray(memoryrowsdict[index_min][5])
            #print a.shape,b.shape
            #print a,b
            print "here4"
            clusterresult,newcentroid1_mean_norm,newcentroid2_mean_norm = k2_means(a,b)
            #print clusterresult
            odd+=2
                        

        

    

k 0
3614 4966
[ 0.23551023] [ 0.20385361]
[array([ 0.23551023])]
1
i 0
here3
20 0.231810633505 0.259862180527 3522 1444
19 0.972535364845 0.964279168022 2669 2297
18 0.990715709092 0.992057088227 2308 2658
k 1
2308 2658
[ 0.13088851] [ 0.14586426]
[array([ 0.23551023]), array([ 0.14586426])]
2
i 1
here1
20 0.120862281866 0.175383113218 1664 644
19 0.983320934 0.952572366666 1374 934
18 0.996713150486 0.992758704829 1251 1057
k 2
1251 1057
[ 0.24410129] [ 0.20026022]
[array([ 0.23551023]), 1, array([ 0.24410129])]
3
i 0
here4
20 0.21953895066 0.175238964856 3307 307
19 0.994121470234 0.901897655482 2985 629
18 0.999270126286 0.985462692784 2920 694
17 0.999323695388 0.990329135688 2797 817
k 3
2797 817
[ 0.24935091] [ 0.15274675]
[1, 1, array([ 0.24410129]), array([ 0.24935091])]
4
i 2


IndexError: list index out of range

In [83]:
print len(memoryrowsdict)

4


In [84]:
print sentlist1

defaultdict(<type 'list'>, {0: [1, 0], 1: [0], 2: []})


In [80]:
clusters_7=[]
count_7 = 1
for clus in range(1,7):
    if clus in sentlist1.keys():
        #if clus!=0:
        if sentlist1[clus]==1:
            clusters_7.append(memoryrowsdict[clus][3])
            print len((memoryrowsdict[clus][3]))
        else:
            clusters_7.append(memoryrowsdict[clus][0])
            print len((memoryrowsdict[clus][0]))
        #count_7+=1
            #if count_7==8:
                #break
    else:
        '''if clus==0 and sent==1:
            clusters_7.append(memoryrowsdict[clus][3])
            print len((memoryrowsdict[clus][3]))
            count_7+=1
            if count_7==8:
                break
        elif clus==0 and sent==0: 
            clusters_7.append(memoryrowsdict[clus][0])
            print len((memoryrowsdict[clus][0]))
            count_7+=1
            if count_7==8:
                break
        else:'''
        clusters_7.append(memoryrowsdict[clus][0])
        print len((memoryrowsdict[clus][0]))
        #count_7+=1
        #if count_7==8:
            #break
        clusters_7.append(memoryrowsdict[clus][3])
        print len((memoryrowsdict[clus][3]))
            #count_7+=1
            #if count_7==8:
                #break
                
clusters_7.append[(memoryrowsdict[6][3])]
clusters_7.append[(memoryrowsdict[6][0])]
print len((memoryrowsdict[6][3]))
print len((memoryrowsdict[6][0]))

3444
1544
1152
874
121
64
57


TypeError: 'builtin_function_or_method' object has no attribute '__getitem__'

TypeError: 'builtin_function_or_method' object has no attribute '__getitem__'

In [315]:
print len(clusters_7)

7


In [332]:
clusterlabel=1
final_list=[]
for item in clusters_7:
    for i in item:
          final_list.append((clusterlabel,i))  
    clusterlabel+=1

    

In [342]:
print len(final_list)

10600


In [306]:
sorted_by_second = sorted(final_list, key=lambda tup: tup[1])

In [None]:
print sorted_by_second

In [302]:
del sorted_by_second
del final_list