# LSI Word Embedding using K-means, Gaussian Mixture and Cosine Similarity as Clustering

In [1]:
import os.path

from gensim import corpora
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

from clustering_class import *
from new_combine_models import *
from DEC import *
from generate_word_vector import *

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Using TensorFlow backend.


In [2]:
def preprocess_data(doc_set):
    """
    Input  : docuemnt list
    Purpose: preprocess text (tokenize, removing stopwords, and stemming)
    Output : preprocessed text
    """
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # Untokenize
        joined_str = ' '.join(stemmed_tokens)
        # add tokens to list
        texts.append(joined_str)
    
    return texts

In [20]:
df = pd.read_csv('data/Russell3000_intro.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,company,intro,segment,industry,introLen,lemma,intro_lemmatized
0,0,MMM,"3M Company develops, manufactures, and markets...",Industrials,Specialty Industrial Machinery,1581,"['3m', 'company', 'develop', 'manufacture', 'a...",3m company develop manufacture and market vari...
1,1,ABT,"Abbott Laboratories discovers, develops, manuf...",Healthcare,Medical Devices,1954,"['abbott', 'laboratories', 'discover', 'develo...",abbott laboratories discover develop manufactu...
2,2,ABBV,"AbbVie Inc., a research-based biopharmaceutica...",Healthcare,Drug Manufacturers—General,1997,"['abbvie', 'inc', 'a', 'research', 'base', 'bi...",abbvie inc a research base biopharmaceutical c...
3,3,ABMD,"Abiomed, Inc. engages in the research, develop...",Healthcare,Medical Devices,1182,"['abiomed', 'inc', 'engage', 'in', 'the', 'res...",abiomed inc engage in the research development...
4,4,ACN,"Accenture plc provides consulting, technology,...",Technology,Information Technology Services,1989,"['accenture', 'plc', 'provide', 'consult', 'te...",accenture plc provide consult technology and o...


In [5]:
df.shape

(2896, 8)

In [6]:
df.iloc[1].intro

"Abbott Laboratories discovers, develops, manufactures, and sells health care products worldwide. Its Established Pharmaceutical Products segment offers branded generic pharmaceuticals for the treatment of pancreatic exocrine insufficiency; irritable bowel syndrome or biliary spasm; intrahepatic cholestasis or depressive symptom; gynecological disorder; hormone replacement therapy; dyslipidemia; hypertension; hypothyroidism; Ménière's disease and vestibular vertigo; pain, fever, and inflammation; migraine; and anti-infective clarithromycin, as well as provides influenza vaccines and products that regulate physiological rhythm of the colon. The company's Diagnostic Products segment offers core laboratory systems in the areas of immunoassay, clinical chemistry, hematology, and transfusion; molecular diagnostics systems that automates the extraction, purification, and preparation of DNA and RNA from patient samples, as well as detects and measures infectious agents; cartridges for blood a

In [7]:
doc_list = df['intro']

##### Pre Processing

In [8]:
preped_doc_list = preprocess_data(doc_list)

##### Tdidf vectorization

In [9]:
vectorizer = TfidfVectorizer(stop_words='english', max_features= None, max_df = 0.5, smooth_idf=True)
dtm = vectorizer.fit_transform(preped_doc_list) # dtm - Document Matrix (sparse matrix)
dtm.shape

(2896, 20025)

- 20025 unique words (features)

##### LSA Word Embedding

In [10]:
# Test
lsa = TruncatedSVD(200, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(dtm)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa) # normalisae so each vector is len 1

In [11]:
pd.DataFrame(dtm_lsa).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.258696,0.343025,0.038922,-0.082137,-0.124406,0.21029,0.063098,0.021308,0.132296,-0.168311,...,0.004256,0.095316,-0.015159,-0.003632,-0.043507,-0.015361,-0.006947,-0.035092,0.001768,-0.044589
1,0.139145,0.281081,0.313787,-0.000777,-0.062711,0.039652,-0.014875,-0.047445,0.079307,-0.149286,...,0.025904,0.000726,-0.047903,-0.013403,0.007489,-0.031114,-0.033641,0.047959,-0.088865,0.007064
2,0.074533,0.159242,0.523024,0.180128,-0.020108,0.031197,-0.007244,-0.012393,0.025801,0.007321,...,-0.0405,-0.05575,0.126388,-0.041882,-0.000613,-0.007205,-0.039796,0.001773,-0.014341,-0.056609
3,0.081596,0.196524,0.112854,-0.003735,-0.075644,0.007277,0.031386,-0.06344,-0.006427,-0.129971,...,-0.163261,-0.077003,0.038435,0.031072,-0.005973,0.04434,-0.062861,0.036373,-0.032168,-0.075726
4,0.331573,0.416709,0.028846,-0.183862,-0.001686,-0.053099,-0.124019,-0.041981,0.095736,0.009783,...,0.028951,-0.003283,0.018637,0.025221,-0.078516,0.01486,-0.056209,0.04874,0.087338,-0.023218


# Clustering Algos

In [12]:
NUM_INDUSTRIES = 69

### K - Means

In [13]:
# defining the kmeans function with initialization as k-means++
kmeans = KMeans(n_clusters=NUM_INDUSTRIES, init='k-means++')
# fitting the k means algorithm on scaled data
kmeans.fit(dtm_lsa)
pred = kmeans.predict(dtm_lsa)
# pd.concat([df[['company']], pd.DataFrame(pred)], axis=1).head()
pred

array([26, 53, 34, ...,  1, 66, 38])

In [14]:
def kmeans_cluster(dtm):
    '''
    input: Document Matrix
    output: list indicating clusters
    '''
    kmeans = KMeans(n_clusters=NUM_INDUSTRIES, init='k-means++')
    kmeans.fit(dtm)
    pred = kmeans.predict(dtm)
    return pred

### Gaussian Mixture

In [15]:
def gmm_cluster(dtm):
    '''
    input: Document Matrix
    output: list indicating clusters
    '''
    gmm = GaussianMixture(n_components=NUM_INDUSTRIES)
    gmm.fit(dtm)
    pred = gmm.predict(dtm)
    return pred

### Cosine Similarity

In [None]:
x = [((1,),(2,),3), ((4,),(1458,),6)]

mcorr = ((1458,), (1830,), 1.0000000000000004)
list(filter(lambda tup: (tup[0] not in mcorr) and (tup[1] not in mcorr), x))

In [None]:
((1,2), (4,)) in (1,)

In [None]:
def cosine_similarity(dtm):
    '''
    input: Doc Matrix
    output: list of lists indicating clusters
    '''
    corr = np.asarray(np.asmatrix(dtm) * np.asmatrix(dtm).T)
    L = corr.shape[0]
    
    cluster_corrs = []
    for i in range(L):
        for j in range(i+1,L):
            corr_tuple = ((i,), (j,), corr[i][j])
            cluster_corrs.append(corr_tuple)
            
    cluster_list = [(i,) for i in range(L)]
    
    num_iter = L - NUM_INDUSTRIES
    for x in range(num_iter):
        max_corr_tup = max(cluster_corrs, key = lambda tup: tup[2])
        # filters ur max_corr_tup also
        print(max_corr_tup)
        cluster_corrs = list(filter(lambda tup: (tup[0] not in max_corr_tup) and (tup[1] not in max_corr_tup),cluster_corrs))
        new_cluster = max_corr_tup[0] + max_corr_tup[1] # concatenate tups
        cluster_list.remove(max_corr_tup[0])
        cluster_list.remove(max_corr_tup[1])
        for cluster in cluster_list:
            # Similarity
            n = len(new_cluster) * len(cluster)
            total_similarity = 0
            for cpy_1 in new_cluster:
                for cpy_2 in cluster:
                    total_similarity += corr[cpy_1][cpy_2]
            similarity = total_similarity / n
            corr_tuple = (new_cluster, cluster, similarity)
            cluster_corrs.append(corr_tuple)
        cluster_list.append(new_cluster)
    
    if len(cluster_list) != NUM_INDUSTRIES:
        raise Exception()
        
    pred = np.full(N, -1)
    for i in range(NUM_INDUSTRIES):
        cluster = list(cluster_list[i])
        pred[cluster] = [i] * len(cluster)
    return pred

In [None]:
cosine_similarity(dtm_lsa)

In [None]:
dtmx = dtm_lsa
corr = np.asarray(np.asmatrix(dtmx) * np.asmatrix(dtmx).T)
L = corr.shape[0]

cluster_corrs = []
for i in range(L):
    for j in range(i+1,L):
        corr_tuple = ((i,), (j,), corr[i][j])
        cluster_corrs.append(corr_tuple)

cluster_list = [(i,) for i in range(L)]

num_iter = L - NUM_INDUSTRIES
for x in range(num_iter):
    max_corr_tup = max(cluster_corrs, key = lambda tup: tup[2])
    # filters ur max_corr_tup also
    print(max_corr_tup)
    cluster_corrs = list(filter(lambda tup: (tup[0] not in max_corr_tup) and (tup[1] not in max_corr_tup),cluster_corrs))
    new_cluster = max_corr_tup[0] + max_corr_tup[1] # concatenate tups
    cluster_list.remove(max_corr_tup[0])
    cluster_list.remove(max_corr_tup[1])
    for cluster in cluster_list:
        # Similarity
        n = len(new_cluster) * len(cluster)
        total_similarity = 0
        for cpy_1 in new_cluster:
            for cpy_2 in cluster:
                total_similarity += corr[cpy_1][cpy_2]
        similarity = total_similarity / n
        corr_tuple = (new_cluster, cluster, similarity)
        cluster_corrs.append(corr_tuple)
    cluster_list.append(new_cluster)

if len(cluster_list) != NUM_INDUSTRIES:
    raise Exception()

pred = np.full(N, -1)
for i in range(NUM_INDUSTRIES):
    cluster = list(cluster_list[i])
    pred[cluster] = [i] * len(cluster)

In [None]:
def t(tup):
    print((tup[0] not in max_corr_tup), (tup[1] not in max_corr_tup))
    return (tup[0] not in max_corr_tup) and (tup[1] not in max_corr_tup)
list(filter(t,cluster_corrs))

### Stock Performance

In [18]:
cluster_dfs_list = [df[['company']]]
for k in range(200, 700, 100):
    print(k)
    lsa_k = TruncatedSVD(k, algorithm = 'arpack')
    dtm_lsa_k = lsa_k.fit_transform(dtm)
    dtm_lsa_k = Normalizer(copy=False).fit_transform(dtm_lsa_k)
    # K-Means
    clusters_k = kmeans_cluster(dtm_lsa_k)
    cluster_dfs_list.append(pd.DataFrame(clusters_k, columns=['lsi_{}_kmeans'.format(str(k))]))
    # GMM
    clusters_k = gmm_cluster(dtm_lsa_k)
    cluster_dfs_list.append(pd.DataFrame(clusters_k, columns=['lsi_{}_gmm'.format(str(k))]))
    # Cosine_similarity
    
all_cluster = pd.concat(all_dfs, axis=1)

200
300
400
500
600


NameError: name 'all_dfs' is not defined

In [21]:
all_cluster.head()

Unnamed: 0,company,lsi_200_kmeans,lsi_200_gmm,lsi_300_kmeans,lsi_300_gmm,lsi_400_kmeans,lsi_400_gmm,lsi_500_kmeans,lsi_500_gmm,lsi_600_kmeans,lsi_600_gmm
0,MMM,68,62,65,28,50,9,52,28,25,15
1,ABT,58,57,52,52,32,42,49,37,23,62
2,ABBV,12,64,63,53,10,8,23,9,67,21
3,ABMD,41,15,62,2,37,66,17,15,43,8
4,ACN,11,53,17,3,67,41,34,19,24,9


In [23]:
stock_data = pd.read_csv('data/price_us.csv',index_col=0)
stock_data.index.names = ['date']
stock_data.index = pd.to_datetime(stock_data.index,format='%Y%m%d')
stock_data = stock_data[stock_data.columns.intersection(all_cluster['company'].values.tolist())]

In [24]:
stock_data.head()

Unnamed: 0_level_0,A,AA,AAL,AAN,AAOI,AAON,AAP,AAPL,AAT,AAWW,...,ZION,ZIOP,ZIXI,ZNGA,ZS,ZTS,ZUMZ,ZUO,ZYNE,ZYXI
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,38.5539,34.8453,51.0799,30.087,10.79,21.0696,156.793,99.946,35.2148,48.42,...,25.7582,5.13,3.55,2.73,16.0,41.6883,38.21,14.0,14.0,0.175245
2015-01-05,37.8315,32.8266,51.0467,29.9691,10.65,20.3538,154.727,97.13,35.6161,46.65,...,24.7931,5.07,3.48,2.71,33.0,41.438,38.94,20.0,16.25,0.155652
2015-01-06,37.2422,33.0679,50.2556,28.8489,10.25,20.0319,154.618,97.139,35.8952,45.66,...,23.8462,4.96,3.39,2.7,27.9,41.0338,38.46,20.6,19.32,0.145916
2015-01-07,37.7365,33.9237,50.2272,29.7235,9.85,20.2145,157.94,98.501,36.5756,46.48,...,24.0693,5.02,3.38,2.68,30.38,41.8808,40.28,20.6,24.54,0.142995
2015-01-08,38.8676,34.8892,50.843,30.2049,9.96,20.7141,159.325,102.286,36.7937,48.21,...,24.4107,4.99,3.64,2.58,31.08,42.5257,41.4,19.55,35.14,0.141048


In [25]:
stock_data = stock_data.pct_change()
stock_data=stock_data.stack()
stock_data = stock_data.reset_index()
stock_data.columns = ['date','company','return']
stock_data = stock_data[['company','return','date']]

In [26]:
stock_data.head()

Unnamed: 0,company,return,date
0,A,-0.018737,2015-01-05
1,AA,-0.057933,2015-01-05
2,AAL,-0.00065,2015-01-05
3,AAN,-0.003919,2015-01-05
4,AAOI,-0.012975,2015-01-05


### Evaluation

In [29]:
performance=performance_analysis(class_df=all_cluster,return_df=stock_data)
performance.get_statistical_describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  1, thresh=ret_comp, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  ].mean(1)


In [35]:
performance.print_table().sort_index()

summary table:


Unnamed: 0,R2,proportion of right classification,classes number
lsi_200_gmm,0.434857,0.732819,69.0
lsi_200_kmeans,0.428494,0.723001,69.0
lsi_300_gmm,0.431955,0.729663,69.0
lsi_300_kmeans,0.444555,0.717742,69.0
lsi_400_gmm,0.443627,0.719846,69.0
lsi_400_kmeans,0.438421,0.717391,69.0
lsi_500_gmm,0.433254,0.700561,69.0
lsi_500_kmeans,0.449381,0.731066,69.0
lsi_600_gmm,0.435887,0.708626,69.0
lsi_600_kmeans,0.437951,0.714236,69.0


| LSI length 200 | 0.4537 | 0.4962 |
|----------------|--------|--------|
| LSI length 300 | 0.4459 | 0.4768 |
|----------------|--------|--------|
| LSI length 400 | 0.4421 | 0.4072 |
|----------------|--------|--------|
| LSI length 500 | 0.4554 | 0.4244 |
|----------------|--------|--------|
| LSI length 600 | 0.4362 | 0.4409 |
|----------------|--------|--------|
| LSI length 700 | 0.4407 | 0.3842 |
|----------------|--------|--------|
| LSI length 800 | 0.4417 | 0.4921 |
|----------------|--------|--------|
| LSI length 900 | 0.4478 | 0.4005 |
|----------------|--------|--------|
| LSI length 1000 | 0.4403 | 0.4068 |
|----------------|--------|--------|
| LSI length 1100 | 0.4380 | 0.3933 |

| This | is   |
|------|------|
|   a  | table|

- not sure why in their code, the classes number is not 69

In [None]:
performance.plot_industry_dense()

In [31]:
ratio_df = pd.read_csv('data/russell3000_ratios.csv')
def multiplier(num):
    out=np.nan
    if num[-1]=='B':
        out=float(num[:-1])*1000
    elif num[-1]=='T':
        out=float(num[:-1])*1000*1000
    elif num[-1]=='M':
        out=float(num[:-1])
    return out
ratio_df['mkt_cap']=ratio_df['mkt_cap'].map(multiplier)
ratio_df['beta'] = pd.to_numeric(ratio_df['beta'],errors='coerce')
ratio_df['profit_m']=ratio_df['profit_m'].map(lambda x: float(x.strip('%').replace(',',''))/100 if not pd.isnull(x) else np.nan)
ratio_df['roa']=ratio_df['roa'].map(lambda x: float(x.strip('%').replace(',',''))/100 if not pd.isnull(x) else np.nan)
ratio_df['roe']=ratio_df['roe'].map(lambda x: float(x.strip('%').replace(',',''))/100 if not pd.isnull(x) else np.nan)
charic_names=list(ratio_df.columns[2:])
ratio_df.dropna(inplace=True)

In [32]:
cluster_names=list(all_cluster.columns[1:])
all_df=all_cluster.merge(ratio_df,how='inner',on='company')

In [33]:
eval_df=pd.DataFrame(columns=['clustering']+charic_names)
total_comp=all_df.shape[0]
charic_total=[]
for r in range(len(charic_names)):
    charic_total.append(ratio_df['mkt_cap'].dot(ratio_df.iloc[:,r+2]))
charic_total=charic_total/ratio_df['mkt_cap'].sum()
cl_results=[]
for cl in range(len(cluster_names)):
    cl_res=[cluster_names[cl]]
    for r in range(len(charic_names)):
        clr_df=all_df.loc[:,[cluster_names[cl],charic_names[r],'mkt_cap']]
        clr_df['sumpro']=clr_df[charic_names[r]]*clr_df['mkt_cap']
        clr_df=clr_df.groupby(cluster_names[cl]).agg({charic_names[r]:'count','mkt_cap':'sum','sumpro':'sum'})
        clr_df['sigma']=clr_df[charic_names[r]]/np.sum(clr_df[charic_names[r]])*(clr_df['sumpro']/clr_df['mkt_cap']-charic_total[r])**2
        clr=np.sqrt(clr_df['sigma'].sum())
        cl_res.append(clr)
    cl_results.append(cl_res)
eval_df=eval_df.append(pd.DataFrame(cl_results,columns=eval_df.columns))
eval_df['average']=eval_df.mean(1)
eval_df.set_index('clustering',inplace=True)
eval_df=eval_df.sort_values('average', ascending=False)
print('Market value weighted:')
print(eval_df)

Market value weighted:
                 pb_ratio      beta  profit_m       roa       roe   average
clustering                                                                 
lsi_400_gmm     10.640692  0.338565  0.144305  0.052437  1.808450  2.596890
lsi_500_kmeans   9.826530  0.300953  0.128645  0.048725  2.046703  2.470311
lsi_200_gmm     10.232821  0.315177  0.146739  0.060368  1.020367  2.355094
lsi_300_kmeans   8.713562  0.313041  0.131408  0.052699  2.498582  2.341858
lsi_600_gmm      9.329555  0.288173  0.131433  0.053440  0.873638  2.135248
lsi_200_kmeans   7.531815  0.301557  0.128854  0.062301  1.811218  1.967149
lsi_500_gmm      6.879473  0.311960  0.141205  0.061411  1.814388  1.841687
lsi_600_kmeans   6.865131  0.321986  0.152246  0.059960  1.803690  1.840603
lsi_300_gmm      6.714773  0.311362  0.154679  0.044518  1.786671  1.802401
lsi_400_kmeans   6.307048  0.308758  0.130515  0.045149  2.171495  1.792593


In [34]:
eval_df=pd.DataFrame(columns=['clustering']+charic_names)
total_comp=all_df.shape[0]
charic_total=[]
for r in range(len(charic_names)):
    charic_total.append(ratio_df.iloc[:,r+2].mean())
cl_results=[]
for cl in range(len(cluster_names)):
    cl_res=[cluster_names[cl]]
    for r in range(len(charic_names)):
        clr_df=all_df.loc[:,[cluster_names[cl],charic_names[r]]]
        clr_df=clr_df.groupby(cluster_names[cl]).agg(['count','mean'])
        clr_df=clr_df[clr_df[(charic_names[r],'count')]>=5]
        clr_df['sigma']=clr_df[(charic_names[r],'count')]/np.sum(clr_df[(charic_names[r],'count')])*(clr_df[(charic_names[r],'mean')]-charic_total[r])**2
        clr=np.sqrt(clr_df['sigma'].sum())
        cl_res.append(clr)
    cl_results.append(cl_res)
eval_df=eval_df.append(pd.DataFrame(cl_results,columns=eval_df.columns))
eval_df['average']=eval_df.mean(1)
eval_df.set_index('clustering',inplace=True)
eval_df=eval_df.sort_values('average', ascending=False)
print('Equal value weighted:')
print(eval_df)

Equal value weighted:
                pb_ratio      beta  profit_m       roa       roe   average
clustering                                                                
lsi_500_kmeans  3.733019  0.411994  0.176020  0.073677  3.165244  1.511991
lsi_400_kmeans  4.306670  0.411335  0.295937  0.070271  2.399707  1.496784
lsi_600_kmeans  4.672141  0.407858  0.208312  0.074003  2.060186  1.484500
lsi_300_kmeans  4.003964  0.400623  0.188819  0.071592  2.112957  1.355591
lsi_300_gmm     3.665035  0.401511  0.264355  0.072710  2.351725  1.351067
lsi_600_gmm     4.125774  0.394775  0.246937  0.072379  1.756900  1.319353
lsi_500_gmm     3.674184  0.414296  0.314009  0.073091  2.068266  1.308769
lsi_200_kmeans  3.992022  0.380000  0.236780  0.073897  1.803764  1.297293
lsi_200_gmm     3.827782  0.401900  0.282234  0.074704  1.834449  1.284214
lsi_400_gmm     3.816172  0.412976  0.193261  0.073064  1.740735  1.247242


### Extra

In [None]:
# Plotting for 2D
%pylab inline

xs = [w[0] for w in dtm_lsa]
ys = [w[1] for w in dtm_lsa]
xs, ys

figure()
plt.scatter(xs,ys)
xlabel('First principal component')
ylabel('Second principal component')
title('Plot of points against LSA principal components')

In [None]:
def cosine_similarity(dtm):
    '''
    input: Doc Matrix
    output: list of lists indicating clusters
    '''
    corr = np.asarray(numpy.asmatrix(dtm_lsa) * numpy.asmatrix(dtm_lsa).T)
    
    L = dtm.shape[0]
    clusters = [[i] for i in range(L)]
    
    while (len(clusters) != NUM_INDUSTRIES):
        L = len(clusters)
        max_similarity = -1
        max_similarity_clusters = (None, None)
        for i in range(L):
            for j in range(i+1,L):
                cluster_1 = clusters[i]
                cluster_2 = clusters[j]
                n = len(cluster_1) * len(cluster_2)
                
                # Similarity
                total_similarity = 0
                for cpy_1 in cluster_1:
                    for cpy_2 in cluster_2:
                        print(cpy_1, cpy_2)
                        total_similarity += corr[cpy_1][cpy_2]
                similarity = total_similarity / n
                
                if similarity > max_similarity:
                    max_similarity = similarity 
                    max_similarity_clusters = (i, j)
        
        # Merge max_similarity clusters
        cluster_merge = clusters.pop(max_similarity_clusters[1])
        clusters[max_similarity_clusters[0]].append(cluster_merge)
        print(L)
    return clusters