In [1]:
from sklearn.base import BaseEstimator
from sklearn.utils import check_random_state
from gensim import utils, corpora, matutils, models
import gensim
from gensim.test.utils import common_texts
from gensim.models import FastText
from collections import defaultdict
import warnings ## Ignore General warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)





In [2]:
documents = ["Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user perceived response time to error measurement",
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well  ordering",
"Graph minors A survey"]
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]


frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

dictionary = corpora.Dictionary(texts)

In [3]:
corpus = [dictionary.doc2bow(text) for text in texts]
print (corpus)

[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


In [4]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
Number_of_Topics = 2
Lda = models.LdaModel(corpus_tfidf, id2word=dictionary, 
                      num_topics=Number_of_Topics,minimum_probability=0,
                      random_state=2,
                      per_word_topics=False,
                      )
corpus_lda = Lda[corpus_tfidf]
corpus_lda

<gensim.interfaces.TransformedCorpus at 0x20b112bfcf8>

In [5]:
from gensim import corpora, models, similarities
index = similarities.MatrixSimilarity(Lda[corpus_tfidf])
index

<gensim.similarities.docsim.MatrixSimilarity at 0x20b0e743da0>

In [6]:
import nltk
from nltk.cluster.kmeans import KMeansClusterer
NUM_CLUSTERS = Number_of_Topics
kclusterer = KMeansClusterer(NUM_CLUSTERS, 
                             distance=nltk.cluster.util.cosine_distance, 
                             repeats=25)

NLTK_clusters = kclusterer.cluster(index, assign_clusters=True)
NLTK_clusters

[1, 1, 1, 1, 1, 0, 0, 0, 0]

In [7]:
from sklearn import metrics
silhouette_score = metrics.silhouette_score(index, NLTK_clusters, metric='cosine')
print ("Silhouette_score  :\t",round(silhouette_score*100,2))

Silhouette_score  :	 96.17


In [8]:
Doc_num = {}
for i, x in enumerate(corpus_lda):
    Doc_num[i] = documents[i]
Doc_num

{0: 'Human machine interface for lab abc computer applications',
 1: 'A survey of user opinion of computer system response time',
 2: 'The EPS user interface management system',
 3: 'System and human system engineering testing of EPS',
 4: 'Relation of user perceived response time to error measurement',
 5: 'The generation of random binary unordered trees',
 6: 'The intersection graph of paths in trees',
 7: 'Graph minors IV Widths of trees and well  ordering',
 8: 'Graph minors A survey'}

In [9]:
lda_corpus_porb = [max(prob,key=lambda y:y[1])for prob in corpus_lda ]
lists_label     = [ x[0] for i, x in enumerate(lda_corpus_porb) ]
Max_prob        = [ x[1] for i, x in enumerate(lda_corpus_porb) ]

In [10]:
list(corpus_lda)

[[(0, 0.32789627), (1, 0.6721037)],
 [(0, 0.19731417), (1, 0.8026858)],
 [(0, 0.20721765), (1, 0.79278237)],
 [(0, 0.2448308), (1, 0.7551693)],
 [(0, 0.24677397), (1, 0.75322604)],
 [(0, 0.716223), (1, 0.283777)],
 [(0, 0.7607941), (1, 0.2392059)],
 [(0, 0.75949466), (1, 0.2405054)],
 [(0, 0.61453646), (1, 0.3854635)]]

In [11]:
lda_corpus_porb

[(1, 0.6721115),
 (1, 0.80263805),
 (1, 0.79278815),
 (1, 0.755171),
 (1, 0.7531113),
 (0, 0.71621406),
 (0, 0.76079434),
 (0, 0.7596566),
 (0, 0.6143874)]

In [12]:
import pandas as pd
Data_frame = pd.DataFrame(columns=["Doc_Num","Document"])

Data_frame["Doc_Num"]     =  [ x for x in Doc_num.keys()]
Data_frame["Document"]    =  [ x for x in Doc_num.values()]
Data_frame["SIM_Matirx"]  =  index
Data_frame["TF_IDF"]      =  [ x for x in Lda[corpus_tfidf]]
Data_frame["Corpus"]      =  [ x for x in corpus]
Data_frame["Prob_Label"]  =  lists_label
Data_frame["Max_Prob"]    =  Max_prob

Data_frame

Unnamed: 0,Doc_Num,Document,SIM_Matirx,TF_IDF,Corpus,Prob_Label,Max_Prob
0,0,Human machine interface for lab abc computer a...,"(1.0, 0.9774877, 0.98045105, 0.9901688, 0.9906...","[(0, 0.32776433), (1, 0.6722356)]","[(0, 1), (1, 1), (2, 1)]",1,0.672112
1,1,A survey of user opinion of computer system re...,"(0.9774877, 1.0, 0.9998944, 0.9973909, 0.99713...","[(0, 0.19728279), (1, 0.80271727)]","[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]",1,0.802638
2,2,The EPS user interface management system,"(0.98045105, 0.9998944, 1.0, 0.99833477, 0.998...","[(0, 0.2072326), (1, 0.7927674)]","[(2, 1), (5, 1), (7, 1), (8, 1)]",1,0.792788
3,3,System and human system engineering testing of...,"(0.9901688, 0.9973909, 0.99833477, 0.99999994,...","[(0, 0.24476586), (1, 0.7552341)]","[(1, 1), (5, 2), (8, 1)]",1,0.755171
4,4,Relation of user perceived response time to er...,"(0.9906407, 0.99713856, 0.998132, 0.9999941, 0...","[(0, 0.24678937), (1, 0.75321066)]","[(3, 1), (6, 1), (7, 1)]",1,0.753111
5,5,The generation of random binary unordered trees,"(0.7387284, 0.57988816, 0.5916679, 0.63718784,...","[(0, 0.7162525), (1, 0.2837475)]","[(9, 1)]",0,0.716214
6,6,The intersection graph of paths in trees,"(0.6877785, 0.51913106, 0.53149855, 0.5794767,...","[(0, 0.7607017), (1, 0.23929833)]","[(9, 1), (10, 1)]",0,0.760794
7,7,Graph minors IV Widths of trees and well orde...,"(0.6890627, 0.5206436, 0.53299755, 0.58091885,...","[(0, 0.7596756), (1, 0.24032442)]","[(9, 1), (10, 1), (11, 1)]",0,0.759657
8,8,Graph minors A survey,"(0.8489264, 0.7183032, 0.72833925, 0.7666536, ...","[(0, 0.61460483), (1, 0.38539517)]","[(4, 1), (10, 1), (11, 1)]",0,0.614387


In [13]:
Data_frame['Rank_Prob'] = Data_frame.groupby('Prob_Label')['Max_Prob'].rank(ascending=False)


In [14]:
Data_frame.sort_values(by=['Prob_Label',"Rank_Prob"], ascending=[True,True],inplace=True)

In [15]:
from scipy.spatial.distance import cosine

def Cos_SIM_Funct(X,Y):  
    return round((1 - cosine(X,Y))*100,4)



In [16]:
#Update_Max_Prob['NewCol'] = Update_Max_Prob.apply(lambda x: Cosine_Distance_Cal(x['Cos_Cent'], x['SIM_Matirx']), axis=1)

In [17]:
def Cosine_mean_Sim(Cls_value):
    Temp_df = Data_frame.loc[Data_frame.Prob_Label == int(Cls_value)]
    Select_RanK_1 = Temp_df["SIM_Matirx"].values[0]
    Value_cos_dis = []
    
    for x in Temp_df.SIM_Matirx:
        Value_cos_dis.append(Cos_SIM_Funct(Select_RanK_1,x))
        print (Select_RanK_1)
        print (x)
        print ("~"*10)
        
    Temp_df["Cos_Simlarity"]  = Value_cos_dis
    print (Value_cos_dis)
    return Temp_df
    
        
    

In [18]:
blank_df = pd.DataFrame(columns=Data_frame.columns) 
for cls_ in range(Number_of_Topics):
    blank_df = pd.concat([blank_df,Cosine_mean_Sim(Cls_value=cls_)])
    

(0.6877785, 0.51913106, 0.53149855, 0.5794767, 0.58225644, 0.99735427, 1.0, 0.9999984, 0.9675303)
(0.6877785, 0.51913106, 0.53149855, 0.5794767, 0.58225644, 0.99735427, 1.0, 0.9999984, 0.9675303)
~~~~~~~~~~
(0.6877785, 0.51913106, 0.53149855, 0.5794767, 0.58225644, 0.99735427, 1.0, 0.9999984, 0.9675303)
(0.6890627, 0.5206436, 0.53299755, 0.58091885, 0.5836951, 0.99748147, 0.9999984, 1.0, 0.9679764)
~~~~~~~~~~
(0.6877785, 0.51913106, 0.53149855, 0.5794767, 0.58225644, 0.99735427, 1.0, 0.9999984, 0.9675303)
(0.7387284, 0.57988816, 0.5916679, 0.63718784, 0.63981605, 1.0, 0.99735427, 0.99748147, 0.98334414)
~~~~~~~~~~
(0.6877785, 0.51913106, 0.53149855, 0.5794767, 0.58225644, 0.99735427, 1.0, 0.9999984, 0.9675303)
(0.8489264, 0.7183032, 0.72833925, 0.7666536, 0.76884174, 0.98334414, 0.9675303, 0.9679764, 1.0)
~~~~~~~~~~
[100.0, 99.9999, 99.9069, 98.9995]
(0.9774877, 1.0, 0.9998944, 0.9973909, 0.99713856, 0.57988816, 0.51913106, 0.5206436, 0.7183032)
(0.9774877, 1.0, 0.9998944, 0.9973909, 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [19]:
blank_df

Unnamed: 0,Corpus,Cos_Simlarity,Doc_Num,Document,Max_Prob,Prob_Label,Rank_Prob,SIM_Matirx,TF_IDF
6,"[(9, 1), (10, 1)]",100.0,6,The intersection graph of paths in trees,0.760794,0,1.0,"(0.6877785, 0.51913106, 0.53149855, 0.5794767,...","[(0, 0.7607017), (1, 0.23929833)]"
7,"[(9, 1), (10, 1), (11, 1)]",99.9999,7,Graph minors IV Widths of trees and well orde...,0.759657,0,2.0,"(0.6890627, 0.5206436, 0.53299755, 0.58091885,...","[(0, 0.7596756), (1, 0.24032442)]"
5,"[(9, 1)]",99.9069,5,The generation of random binary unordered trees,0.716214,0,3.0,"(0.7387284, 0.57988816, 0.5916679, 0.63718784,...","[(0, 0.7162525), (1, 0.2837475)]"
8,"[(4, 1), (10, 1), (11, 1)]",98.9995,8,Graph minors A survey,0.614387,0,4.0,"(0.8489264, 0.7183032, 0.72833925, 0.7666536, ...","[(0, 0.61460483), (1, 0.38539517)]"
1,"[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]",100.0,1,A survey of user opinion of computer system re...,0.802638,1,1.0,"(0.9774877, 1.0, 0.9998944, 0.9973909, 0.99713...","[(0, 0.19728279), (1, 0.80271727)]"
2,"[(2, 1), (5, 1), (7, 1), (8, 1)]",99.9969,2,The EPS user interface management system,0.792788,1,2.0,"(0.98045105, 0.9998944, 1.0, 0.99833477, 0.998...","[(0, 0.2072326), (1, 0.7927674)]"
3,"[(1, 1), (5, 2), (8, 1)]",99.9262,3,System and human system engineering testing of...,0.755171,1,3.0,"(0.9901688, 0.9973909, 0.99833477, 0.99999994,...","[(0, 0.24476586), (1, 0.7552341)]"
4,"[(3, 1), (6, 1), (7, 1)]",99.9192,4,Relation of user perceived response time to er...,0.753111,1,4.0,"(0.9906407, 0.99713856, 0.998132, 0.9999941, 0...","[(0, 0.24678937), (1, 0.75321066)]"
0,"[(0, 1), (1, 1), (2, 1)]",99.4085,0,Human machine interface for lab abc computer a...,0.672112,1,5.0,"(1.0, 0.9774877, 0.98045105, 0.9901688, 0.9906...","[(0, 0.32776433), (1, 0.6722356)]"


In [20]:
def return_pos(mean_,limit_pos):
    #Index_STD = []
    Pos_1 = (list(limit_pos.values())[0])
    Pos_2 = (list(limit_pos.values())[1])
    Pos_3 = (list(limit_pos.values())[2])
    Pos_4 = (list(limit_pos.values())[3])
    Pos_5 = (list(limit_pos.values())[4])
    Pos_6 = (list(limit_pos.values())[5])
    Pos_7 = (list(limit_pos.values())[6])
    Pos_8 = (list(limit_pos.values())[7])
    Pos_9 = (list(limit_pos.values())[8])
    Pos_10 = (list(limit_pos.values())[9])
 
    if Pos_1 <= mean_ <= Pos_2:
        Index_STD =(list(limit_pos.keys())[0])   
    elif Pos_2 <= mean_ <= Pos_3:
        Index_STD = (list(limit_pos.keys())[1])
    elif Pos_3 <= mean_ <= Pos_4:
        Index_STD =(list(limit_pos.keys())[2])
    elif Pos_4 <= mean_ <= Pos_5:
        Index_STD =(list(limit_pos.keys())[3])
    elif Pos_5 <= mean_ <= Pos_6:
        Index_STD =(list(limit_pos.keys())[4])
    elif Pos_6 <= mean_ <= Pos_7:
        Index_STD =(list(limit_pos.keys())[5])
    elif Pos_7 <= mean_ <= Pos_8:
        Index_STD =(list(limit_pos.keys())[6])
    elif Pos_8 <= mean_ <= Pos_9:
        Index_STD =(list(limit_pos.keys())[7])
    elif Pos_9 <= mean_ <= Pos_10:
        Index_STD =(list(limit_pos.keys())[8])
    else:
        Index_STD =(2.50)        
    return Index_STD

def return_neg(mean_,limit_pos):
    #Index_STD = []
    Pos_1 = (list(limit_pos.values())[0])
    Pos_2 = (list(limit_pos.values())[1])
    Pos_3 = (list(limit_pos.values())[2])
    Pos_4 = (list(limit_pos.values())[3])
    Pos_5 = (list(limit_pos.values())[4])
    Pos_6 = (list(limit_pos.values())[5])
    Pos_7 = (list(limit_pos.values())[6])
    Pos_8 = (list(limit_pos.values())[7])
    Pos_9 = (list(limit_pos.values())[8])
    Pos_10 = (list(limit_pos.values())[9])
    if Pos_1 >= mean_ >= Pos_2:
        Index_STD =(list(limit_pos.keys())[0])
    elif Pos_2 >= mean_ >= Pos_3:
        Index_STD = (list(limit_pos.keys())[1])
    elif Pos_3 >= mean_ >= Pos_4:
        Index_STD =(list(limit_pos.keys())[2])
    elif Pos_4 >= mean_ >= Pos_5:
        Index_STD =(list(limit_pos.keys())[3])
    elif Pos_5 >= mean_ >= Pos_6:
        Index_STD =(list(limit_pos.keys())[4])
    elif Pos_6 >= mean_ >= Pos_7:
        Index_STD =(list(limit_pos.keys())[5])
    elif Pos_7 >= mean_ >= Pos_8:
        Index_STD =(list(limit_pos.keys())[6])
    elif Pos_8 >= mean_ >= Pos_9:
        Index_STD =(list(limit_pos.keys())[7])
    elif Pos_9 >= mean_ >= Pos_10:
        Index_STD =(list(limit_pos.keys())[8])
    else:
        Index_STD =(2.50)
    return Index_STD

In [21]:
import numpy as np
def STD_Sampling_Data_JD_Sep_5(Df,dbname):
    MAD = Df.Cos_Simlarity.std(ddof=0)
    Place_Value = np.where(Df.Cos_Simlarity == float(100.0000))
    print (Place_Value)
    Best_Dist   = Df.Cos_Simlarity.iloc[Place_Value[0]]
    if len(Best_Dist)==1:
        Best_Dist = float(Best_Dist)
    else:
        Best_Dist = 0
    print ("Best Distance",Best_Dist)
    
    Threhold_List = [0,0.25,0.50,0.75,1.0,1.25,1.50,1.75,2.0,2.25]
    Positive_Limit = [round(Best_Dist + (thresh * MAD),6) for thresh in Threhold_List]
    
    Dict_Positive_limit = dict(zip(Threhold_List, Positive_Limit))
    
    Negative_Limit = [round(Best_Dist - (thresh * MAD),6) for thresh in Threhold_List]
    
    Dict_Negative_Limit = dict(zip(Threhold_List, Negative_Limit))
    
    Df["Pos"] =Df.Cos_Simlarity.apply(lambda x: return_pos(x,Dict_Positive_limit))
    Df["Neg"] =Df.Cos_Simlarity.apply(lambda x: return_neg(x,Dict_Negative_Limit))
    
    
    
    List_Get = [x if x <=y  and x <= 2.50 else y for x,y in zip(Df["Pos"],Df["Neg"])]
    Df["Pos"]
    Df["Neg"]
    Df["Response"] = List_Get
#    Df.MATRIX = Df["MATRIX"].astype(str)
    Df.SIM_Matirx = Df["SIM_Matirx"].astype(str)
    Df.TF_IDF = Df["TF_IDF"].astype(str)
    print (Df)
    print (Df.info())
    Df.to_excel(str("Df")+str(".xlsx"))
    dbname = str(dbname)
    collection_name = mydb[dbname]
    collection_name.insert_many(Df.to_dict('records'))

In [22]:
import pandas as pd # Pandas 
import numpy as np # Numpy
from pymongo import MongoClient # MongoDB
client = MongoClient() # class of mongoDb
client = MongoClient('localhost', 27017) # connection port
mydb = client['New_Data_Email']
dbname = "LDA_Database"

def Clean_DB_First_JD(dbname):
    mydb = client['New_Data_Email']
    #print (mydb.collection_names())
    
    collection_name = mydb[dbname]
    print (collection_name.name)
    
    #print (collection_name.count())
    collection_name.remove()
    
Clean_DB_First_JD(dbname)
import os
os.chdir("D:\\1_JD_IPYTHON")
for j in range(Number_of_Topics):
    print ("Cluster Number is Processed :\t",j)
    Demo = blank_df.loc[blank_df.Prob_Label == j]
    STD_Sampling_Data_JD_Sep_5(Demo,dbname)
    

LDA_Database
Cluster Number is Processed :	 0
(array([0], dtype=int64),)
Best Distance 100.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = v

                       Corpus  Cos_Simlarity Doc_Num  \
6           [(9, 1), (10, 1)]       100.0000       6   
7  [(9, 1), (10, 1), (11, 1)]        99.9999       7   
5                    [(9, 1)]        99.9069       5   
8  [(4, 1), (10, 1), (11, 1)]        98.9995       8   

                                            Document  Max_Prob Prob_Label  \
6           The intersection graph of paths in trees  0.760794          0   
7  Graph minors IV Widths of trees and well  orde...  0.759657          0   
5    The generation of random binary unordered trees  0.716214          0   
8                              Graph minors A survey  0.614387          0   

   Rank_Prob                                         SIM_Matirx  \
6        1.0  (0.6877785, 0.51913106, 0.53149855, 0.5794767,...   
7        2.0  (0.6890627, 0.5206436, 0.53299755, 0.58091885,...   
5        3.0  (0.7387284, 0.57988816, 0.5916679, 0.63718784,...   
8        4.0  (0.8489264, 0.7183032, 0.72833925, 0.7666536, ...  