In [None]:
import re
import time
from pprint import pprint

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA, TruncatedSVD
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
import spacy
# Load the spaCy pipeline registered under the name 'fr'; delete_stopwords calls it to lemmatise each kept token.
# NOTE(review): the bare 'fr' shortcut is presumably not accepted by spaCy v3+, which expects a full
# package name — confirm against the installed spaCy version.
frlemma = spacy.load('fr')


def get_data(path, filename, comlumname):
    """Load one column of a JSON file as a list of strings.

    Args:
        path: Directory holding the file. May be empty (current directory) and
            may or may not end with a path separator.
        filename: Name of the JSON file inside ``path``.
        comlumname: Label of the column to extract from the loaded frame.

    Returns:
        list[str]: Every value of the selected column, converted with ``astype(str)``.
    """
    import os

    print('get data begin')
    # os.path.join adds the separator only when it is missing, so both "dir" and
    # "dir/" work; plain string concatenation silently produced "dirfile.json".
    df = pd.read_json(os.path.join(path, filename), orient='values')
    data = df[comlumname].astype(str).values.tolist()
    print('got it')
    return data


def get_stopwords():
    """Return the stop-word list used when cleaning documents.

    Returns:
        list[str]: A fresh list holding NLTK's French stop words followed by the
        extra words hard-coded below, in that order. Duplicates are not removed.
    """
    extra_words = [
        'les', 'aussi', 'comme', 'plus', 'deux', 'trois', 'très', 'cette',
        'accélérateur', 'jeune', 'pousse', 'fablab', 'entreprises', 'studio',
        'couveuse', 'incubateur', 'coworking', 'co-working',
        'entreprise', 'startup', 'start-up', 'start-ups', 'startups', 'être',
        'pépinière', 'tout', 'faire', 'incubator', 'ils', 'oui', 'elles', 'non',
    ]
    return list(stopwords.words('french')) + extra_words

def cleandata(data, stopwords):
    """Clean every document in ``data`` with ``delete_stopwords``.

    Args:
        data: Sequence of raw document strings (``len(data)`` is printed).
        stopwords: Iterable of hashable stop words to remove from each document.

    Returns:
        list[str]: One cleaned string per input document, in input order.
    """
    print('cleaning data begins')
    print('number of document', len(data))

    # delete_stopwords tests membership once per token; building the set once
    # here makes each test a hash lookup instead of a scan over the whole list.
    stopword_set = frozenset(stopwords)

    newdata = []
    for i, document in enumerate(data):
        if i % 1000 == 0:
            # progress marker every 1000 documents
            print('now in the', i)
        newdata.append(delete_stopwords(document, stopword_set))
    print('clean finish')
    return newdata

def delete_stopwords(raw, stopwords,encoding = 'utf-8'):
    """Tokenise one document, drop unwanted tokens and lemmatise the rest.

    Args:
        raw: Document text.
        stopwords: Container of lowercase words to discard (tested with ``in``).
        encoding: Unused; kept so existing callers keep working.

    Returns:
        str: The lemmas of the surviving tokens joined by single spaces.
    """
    # '.', ',', "'" and '|' (the pipe sits inside the character class) become spaces.
    no_commas = re.sub(r'[.|,|\']', ' ', raw)
    tokens = nltk.word_tokenize(no_commas)
    lowered = [tok.lower() for tok in tokens]

    # Keep alphabetic tokens longer than one character that are not stop words,
    # and replace each by the lemma of its first spaCy token.
    lemmas = [
        frlemma(tok)[0].lemma_
        for tok in lowered
        if tok not in stopwords and tok.isalpha() and len(tok) > 1
    ]

    # Compare as str. The previous check compared word.encode('utf-8') (bytes)
    # with str literals, which is never equal in Python 3, so 'nbsp' residue
    # was never filtered out.
    unwanted = ('\n', 'nbsp', '\r\n')
    return ' '.join(lemma for lemma in lemmas if lemma not in unwanted).strip()

def tfidf(corpus):
    """Compute a dense TF-IDF matrix and its vocabulary for ``corpus``.

    Args:
        corpus: Iterable of document strings.

    Returns:
        tuple: ``(weight, words)`` where ``weight`` is the dense TF-IDF array
        (one row per document) and ``words`` is the list of vocabulary terms in
        column order.
    """
    print('tfidf begin')
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()

    counts = vectorizer.fit_transform(corpus)
    tfidf_matrix = transformer.fit_transform(counts)

    # get_feature_names() was removed in scikit-learn 1.2. Prefer its
    # replacement and fall back for older installs; either way return a list.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out().tolist()
    else:
        words = vectorizer.get_feature_names()

    # Callers index rows directly (e.g. weight[0]), so the matrix is densified here.
    weight = tfidf_matrix.toarray()
    print(weight)

    return weight,words

def lsa(weight, n):
    """Run ``weight`` through TruncatedSVD followed by a Normalizer.

    Args:
        weight: Feature matrix to reduce.
        n: Number of SVD components to keep.

    Returns:
        tuple: ``(X, svd)`` — the transformed matrix and the fitted
        ``TruncatedSVD`` step.
    """
    print('begin lsa')
    svd = TruncatedSVD(n_components=n)
    reduction = make_pipeline(svd, Normalizer(copy=False))
    X = reduction.fit_transform(weight)
    print('lsa end')
    return X, svd

def pca(weight, dimension):
    """Reduce ``weight`` with PCA and print the dimensions before and after.

    The printed labels are in Chinese: "original dimension", "starting
    reduction" and "dimension after reduction".

    Args:
        weight: Feature matrix; ``len(weight[0])`` is reported as the original
            dimension.
        dimension: Value passed to ``PCA(n_components=...)``.

    Returns:
        The matrix returned by ``PCA.fit_transform``.
    """
    print('原有维度: ', len(weight[0]))
    print('开始降维:')

    reducer = PCA(n_components=dimension)  # set up the PCA model
    X = reducer.fit_transform(weight)  # fit and return the reduced data
    print('降维后维度: ', len(X[0]))
    print(X)

    return X


def kmeans(X, k, words):
    """Cluster the rows of ``X`` with k-means and print a summary per cluster.

    For every cluster, the 15 largest centroid coordinates are looked up in
    ``words`` and printed.
    NOTE(review): that lookup assumes column j of ``X`` corresponds to
    ``words[j]``; confirm this holds for every matrix passed in.

    Args:
        X: Feature matrix to cluster.
        k: Number of clusters.
        words: Sequence indexed by column number of ``X``.

    Returns:
        The label array returned by ``KMeans.fit_predict``.
    """
    from sklearn.cluster import KMeans

    print ('cluser begin for k = ', k)

    model = KMeans(n_clusters=k, init='k-means++')  # configure the clustering model
    labels = model.fit_predict(X)  # fit on X and get one label per row
    print('kmeans inertia:', model.inertia_)

    # Column indices of each centroid, largest coordinate first.
    ranked_columns = model.cluster_centers_.argsort()[:, ::-1]

    for cluster_id in range(k):
        top_terms = ' '.join(words[col] for col in ranked_columns[cluster_id, :15])
        print("Cluster {}: {}".format(cluster_id, top_terms))

    print('kmeans catalogy:', labels)
    return labels

def Silhouette(X, y):
    """Compute and print the silhouette score of a clustering.

    Args:
        X: Feature matrix that was clustered.
        y: Cluster label for each row of ``X``.

    Returns:
        tuple: ``(silhouette_avg, sample_silhouette_values)`` — the result of
        ``silhouette_score`` and of ``silhouette_samples`` respectively.
    """
    from sklearn.metrics import silhouette_samples, silhouette_score

    print ('silhouette score:')

    mean_score = silhouette_score(X, y)  # average silhouette coefficient
    per_sample_scores = silhouette_samples(X, y)  # silhouette coefficient of every sample

    pprint(mean_score)

    return mean_score, per_sample_scores

def Draw(silhouette_avg, sample_silhouette_values, y, k):
    """Draw a silhouette plot: one band of sorted scores per cluster.

    Args:
        silhouette_avg: Mean silhouette score. Not used at present (the vertical
            line that would mark it is not drawn).
        sample_silhouette_values: Array of per-sample silhouette scores.
        y: Array of cluster labels, compared element-wise with each cluster id
            to select that cluster's scores.
        k: Number of clusters; labels ``0 .. k-1`` are drawn.
    """
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    import numpy as np

    gap = 10  # vertical padding between consecutive cluster bands

    fig, ax1 = plt.subplots(1)

    # Silhouette coefficients lie in [-1, 1].
    ax1.set_xlim([-1, 1])
    # Leave room for the padding inserted around the k bands.
    ax1.set_ylim([0, len(y) + (k + 1) * gap])

    y_lower = gap
    for cluster_id in range(k):
        # Scores of this cluster, ascending.
        scores = np.sort(sample_silhouette_values[y == cluster_id])
        band_height = scores.shape[0]
        y_upper = y_lower + band_height

        colour = cm.Spectral(float(cluster_id) / k)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            scores,
            facecolor=colour,
            edgecolor=colour,
            alpha=0.7,  # partial transparency of the filled band
        )

        # Label the band with its cluster number at mid-height.
        ax1.text(-0.05, y_lower + 0.5 * band_height, str(cluster_id))

        # Bottom of the next band.
        y_lower = y_upper + gap

    plt.show()
    
def NewDraw(sample_silhouette_values,y,k):
    """Draw a bar chart of the number of samples in each cluster.

    Args:
        sample_silhouette_values: Array with one entry per sample; only the
            count of entries per cluster is used, not the values themselves.
        y: Array of cluster labels, compared element-wise with each cluster id.
        k: Number of clusters; labels ``0 .. k-1`` are drawn.
    """
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    import numpy as np

    fig, ax1 = plt.subplots(1)
    fig.set_size_inches(7, 5)

    for cluster_id in range(k):
        # Boolean-mask selection yields a copy; only its length matters here.
        members = sample_silhouette_values[y == cluster_id]
        cluster_size = members.shape[0]

        colour = cm.Spectral(float(cluster_id) / k)

        ax1.bar(cluster_id, cluster_size, align='center', color=colour)
        # Write the count just above the bar.
        ax1.text(cluster_id - 0.25, cluster_size + 10, str(cluster_size))

    plt.title('the quantity of document for each topic')
    plt.ylabel('Nr of documents')
    plt.xlabel('Topics')
    plt.show()

## Get the weight matrix and the list of words

In [None]:
# Load the French article bodies, clean them and build the TF-IDF matrix.
path = ""
filename = "articles_Accuracy.json"
comlumname = "body_fr"
data = get_data(path, filename, comlumname)
# Example of a tiny hand-written corpus for quick experiments:
# corpus = [ 'je suis ton Père. Elle est ta mère. ', 'Sunny day weather is suitable to exercise','I ate a Hotdog']

frenchstopwords = get_stopwords()
# extend(), not append(<generator>): append added one generator object to the
# list, so none of these words (notably 'ça' and 'entre') were ever filtered.
frenchstopwords.extend(['les', 'aussi','comme','plus','deux','trois','très','cette','ils','ça','entre'])
cleaned_data = cleandata(data, frenchstopwords)
weight, words = tfidf(cleaned_data)

In [None]:
# Cluster the full TF-IDF matrix for k = 8..11; for each k, compute the
# silhouette scores and draw the silhouette plot.
# Leaves k, y, silhouette_avg and sample_silhouette_values set to the last run (k = 11).
for k in  range(8, 12):
    y = kmeans(weight,k,words)
    silhouette_avg, sample_silhouette_values = Silhouette(weight, y)
    Draw(silhouette_avg, sample_silhouette_values, y, k)
    

In [None]:
# Reduce the TF-IDF matrix to 800 components with lsa(); b is the transformed
# matrix and svd the fitted TruncatedSVD step.
b, svd = lsa(weight, 800)

In [None]:
# Cluster the LSA-reduced matrix into 10 groups; shortd holds the labels.
# NOTE(review): kmeans() prints words[ind] for the largest centroid coordinates,
# but b's columns are LSA components rather than vocabulary terms, so the words
# printed here presumably do not describe the clusters. lsakmeans() maps the
# centroids back with svd.inverse_transform before the lookup — confirm which is intended.
shortd = kmeans(b, 10, words)

In [None]:
# Silhouette scores and silhouette plot for the 10-cluster run on the LSA-reduced matrix.
silhouette_avg, sample_silhouette_values = Silhouette(b, shortd)
Draw(silhouette_avg, sample_silhouette_values, shortd, 10)

In [None]:
def lsakmeans(weight, k, ncompinents, words):
    """Reduce ``weight`` with TruncatedSVD + Normalizer, then cluster with k-means.

    Prints the inertia and, for each cluster, the 15 entries of ``words`` with
    the largest coordinates of the centroid after it has been mapped back
    through ``TruncatedSVD.inverse_transform``.

    Args:
        weight: Feature matrix whose columns line up with ``words``.
        k: Number of clusters.
        ncompinents: Number of SVD components to keep.
        words: Sequence indexed by column number of ``weight``.

    Returns:
        tuple: ``(X, y)`` — the reduced matrix and the cluster label of each row.
    """
    from sklearn.cluster import KMeans

    reducer = TruncatedSVD(n_components=ncompinents)
    reduction = make_pipeline(reducer, Normalizer(copy=False))
    X = reduction.fit_transform(weight)

    model = KMeans(n_clusters=k, init='k-means++')  # configure the clustering model
    y = model.fit_predict(X)  # fit on the reduced matrix and get the labels
    print('kmeans inertia:', model.inertia_)

    # Map the centroids back to the input column space so that column indices
    # can be looked up in ``words``.
    centroids_in_input_space = reducer.inverse_transform(model.cluster_centers_)
    ranked_columns = centroids_in_input_space.argsort()[:, ::-1]

    for cluster_id in range(k):
        top_terms = ' '.join(words[col] for col in ranked_columns[cluster_id, :15])
        print("Cluster {}: {}".format(cluster_id, top_terms))
    return X, y

In [None]:
# Sweep k = 8..14 with 500 LSA components: cluster, score, then draw both the
# silhouette plot and the cluster-size bar chart for each k.
# Leaves k, newweight, y, silhouette_avg and sample_silhouette_values set to the last run (k = 14).
for k in range(8, 15):
    newweight, y  = lsakmeans(weight, k, 500, words)
    silhouette_avg, sample_silhouette_values = Silhouette(newweight, y)
    Draw(silhouette_avg, sample_silhouette_values, y, k)
    NewDraw(sample_silhouette_values, y, k)

In [None]:
# Single run with k = 10 and 500 LSA components; only the cluster-size bar chart is drawn.
k = 10
newweight, y  = lsakmeans(weight, k, 500, words)
silhouette_avg, sample_silhouette_values = Silhouette(newweight, y)
NewDraw(sample_silhouette_values, y, k)

In [None]:
# Redraw the bar chart from whatever sample_silhouette_values, y and k are
# currently defined in the kernel; nothing is recomputed here.
NewDraw(sample_silhouette_values, y, k)

In [None]:
# Same sweep as the earlier k = 8..14 loop (500 LSA components, both plots per k).
# NOTE(review): TruncatedSVD and KMeans are created without a fixed random_state
# in lsakmeans(), so this re-run presumably gives different clusters — confirm
# whether the repetition is intentional.
for k in range(8, 15):
    newweight, y  = lsakmeans(weight, k, 500, words)
    silhouette_avg, sample_silhouette_values = Silhouette(newweight, y)
    Draw(silhouette_avg, sample_silhouette_values, y, k)
    NewDraw(sample_silhouette_values, y, k)

In [None]:
import matplotlib.pyplot as plt

# Elbow plot: hard-coded inertia values against the cluster counts they belong to.
# NOTE(review): the numbers were presumably copied by hand from the
# 'kmeans inertia:' output of earlier runs — confirm they are still current.
# Named by meaning instead of x/y: the old code stored inertia in `x` but plotted
# it on the vertical axis, and rebinding `y` overwrote the cluster labels that
# other cells read from the kernel.
inertias = [7175,7149,7120,7122,7114,7088,7070,6984,6827,6701,6624,6183]
cluster_counts = [8,9,10,11,12,13,14,20,30,40,50,100]
# NOTE(review): the title looks like a misspelling of "Méthode du coude"; left
# untouched because it is user-facing text.
plt.title('Métohd de coube')
plt.ylabel('k-inertia')
plt.xlabel('nombre de clusters k in [8,9, ... ,13,14,20,30,40,50,100]')
plt.plot(cluster_counts, inertias)
plt.show()

In [None]:
# Run with k = 50 and 500 LSA components; draws the silhouette plot and the cluster-size bar chart.
k=50
newweight, y  = lsakmeans(weight, k, 500, words)
silhouette_avg, sample_silhouette_values = Silhouette(newweight, y)
Draw(silhouette_avg, sample_silhouette_values, y, k)
NewDraw(sample_silhouette_values, y, k)

In [None]:
# Run with k = 30 and 500 LSA components; draws the silhouette plot and the cluster-size bar chart.
k=30
newweight, y  = lsakmeans(weight, k, 500, words)
silhouette_avg, sample_silhouette_values = Silhouette(newweight, y)
Draw(silhouette_avg, sample_silhouette_values, y, k)
NewDraw(sample_silhouette_values, y, k)

In [None]:
# Run with k = 20 and 500 LSA components; draws the silhouette plot and the cluster-size bar chart.
k=20
newweight, y  = lsakmeans(weight, k, 500, words)
silhouette_avg, sample_silhouette_values = Silhouette(newweight, y)
Draw(silhouette_avg, sample_silhouette_values, y, k)
NewDraw(sample_silhouette_values, y, k)

In [None]:
# Run with k = 40 and 500 LSA components; draws the silhouette plot and the cluster-size bar chart.
k=40
newweight, y  = lsakmeans(weight, k, 500, words)
silhouette_avg, sample_silhouette_values = Silhouette(newweight, y)
Draw(silhouette_avg, sample_silhouette_values, y, k)
NewDraw(sample_silhouette_values, y, k)

In [None]:
# Run with k = 100 and 500 LSA components; draws the silhouette plot and the cluster-size bar chart.
k=100
newweight, y  = lsakmeans(weight, k, 500, words)
silhouette_avg, sample_silhouette_values = Silhouette(newweight, y)
Draw(silhouette_avg, sample_silhouette_values, y, k)
NewDraw(sample_silhouette_values, y, k)