In [4]:
from utils import readJSON
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import calinski_harabasz_score,silhouette_score,adjusted_rand_score
from sklearn import metrics
import warnings

warnings.filterwarnings('ignore')
plt.rcParams["font.sans-serif"] = ["SimHei"]  #设置字体
plt.rcParams["axes.unicode_minus"] = False


def preprocess(path, sheet_name):
    xzb = pd.read_excel(path, sheet_name=sheet_name)
    xzb.drop(columns=['病案号'], inplace=True)
    xzb.drop(labels=[1742, 1741], axis=0, inplace=True)
    xzb.drop(labels=xzb[xzb['性别'].isna()].index, inplace=True)
    xzb = xzb.sample(frac=1).astype(int)
    y = xzb['证名']
    X = xzb.drop(labels=['证名', '性别', '年龄'], axis=1)
    id2feature = readJSON('./input/id2feature.json')
    X.columns = id2feature.values()
    scaler = StandardScaler()
    X = scaler.fit_transform(X, y)
    X = pd.DataFrame(X, columns=scaler.feature_names_in_)
    lsvc = LinearSVC(C=0.01, penalty='l1', dual=False, random_state=64).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True)
    col = [c for c, i in zip(X.columns, model.get_support()) if not i]
    X.drop(columns=col, inplace=True)

    le = LabelEncoder()
    y = le.fit_transform(y)
    data = pd.concat([pd.DataFrame(X), pd.DataFrame(y, columns=['证名'])], axis=1)
    xqx = data[data['证名'] == 0]  # 621
    xxyz = data[data['证名'] == 4]  # 547
    xqx['证名'] = 0
    xxyz['证名'] = 1
    tmp = pd.concat([xqx, xxyz], axis=0).sample(frac=1).reset_index(drop=True)
    X = tmp.drop(columns='证名')
    y = tmp['证名']
    return X, y


X, y = preprocess(path='./input/心总表.xlsx', sheet_name='总表')
# from sklearn.datasets import load_iris
# X,y = load_iris(return_X_y=True)
USE_Kmeans = True
USE_DBSCAN = True
USE_GMM  = True
USE_AffinityPropagation = False
USE_MeanShift = False
USE_OPTICS = True
USE_AgglomerativeClustering = True



if USE_Kmeans:
    from sklearn.cluster import KMeans

    # 肘部法
    mean_distortions = []
    for k in range(2, 9):
        kmeans = KMeans(n_clusters=k, random_state=1024)
        kmeans.fit(X)
        mean_distortions.append(kmeans.inertia_)
    plt.figure()
    plt.plot(range(1, 8), mean_distortions, 'bx-')
    plt.ylabel('inertia')  # Sum of squared distances of samples to their closest cluster center
    plt.xlabel('K')
    plt.title('KMeans-肘部法')
    plt.show()
    # 轮廓系数
    for i in range(2,9):
        kmeans = KMeans(n_clusters=i,random_state=1024)
        kmeans_pred = kmeans.fit_predict(X)
        sh_score = silhouette_score(X,kmeans_pred)
        ch_score = calinski_harabasz_score(X,kmeans_pred)
        arand_score = adjusted_rand_score(y,kmeans_pred)
        mutual_score = metrics.adjusted_mutual_info_score(y,kmeans_pred)
        homogeneity = metrics.homogeneity_score(y,kmeans_pred)
        completeness = metrics.completeness_score(y,kmeans_pred)
        v_measure_score  =metrics.v_measure_score(y,kmeans_pred)
        fowlkes_mallows_score = metrics.fowlkes_mallows_score(y,kmeans_pred)
        print(Counter(y))
        print(f'kmeans cluster={i}:{Counter(kmeans_pred)},    CH指数={round(ch_score,2)},   轮廓系数={round(sh_score,2)},    ARI={round(arand_score)}     互信息={round(mutual_score)}   homogeneity={homogeneity}   completeness={completeness}     v_measure_score={v_measure_score}    fowlkes_mallows_score={fowlkes_mallows_score}')

if USE_DBSCAN:
    from sklearn.cluster import DBSCAN
    from collections import Counter
    eps, min_samples = 0.4,5
    dbscan_pred = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
    print(f'DBSCAN',Counter(dbscan_pred))
    n_clusters_ = len(set(dbscan_pred)) - (1 if -1 in dbscan_pred else 0)

    sh_score = silhouette_score(X,kmeans_pred)
    ch_score = calinski_harabasz_score(X,kmeans_pred)
    arand_score = adjusted_rand_score(y,kmeans_pred)
    mutual_score = metrics.adjusted_mutual_info_score(y,kmeans_pred)
    homogeneity = metrics.homogeneity_score(y,kmeans_pred)
    completeness = metrics.completeness_score(y,kmeans_pred)
    v_measure_score  =metrics.v_measure_score(y,kmeans_pred)
    fowlkes_mallows_score = metrics.fowlkes_mallows_score(y,kmeans_pred)
    print(Counter(y))
    print(f'kmeans cluster={i}:{Counter(kmeans_pred)},    CH指数={round(ch_score,2)},   轮廓系数={round(sh_score,2)},    ARI={round(arand_score)}     互信息={round(mutual_score)}   homogeneity={round(homogeneity,2)}   completeness={round(completeness,2)}     v_measure_score={round(v_measure_score,2)}    fowlkes_mallows_score={round(fowlkes_mallows_score,2)}')
    # DBSCAN Counter({-1: 1168})
    # DBSCAN:Counter({-1: 1163, 0: 5})   CH指数=3.44   轮廓系数=-0.06

if USE_GMM:
    from sklearn.mixture import GaussianMixture as GMM
    for i in range(2,9):
        gmm_pred = GMM(n_components=i,random_state=1024).fit_predict(X)
        sh_score = silhouette_score(X,kmeans_pred)
        ch_score = calinski_harabasz_score(X,kmeans_pred)
        arand_score = adjusted_rand_score(y,kmeans_pred)
        mutual_score = metrics.adjusted_mutual_info_score(y,kmeans_pred)
        homogeneity = metrics.homogeneity_score(y,kmeans_pred)
        completeness = metrics.completeness_score(y,kmeans_pred)
        v_measure_score  =metrics.v_measure_score(y,kmeans_pred)
        fowlkes_mallows_score = metrics.fowlkes_mallows_score(y,kmeans_pred)
        print(Counter(y))
        print(f'kmeans cluster={i}:{Counter(kmeans_pred)},    CH指数={round(ch_score,2)},   轮廓系数={round(sh_score,2)},    ARI={round(arand_score)}     互信息={round(mutual_score)}   homogeneity={round(homogeneity,2)}   completeness={round(completeness,2)}     v_measure_score={round(v_measure_score,2)}    fowlkes_mallows_score={round(fowlkes_mallows_score,2)}')
        # GMM:Counter({1: 901, 0: 267})   CH指数=24.08   轮廓系数=0.18

if USE_AffinityPropagation:
    from sklearn.cluster import AffinityPropagation
    import numpy as np
    for i in np.arange(0.5,1,0.1):
        AffinityPropagation_pred = AffinityPropagation(damping=i,random_state=1024).fit_predict(X)
        sh_score = silhouette_score(X,kmeans_pred)
        ch_score = calinski_harabasz_score(X,kmeans_pred)
        arand_score = adjusted_rand_score(y,kmeans_pred)
        mutual_score = metrics.adjusted_mutual_info_score(y,kmeans_pred)
        homogeneity = metrics.homogeneity_score(y,kmeans_pred)
        completeness = metrics.completeness_score(y,kmeans_pred)
        v_measure_score  =metrics.v_measure_score(y,kmeans_pred)
        fowlkes_mallows_score = metrics.fowlkes_mallows_score(y,kmeans_pred)
        print(Counter(y))
        print(f'kmeans cluster={i}:{Counter(kmeans_pred)},    CH指数={round(ch_score,2)},   轮廓系数={round(sh_score,2)},    ARI={round(arand_score)}     互信息={round(mutual_score)}   homogeneity={round(homogeneity,2)}   completeness={round(completeness,2)}     v_measure_score={round(v_measure_score,2)}    fowlkes_mallows_score={round(fowlkes_mallows_score,2)}')

if USE_MeanShift:
    from sklearn.cluster import MeanShift
    meanshift_pred = MeanShift().fit_predict(X)
    sh_score = silhouette_score(X,kmeans_pred)
    ch_score = calinski_harabasz_score(X,kmeans_pred)
    arand_score = adjusted_rand_score(y,kmeans_pred)
    mutual_score = metrics.adjusted_mutual_info_score(y,kmeans_pred)
    homogeneity = metrics.homogeneity_score(y,kmeans_pred)
    completeness = metrics.completeness_score(y,kmeans_pred)
    v_measure_score  =metrics.v_measure_score(y,kmeans_pred)
    fowlkes_mallows_score = metrics.fowlkes_mallows_score(y,kmeans_pred)
    print(Counter(y))
    print(f'kmeans cluster={i}:{Counter(kmeans_pred)},    CH指数={round(ch_score,2)},   轮廓系数={round(sh_score,2)},    ARI={round(arand_score)}     互信息={round(mutual_score)}   homogeneity={round(homogeneity,2)}   completeness={round(completeness,2)}     v_measure_score={round(v_measure_score,2)}    fowlkes_mallows_score={round(fowlkes_mallows_score,2)}')

if USE_OPTICS:
    from sklearn.cluster import OPTICS
    optics_pred = OPTICS(eps=8,min_samples=41).fit_predict(X)
    n_clusters_ = len(set(optics_pred)) - (1 if -1 in optics_pred else 0)
    sh_score = silhouette_score(X,kmeans_pred)
    ch_score = calinski_harabasz_score(X,kmeans_pred)
    arand_score = adjusted_rand_score(y,kmeans_pred)
    mutual_score = metrics.adjusted_mutual_info_score(y,kmeans_pred)
    homogeneity = metrics.homogeneity_score(y,kmeans_pred)
    completeness = metrics.completeness_score(y,kmeans_pred)
    v_measure_score  =metrics.v_measure_score(y,kmeans_pred)
    fowlkes_mallows_score = metrics.fowlkes_mallows_score(y,kmeans_pred)
    print(Counter(y))
    print(f'kmeans cluster={i}:{Counter(kmeans_pred)},    CH指数={round(ch_score,2)},   轮廓系数={round(sh_score,2)},    ARI={round(arand_score)}     互信息={round(mutual_score)}   homogeneity={round(homogeneity,2)}   completeness={round(completeness,2)}     v_measure_score={round(v_measure_score,2)}    fowlkes_mallows_score={round(fowlkes_mallows_score,2)}')
    # OPTICS:Counter({0: 1111, -1: 57})   CH指数=17.08   轮廓系数=0.46
if USE_AgglomerativeClustering:
    from sklearn.cluster import AgglomerativeClustering
    for i in range(2,9):
        AgglomerativeClustering_pred = AgglomerativeClustering(n_clusters=i).fit_predict(X)
        sh_score = silhouette_score(X,kmeans_pred)
        ch_score = calinski_harabasz_score(X,kmeans_pred)
        arand_score = adjusted_rand_score(y,kmeans_pred)
        mutual_score = metrics.adjusted_mutual_info_score(y,kmeans_pred)
        homogeneity = metrics.homogeneity_score(y,kmeans_pred)
        completeness = metrics.completeness_score(y,kmeans_pred)
        v_measure_score  =metrics.v_measure_score(y,kmeans_pred)
        fowlkes_mallows_score = metrics.fowlkes_mallows_score(y,kmeans_pred)
        print(Counter(y))
        print(f'kmeans cluster={i}:{Counter(kmeans_pred)},    CH指数={round(ch_score,2)},   轮廓系数={round(sh_score,2)},    ARI={round(arand_score)}     互信息={round(mutual_score)}   homogeneity={round(homogeneity,2)}   completeness={round(completeness,2)}     v_measure_score={round(v_measure_score,2)}    fowlkes_mallows_score={round(fowlkes_mallows_score,2)}')


['https://movie.douban.com/subject/34841067/comments?start=0&limit=20&status=P&sort=new_score', 'https://movie.douban.com/subject/34841067/comments?start=20&limit=20&status=P&sort=new_score', 'https://movie.douban.com/subject/34841067/comments?start=40&limit=20&status=P&sort=new_score', 'https://movie.douban.com/subject/34841067/comments?start=60&limit=20&status=P&sort=new_score', 'https://movie.douban.com/subject/34841067/comments?start=80&limit=20&status=P&sort=new_score', 'https://movie.douban.com/subject/34841067/comments?start=100&limit=20&status=P&sort=new_score', 'https://movie.douban.com/subject/34841067/comments?start=120&limit=20&status=P&sort=new_score', 'https://movie.douban.com/subject/34841067/comments?start=140&limit=20&status=P&sort=new_score', 'https://movie.douban.com/subject/34841067/comments?start=160&limit=20&status=P&sort=new_score', 'https://movie.douban.com/subject/34841067/comments?start=180&limit=20&status=P&sort=new_score']


FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import random
urls=['https://movie.douban.com/subject/34841067/comments?start={}&limit=20&status=P&sort=new_score'.format(str(i)) for i in range(0, 200, 20)] #通过观察的url翻页的规律，使用for循环得到10个链接，保存到urls列表中
print(urls)
dic_h = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
comments_list = [] #初始化用于保存短评的列表

for url in urls: #使用for循环分别获取每个页面的数据，保存到comments_list列表
    r = requests.get(url=url,headers = dic_h).text

    soup = BeautifulSoup(r, 'lxml')
    ul = soup.find('div',id="comments")
    lis= ul.find_all('p')

    list2 =[]
    for li in lis:
        list2.append(li.find('span').string)
    # print(list2)
    comments_list.extend(list2)
    time.sleep(random.randint(0,3)) # 暂停0~3秒

with open('lhy_comments.txt', 'w', encoding='utf-8') as f: #使用with open()新建对象f
    # 将列表中的数据循环写入到文本文件中
    for i in comments_list:
        f.write(i+"\n") #写入数据
print('done')

Counter({0: 844, 2: 209, 1: 115})
