In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn

import jieba
import jieba.analyse

import re
import heapq

In [2]:
# Load the tidied 2020 CloudSOP export and discard the 'Unnamed: 0' column
# (presumably an index that was saved along with the CSV - TODO confirm).
raw_data = (
    pd.read_csv('../Data/CloudSOP_Data/cloudsop_2020_data_tidied.csv')
    .drop(columns = 'Unnamed: 0')
)

In [3]:
raw_data.columns
raw_data.shape

(112280, 14)

In [4]:
# Drop the rows whose MODULE is missing.
# `.copy()` makes `df` an independent frame: later cells add/overwrite columns
# on it (`df['CUTTED']`, `df['MODULE']`), and assigning into a filtered
# derivative of `raw_data` is what triggers pandas' SettingWithCopyWarning.
df = raw_data.dropna(axis = 0, subset = ['MODULE']).copy()
df.shape

(77581, 14)

In [5]:
test_desc = raw_data.DISCRIPT[0]
test_desc

'【拓扑适配】【新需求---US20200804307566 OpenORB由于衰退期软件需要切换】【自动发现】导入IP地址后点击保存IP地址，保存后再次打开自动发现，保存的IP只不存在，请定位'

### 分词：

In [6]:
def load_stop_words(file):
    '''
    Load stop words from a UTF-8 text file, one entry per line.

    Every whitespace character is removed from each line; lines that are
    empty after that are skipped. (The previous loop stopped reading at the
    first such line, silently dropping every stop word after a blank line.)

    file: path of the stop-word file.

    Returns a list of the stop words in file order; duplicates are kept.
    '''
    stopwords = []
    with open(file, encoding='utf-8') as f:
        # Iterating the file object ends only at the real end of file.
        for line in f:
            word = re.sub(r'\s', '', line)
            if word:
                stopwords.append(word)
    return stopwords

def jieba_cut_without_stopwords(sentence, stop_words):
    '''
    Segment `sentence` with `jieba.cut` and drop the stop words.

    sentence: text to segment.
    stop_words: container checked with `in` for every token.

    Returns the list of remaining tokens, in the order jieba produced them.
    '''
    return [token for token in jieba.cut(sentence) if token not in stop_words]



In [7]:
# Tokenise every description and remove the stop words.
stop_words = load_stop_words('hit_stopwords.txt')
# `stop_words` stays a list because later cells pass it on as-is; the
# per-token membership test runs against a frozenset so each lookup is O(1)
# instead of a scan over the whole list.
stop_word_set = frozenset(stop_words)
df['CUTTED'] = df.DISCRIPT.apply(lambda x: jieba_cut_without_stopwords(x, stop_word_set))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Y00591~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.707 seconds.
Prefix dict has been built successfully.


In [8]:
# Normalise MODULE to the string form of its integer value
# (int first, then str, exactly as before, but in a single pass).
df['MODULE'] = df.MODULE.apply(lambda module_id: str(int(module_id)))

In [9]:
df.head()

Unnamed: 0,NO,DISCRIPT,CREATOR,SDEPT3SUBMIT,SORIGINTYPE,SODCATIVITYNO,BVERSION,DATAILDESCRIPTION,FILE_PATH,CREATE_TIME,FEATURE,MODULE,MICROSERVICE,RVERSION,CUTTED
0,DTS2020092908555,【拓扑适配】【新需求---US20200804307566 OpenORB由于衰退期软件需要...,jiahonghong WX335476,管控析集成与验证部,2020-09-29 00:00:00,有条件必然重现,CloudSOP V100R021C10SPC410B401,<p>【环境】<br> <br> https://8.7.242.118:31943/t...,gnldev/gnldev/impl/server/source/src/gnldev/so...,2020-10-23 14:31:24,,1110352450,TopoWebsite,CloudSOP V100R021,"[拓扑, 适配, 新, 需求, ---, US20200804307566, , Open..."
1,DTS2020092908555,【拓扑适配】【新需求---US20200804307566 OpenORB由于衰退期软件需要...,jiahonghong WX335476,管控析集成与验证部,2020-09-29 00:00:00,有条件必然重现,CloudSOP V100R021C10SPC410B401,<p>【环境】<br> <br> https://8.7.242.118:31943/t...,eam/eam/impl/server/source/src/eam_common/EAMD...,2020-10-23 14:31:24,,1110352450,TopoWebsite,CloudSOP V100R021,"[拓扑, 适配, 新, 需求, ---, US20200804307566, , Open..."
2,DTS2020093003837,【R21RR4】【Super】【MOUI】资源跳转MOUI界面后应默认进入总览，当前进入的是...,wuxing WX80136,管控析集成与验证部,2020-09-30 00:00:00,有条件必然重现,CloudSOP V100R021C10SPC410B002,<p>【测试环境】https://8.7.242.179:31943/<br> 【步骤与现象...,website/src/main/webapp/source/src/moui/main.js,2020-10-20 09:50:37,,1110352462,,CloudSOP V100R021,"[R21RR4, Super, MOUI, 资源, 跳转, MOUI, 界面, 后应, 默认..."
3,DTS2020093003837,【R21RR4】【Super】【MOUI】资源跳转MOUI界面后应默认进入总览，当前进入的是...,wuxing WX80136,管控析集成与验证部,2020-09-30 00:00:00,有条件必然重现,CloudSOP V100R021C10SPC410B002,<p>【测试环境】https://8.7.242.179:31943/<br> 【步骤与现象...,website/src/main/webapp/source/src/moui/main.js,2020-10-30 17:40:42,,1110352462,,CloudSOP V100R021,"[R21RR4, Super, MOUI, 资源, 跳转, MOUI, 界面, 后应, 默认..."
4,DTS2020093003837,【R21RR4】【Super】【MOUI】资源跳转MOUI界面后应默认进入总览，当前进入的是...,wuxing WX80136,管控析集成与验证部,2020-09-30 00:00:00,有条件必然重现,CloudSOP V100R021C10SPC410B002,<p>【测试环境】https://8.7.242.179:31943/<br> 【步骤与现象...,website/src/main/webapp/source/src/moui/main.js,2020-11-03 16:15:57,,1110352462,,CloudSOP V100R021,"[R21RR4, Super, MOUI, 资源, 跳转, MOUI, 界面, 后应, 默认..."


## 向量化

### 使用tf-idf
为了实现对中文更好的分词，结合jieba分词


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [11]:
vectorizer = TfidfVectorizer(tokenizer = jieba.cut, stop_words = stop_words)
vectorizer.fit(df.DISCRIPT)

TfidfVectorizer(stop_words=['———', '》），', '）÷（１－', '”，', '）、', '＝（', ':', '→',
                            '℃', '&', '*', '一一', '~~~~', '’', '.', '『', '.一',
                            './', '--', '』', '＝″', '【', '［＊］', '｝＞', '［⑤］］',
                            '［①Ｄ］', 'ｃ］', 'ｎｇ昉', '＊', '//', ...],
                tokenizer=<bound method Tokenizer.cut of <Tokenizer dictionary=None>>)

In [12]:
sparse_matrix = vectorizer.transform(df.DISCRIPT)

In [13]:
# Build the training and test sets (20% of the rows are held out for testing).
# The targets keep both MODULE and RVERSION so that later cells can use either.
x_train, x_test, y_train, y_test = train_test_split(
    sparse_matrix,
    df[['MODULE', 'RVERSION']],
    test_size = 0.2,
    random_state = 128,
)

In [14]:
y_train.to_numpy()

array([['1107652938', 'CloudSOP V100R007'],
       ['1111078192', 'CloudSOP V100R021'],
       ['1112191484', 'CloudSOP V100R021'],
       ...,
       ['1112191492', 'CloudSOP V100R021'],
       ['1107652938', 'CloudSOP V100R007'],
       ['1108737242', 'CloudSOP V100R007']], dtype=object)

### Naive Bayes

In [15]:
from sklearn.naive_bayes import ComplementNB,  MultinomialNB
from sklearn.model_selection import cross_val_score

In [16]:
y_train

Unnamed: 0,MODULE,RVERSION
57436,1107652938,CloudSOP V100R007
45430,1111078192,CloudSOP V100R021
14125,1112191484,CloudSOP V100R021
35483,1110352374,CloudSOP V100R021
41245,1110352274,CloudSOP V100R021
...,...,...
64916,1107652696,CloudSOP V100R021
84947,1110352460,CloudSOP V100R021
13826,1112191492,CloudSOP V100R021
57601,1107652938,CloudSOP V100R007


In [17]:
cnb = ComplementNB()
cnb.fit(x_train, y_train.MODULE.to_numpy())

ComplementNB()

In [18]:
def top_k(proba_arr, y_arr, k = 3):
    '''
    Return the labels of the `k` highest-scoring classes of every sample.

    proba_arr: 2-D array-like of shape (n_samples, n_classes) with one score
        per class.
    y_arr: 1-D array-like of n_classes labels; y_arr[j] is the label of
        column j of `proba_arr`.
    k: number of labels to return per sample.

    Returns an array of shape (n_samples, k) with the dtype of `y_arr`.
    Within a row the labels appear in column order (not ranked by score),
    which is the order the previous implementation produced. Ties are
    resolved in favour of the lower column index, so every row always holds
    exactly k labels.

    Raises ValueError if `k` is negative or larger than the number of classes.
    '''
    # Use the argument, not a notebook-level global, as the score matrix.
    scores = np.asarray(proba_arr)
    labels = np.asarray(y_arr)

    n_classes = scores.shape[1]
    if not 0 <= k <= n_classes:
        raise ValueError(
            'k must be between 0 and the number of classes ({}), got {}'.format(n_classes, k)
        )

    # A stable sort on the negated scores ranks columns best-first and keeps
    # tied columns in index order; selecting by position (rather than by
    # value) guarantees exactly k columns per row even when scores repeat.
    top_idx = np.argsort(-scores, axis = 1, kind = 'stable')[:, :k]
    top_idx.sort(axis = 1)
    return labels[top_idx]

arr = cnb.predict_proba(x_test)
top_k(arr, cnb.classes_)
arr

array([[0.00249464, 0.00249473, 0.00249386, ..., 0.00249473, 0.00249363,
        0.00249438],
       [0.0026038 , 0.00260388, 0.00260356, ..., 0.00260344, 0.00260859,
        0.00260393],
       [0.00260772, 0.00260733, 0.00260744, ..., 0.00260905, 0.00260817,
        0.00260798],
       ...,
       [0.00259172, 0.00259145, 0.00259214, ..., 0.00259132, 0.00258623,
        0.00259187],
       [0.00260717, 0.00260674, 0.00260693, ..., 0.00260701, 0.00262244,
        0.00260733],
       [0.00260942, 0.00260911, 0.0026092 , ..., 0.00261054, 0.00260478,
        0.00260953]])

In [19]:
tt = [np.array(10)]
tt.append(np.array(11))
tt
arr_test = np.array([1,2,3])
arr_test == 10

array([False, False, False])

In [20]:
class BayesCLF:
    '''
    Thin wrapper around a scikit-learn naive Bayes classifier that adds
    top-k prediction (`predict_top_k`) and a top-k hit rate (`accuracy`).
    '''

    def __init__(self, model = 'Complement',):
        '''
        model: 'Complement' for ComplementNB or 'Multinomial' for MultinomialNB.

        An unknown name does not raise: 'Model not found' is printed and the
        instance is left without an estimator, so fit/predict* raise
        AttributeError when called on it.
        '''
        if model == 'Complement':
            self.__clf__ = ComplementNB()
        elif model == 'Multinomial':
            self.__clf__ = MultinomialNB()
        else:
            self.__clf__ = None

        if self.__clf__ is None:
            print('Model not found')
        else:
            print(self.__clf__)

    def _estimator(self):
        '''Return the wrapped estimator, or raise AttributeError if there is none.'''
        if self.__clf__ is None:
            raise AttributeError('Model not found: this BayesCLF has no underlying estimator')
        return self.__clf__

    def fit(self, x_test, y_test):
        '''
        Train the wrapped estimator and remember its class labels in
        `self.modules`.

        x_test: n*p1 training features (the parameter name is historical).
        y_test: n*1 training labels.
        '''
        clf = self._estimator()
        clf.fit(x_test, y_test)
        self.modules = clf.classes_

    def predict(self, x):
        '''
        x: n*p

        Returns the wrapped estimator's `predict(x)`.
        '''
        return self._estimator().predict(x)

    def predict_proba(self, x):
        '''Return the wrapped estimator's `predict_proba(x)`.'''
        return self._estimator().predict_proba(x)

    def predict_top_k(self, x, k = 3):
        '''
        Return, for every row of `x`, the labels of the k classes with the
        highest `predict_proba` score.

        x: n*p
        k: number of labels per sample.

        Returns an object array of shape (n, k). Within a row the labels are
        in `self.modules` order (not ranked by score), as before. Ties are
        resolved in favour of the earlier class, so a row always holds
        exactly k labels.

        Raises ValueError if `k` is negative or larger than the number of classes.
        '''
        scores = np.asarray(self._estimator().predict_proba(x))
        n_samples, n_classes = scores.shape
        if not 0 <= k <= n_classes:
            raise ValueError(
                'k must be between 0 and the number of classes ({}), got {}'.format(n_classes, k)
            )

        # Select columns by position after a stable sort: matching by value
        # (np.isin against the k largest scores) returns more than k columns
        # when scores repeat and then cannot be stored in a k-wide row.
        top_idx = np.argsort(-scores, axis = 1, kind = 'stable')[:, :k]
        top_idx.sort(axis = 1)

        # Builtin `object` instead of the `np.object` alias removed in NumPy 1.24.
        res = np.empty(shape = (n_samples, k), dtype = object)
        res[...] = np.asarray(self.modules)[top_idx]
        return res

    def accuracy(self, y_true, y_predict, top_k = 3):
        '''
        Fraction of samples whose true label is among its predicted labels.

        y_true: array of n true labels (must expose `.shape`).
        y_predict: n rows of candidate labels, e.g. from `predict_top_k`.
        top_k: accepted for backward compatibility; not used, every label
            present in a row of `y_predict` counts.

        Raises ZeroDivisionError when `y_true` is empty.
        '''
        # Iterate positionally so that a pandas Series with a non-default
        # index is not looked up by label.
        count = sum(1 for truth, candidates in zip(y_true, y_predict) if truth in candidates)

        accuracy = count / y_true.shape[0]
        return accuracy


In [21]:
# Create the classifiers.
# FNB deliberately uses an unknown model name to exercise the
# 'Model not found' path of BayesCLF; it is not fitted below.
CNB = BayesCLF(model = 'Complement')
MNB = BayesCLF(model = 'Multinomial')
FNB = BayesCLF(model = 'fake')
# Both real models are trained on the MODULE labels only.
CNB.fit(x_train, y_train.MODULE.to_numpy())
MNB.fit(x_train, y_train.MODULE.to_numpy())

ComplementNB()
MultinomialNB()
Model not found


In [22]:
# 预测：
CNB.predict_top_k(x_test)

array([['1110352380', '1112191494', '1112191506'],
       ['1107652706', '1108737097', '1110352436'],
       ['1110352368', '1110352382', '1110352392'],
       ...,
       ['1110352408', '1110352508', '1111851251'],
       ['1112191484', '1112191492', '1112191500'],
       ['1110352422', '1110352436', '1111851247']], dtype=object)

In [23]:
# K-Fold 方法
from sklearn.model_selection import KFold
def kfold(clf, x, y, k, top_k = 3):
    '''
    Evaluate `clf` with k-fold cross-validation using its top-k hit rate.

    clf: classifier exposing fit(x, y), predict_top_k(x, k=...) and
        accuracy(y_true, y_predict); it is re-fitted on every fold.
    x: feature matrix that supports indexing with an array of row indices.
    y: label array that supports indexing with an array of row indices.
    k: number of folds
    top_k: number of candidate labels requested per sample (default 3, the
        value that used to be hard-coded).

    Prints one line per fold and returns the list of per-fold accuracies.
    '''
    accs = []

    kf = KFold(n_splits = k)
    for fold, (train_index, test_index) in enumerate(kf.split(x), start = 1):
        clf.fit(x[train_index], y[train_index])
        y_predict = clf.predict_top_k(x[test_index], k = top_k)

        acc = clf.accuracy(y[test_index], y_predict)
        accs.append(acc)
        print('FOLD: {} ======= Accuracy: {}'.format(fold, acc))

    return accs

In [24]:
# Compute the 10-fold top-k accuracy of both models:
# accs1 -> ComplementNB (the BayesCLF default), accs2 -> MultinomialNB.
accs1 = kfold(BayesCLF(), sparse_matrix, df.MODULE.to_numpy(), 10)
accs2 = kfold(BayesCLF(model='Multinomial'), sparse_matrix, df.MODULE.to_numpy(), 10)

ComplementNB()
MultinomialNB()


In [25]:
# 计算平均准确率
sum(accs1)/len(accs1), sum(accs2)/len(accs2)


(0.5330673099034159, 0.4607172118327041)

In [26]:
# Worked example: **DTS2020121400802**
# PBI number of its module: **1110352547**
CNB.predict_top_k(vectorizer.transform(['【AIFM】【DCN】DCN需要支撑新的License部署场景，无平台License服务时，产品自行控制是否能够进入Incident界面']))

array([['1110352384', '1110352420', '1110352546']], dtype=object)

实现按照R版本隔离，实现方案：

1. 输入数据时，同时保存各样本的module信息，将预测出来的对应模块的概率设置为0，再推送结果
2. 将 R版本作为输入，和proba一起作为输入放入LR进行一次分类


In [28]:
df.RVERSION.value_counts()

CloudSOP V100R021    58507
CloudSOP V100R007    18984
CloudSOP V100R006       35
CloudSOP V100R002        8
CloudSOP V100R003        1
Name: RVERSION, dtype: int64

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the R version; missing values become the category 'No record'.
X = y_train['RVERSION'].fillna('No record')
x = X.to_numpy().reshape(-1, 1)
enc = OneHotEncoder()
x = enc.fit_transform(x)

# Fit a single decision tree (not a random forest) that predicts MODULE
# from the one-hot encoded R version alone.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(x, y_train['MODULE'])



DecisionTreeClassifier(random_state=0)

In [161]:
CNB2 = BayesCLF()
CNB2.fit(x_train, y_train['MODULE'])



ComplementNB()


In [160]:
sum(prob1)

array([1.])

In [181]:
# Position (within the test split) of the sample to inspect.
test_index = 30

# Text-based class probabilities of that one sample as a column vector.
# Only the needed row is scored instead of the whole test matrix.
prob1 = CNB2.predict_proba(x_test[test_index]).reshape(-1, 1)

# R-version-based class probabilities of the same sample.
# `enc` was fitted on RVERSION with NaN replaced by 'No record', so the same
# replacement must be applied here; otherwise a missing RVERSION reaches the
# encoder as NaN.
rversion = y_test['RVERSION'].fillna('No record').to_numpy()[test_index]
rr = enc.transform(np.array(rversion).reshape(-1, 1))
prob2 = dtc.predict_proba(rr).reshape(-1, 1)


In [176]:
heapq.nlargest(3, (prob1 * prob2) / sum(prob1 * prob2))

[array([0.19047133]), array([0.10091433]), array([0.0333843])]

In [178]:
CNB2.predict_proba(x_test)

array([[0.00249464, 0.00249473, 0.00249386, ..., 0.00249473, 0.00249363,
        0.00249438],
       [0.0026038 , 0.00260388, 0.00260356, ..., 0.00260344, 0.00260859,
        0.00260393],
       [0.00260772, 0.00260733, 0.00260744, ..., 0.00260905, 0.00260817,
        0.00260798],
       ...,
       [0.00259172, 0.00259145, 0.00259214, ..., 0.00259132, 0.00258623,
        0.00259187],
       [0.00260717, 0.00260674, 0.00260693, ..., 0.00260701, 0.00262244,
        0.00260733],
       [0.00260942, 0.00260911, 0.0026092 , ..., 0.00261054, 0.00260478,
        0.00260953]])

ValueError: Input contains NaN

----
## Word2Vec
----

In [119]:
from gensim.models import word2vec
import logging


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [120]:
model = word2vec.Word2Vec(df.CUTTED, min_count = 3, size = 200)
model.save('zh_word2vec')

2020-12-11 17:06:01,856 : INFO : collecting all words and their counts
2020-12-11 17:06:01,857 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-12-11 17:06:01,889 : INFO : PROGRESS: at sentence #10000, processed 193710 words, keeping 3035 word types
2020-12-11 17:06:01,917 : INFO : PROGRESS: at sentence #20000, processed 394373 words, keeping 4980 word types
2020-12-11 17:06:01,943 : INFO : PROGRESS: at sentence #30000, processed 577098 words, keeping 6834 word types
2020-12-11 17:06:01,968 : INFO : PROGRESS: at sentence #40000, processed 766631 words, keeping 8283 word types
2020-12-11 17:06:01,995 : INFO : PROGRESS: at sentence #50000, processed 950406 words, keeping 9560 word types
2020-12-11 17:06:02,025 : INFO : PROGRESS: at sentence #60000, processed 1146983 words, keeping 10998 word types
2020-12-11 17:06:02,052 : INFO : PROGRESS: at sentence #70000, processed 1338317 words, keeping 11734 word types
2020-12-11 17:06:02,072 : INFO : collected 12129 

In [132]:
test_desc

'【拓扑适配】【新需求---US20200804307566 OpenORB由于衰退期软件需要切换】【自动发现】导入IP地址后点击保存IP地址，保存后再次打开自动发现，保存的IP只不存在，请定位'

In [51]:
class animal:
    """Toy base class that counts how many instances have been created."""

    # Number of instances constructed so far; stored on `animal` itself,
    # so instances of subclasses are counted here as well.
    count = 0

    def __init__(self):
        """Announce the new instance and bump the shared counter."""
        print('New instance is created!')
        animal.count = animal.count + 1

    def run(self):
        """Print a fixed message."""
        print('animal is running...')
class dog(animal):
    """Subclass of `animal` that adds and overrides nothing."""

dog1 = dog()
dog1.run()

New instance is created!
animal is running...
