In [1]:
import pandas as pd
import numpy as np 
import re
import matplotlib.pyplot as plt

# 資料載入，raw data檢查

In [31]:
df = pd.read_csv('data/新回饋.csv', delimiter=',',encoding='ANSI', names=['文本','編號'])

In [32]:
df.head(3)

Unnamed: 0,文本,編號
0,學生意見,隨機編號
1,讚,5
2,酷,5


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5657 entries, 0 to 5656
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   文本      5655 non-null   object
 1   編號      5657 non-null   object
dtypes: object(2)
memory usage: 88.5+ KB


In [34]:
# df['標記'].value_counts().plot(kind='barh', title= 'raw data')
# plt.show()

------------

# 前處理
    1. 刪除無效文本 (回覆字數小於4)
    2. 刪除英文文本
    3. 刪除重複文本
    4. 刪除特殊字元(@, #, >, <,"",等)，並將","替代為"，"
    5. 針對文本以句號或細項(數字)斷句

## Heleper function
#### 斷句標準
1. 如果有數字+. 則斷句
2. 如果是句號，則斷句
3. 如果有分號； 則斷句

In [63]:
def remove_punctuation(line):
    rule = re.compile(r"[^a-zA-Z0-9\u4e00-\u9fa5\，]")
    line = rule.sub('',line)
    return line

In [36]:
def preprocess_txt(txt):
    txt = re.sub(r'\s+','',txt)
    txt = re.sub(r',', '，', txt) #把, 轉成，
    txt = re.sub(r'[ˇ></)/(/）/（“”「」]','', txt) # 移除ˇ > < ) ( 符號。
    return txt

In [37]:
def text_len(txt):
    return len(txt)

In [38]:
def split_txt(txt):
    results = re.split(r'\d+\.|。|；', txt) # split txt by [number]. or 。
    return list(filter(None, results))

In [39]:
# 非字串直接刪除
for idx, txt in enumerate(df['文本']):
    if type(txt) != str:
        df.drop(idx, inplace= True)

## 2. 文字處理 + 刪除無效文本
若文本字數小於4，則視為無效文本

In [40]:
# df['文本'] = df['文本'].apply(lambda txt: preprocess_txt(txt))

In [41]:
# df['文本'] = df['文本'].apply(lambda txt: remove_punctuation(txt))

In [42]:
# df = df[df['文本'].apply(text_len)>4]

In [43]:
# df.head(5)

## 1. 刪除英文文本

In [44]:
from langdetect import detect

for idx, txt in zip(df.index,df['文本']):
    try:
        if detect(txt) == 'en':
            df.drop(idx, inplace= True)
        else: 
            df.loc[idx, 'lang'] = detect(txt)
    except:
        pass

## 3. 刪除重複文本
or by using tf-idf?

In [45]:
df.drop_duplicates(subset=['文本'],inplace= True)

## 4. 依照項目斷句
* 為了將相同文本斷句後仍可以將斷句對應回原本文本編號，建立df_new(注意：不能drop=True)
* split_txt 為 helper func，將文本切成list，包含sub_txt
* 依照sub_txt重新建立新dataframe = df_more

In [64]:
df_new = df.reset_index().rename(columns={"index": "文本編號"})

In [65]:
df_new['文本'] = df_new['文本'].apply(split_txt)

In [66]:
df_new.columns

Index(['文本編號', '文本', '編號', 'lang'], dtype='object')

In [67]:
data = []
for idx, row in df_new.iterrows():
    for sub_txt in row['文本']:
        data.append([row['文本編號'], sub_txt, row['編號'], row['lang']])

In [68]:
cols = df_new.columns
df_more = pd.DataFrame(data,columns = cols)

In [69]:
df_more.tail(5)

Unnamed: 0,文本編號,文本,編號,lang
5324,5656,\t不同意，問問題的回饋很多都是助教在回答，要不就是不回答說會影響成績，考卷也都不發回來給我...,2,ko
5325,5656,學期末了上禮拜本來說要讓我們確認成績，但突然說停課一次之後再補，以至於我們到現在連自己幾分、...,2,ko
5326,5656,\n,2,ko
5327,5656,\t不同意，評量方式公平並沒有錯，但一點都不適切,2,ko
5328,5656,實際上python的應用是在上機，並不是在紙筆測驗考背誦能力、細心度，而應要注意的是是否能確...,2,ko


In [70]:
df_more['文本'] = df_more['文本'].apply(lambda txt: remove_punctuation(txt))

In [71]:
df_more = df_more[df_more['文本'].apply(text_len)>4].drop(['lang'],axis=1)

In [72]:
df_more.drop_duplicates(subset=['文本'],inplace= True)

In [73]:
print('斷句後回饋文本數: {}'.format(len(df_more)))

斷句後回饋文本數: 3864


In [74]:
df_more.tail(5)

Unnamed: 0,文本編號,文本,編號
5322,5656,但很多是當下有問題需要解決的，而並非學期結束後再做詢問,2
5324,5656,不同意，問問題的回饋很多都是助教在回答，要不就是不回答說會影響成績，考卷也都不發回來給我們看...,2
5325,5656,學期末了上禮拜本來說要讓我們確認成績，但突然說停課一次之後再補，以至於我們到現在連自己幾分錯...,2
5327,5656,不同意，評量方式公平並沒有錯，但一點都不適切,2
5328,5656,實際上python的應用是在上機，並不是在紙筆測驗考背誦能力細心度，而應要注意的是是否能確切...,2


## 5. 寫入檔案，建立新csv file

In [75]:
import csv

In [76]:
with open('data/第一階段處理完新回饋.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    
    writer.writerow(list(df_more.columns))
    
    for idx in range(len(df_more)):
    
        writer.writerow(list(df_more.iloc[idx].values))

--------------

## Trial 

In [77]:
re.split(r'\d+\.|。','哇哈哈。到底是11.怎樣11、根本但是目前18:00')

['哇哈哈', '到底是', '怎樣11、根本但是目前18:00']

In [78]:
df_more[df_more['文本'].apply(text_len)>400].loc[1214]['文本']

KeyError: 1214

In [None]:
# pattern = re.compile(r'，')
# matches = pattern.finditer(txt_long)
# matches_list = list(matches)
# middle_idx = int(len(matches_list)/2)
# print(middle_idx)


-----

# 正負面情緒用詞分析

In [None]:
from wordcloud import WordCloud
import jieba
from jieba import analyse

### 結巴分詞

In [None]:
jieba.set_dictionary('jieba_traditional.txt')

In [None]:
analyse.set_stop_words('ch_stop_words.txt')

In [None]:
df_clean = df_more

In [None]:
print('正面回饋佔比: {:.3f}'.format(df_clean['標記'].value_counts()[1]/len(df_clean)))
print('負面回饋佔比: {:.3f}'.format(df_clean['標記'].value_counts()[-1]/len(df_clean)))
print('中性回饋佔比: {:.3f}'.format(df_clean['標記'].value_counts()[0]/len(df_clean)))

In [None]:
# with open('df_clean.csv', 'w', newline='', encoding= 'utf-8') as csv_file:
#     writer = csv.writer(csv_file, delimiter=',')
    
#     writer.writerow(list(df_clean.columns))
    
#     for idx in range(len(df_clean)):
    
#         writer.writerow(list(df_clean.iloc[idx].values))

In [None]:
df_neg = df_clean[df_clean['標記'] == -1]
df_pos = df_clean[df_clean['標記'] == 1]

In [None]:
# txt = ''.join(jieba.cut(stc))
# cloud = WordCloud().generate(txt)
# cloud.to_file('output.png')

------

# TF-IDF 
## 關鍵字擷取
#### 需load stop_words.txt, user_dict.txt

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
# from ckiptagger import data_utils, construct_dictionary, WS, POS, NER

In [None]:
class info_extractor:
    
    def __init__(self, clean_corpus):
        
        self.clean_corpus= clean_corpus
        
        with open('ch_stop_words.txt', 'r', encoding='utf-8') as f:
            self.stop_words = f.read().splitlines()
            
        jieba.load_userdict('user_dict.txt')
        
        self.vectorizer= TfidfVectorizer(max_df= 0.8,
                              min_df= 0.001,
                              ngram_range= (1,1),
                              stop_words= self.stop_words,
                              token_pattern=r"(?u)\b\w+\b")
        
    def keyword_extraction(self, idx, topK=10, pos= True):

        df_neg = self.clean_corpus[self.clean_corpus['標記'] == -1]
        df_pos = self.clean_corpus[self.clean_corpus['標記'] == 1]

        corpus = list(df_neg['文本']) if pos == False else list(df_pos['文本'])
        
        if idx >= len(corpus):
            print('請輸入小於 {} 的數字'.format(len(corpus)))
            return
        
        corpus_cleaned= []
        for txt in corpus:
            words_cut = jieba.cut(txt, cut_all= False)
            corpus_cleaned.append(' '.join(words_cut))

        tfidf_matrix = self.vectorizer.fit_transform(corpus_cleaned)
        tfidf_df = pd.DataFrame(tfidf_matrix.T.toarray(), index= self.vectorizer.get_feature_names())
        
        keywords = tfidf_df[idx][tfidf_df[idx]>0].sort_values(ascending=False)[:topK]
        
        print('關鍵字 及 關鍵字分數:\n ')
        for i, (kw, score) in enumerate(keywords.items()):
            print('{}. {} : {:.2f}'.format(i+1, kw, score))
            
        print('\n原回饋內容:\n\n{}'.format(corpus[idx]))
            
        return list(jieba.cut(corpus[idx]))
    
    def document_matching(self, input_txt, topK = 3):
        
        corpus = list(self.clean_corpus['文本'])

        corpus.append(input_txt)

        corpus_cut= []
        for txt in corpus:
            words_cut = jieba.cut(txt)
            corpus_cut.append(' '.join(words_cut))

        vectorizer= TfidfVectorizer(max_df= 0.9,
                              min_df= 0.001,
                              ngram_range= (1,3),
                              stop_words= None,
                              token_pattern=r"(?u)\b\w+\b")

        tfidf_matrix = vectorizer.fit_transform(corpus_cut)

        cosine_similarities = linear_kernel(tfidf_matrix[-1], tfidf_matrix).flatten()

        related_docs_indices = cosine_similarities.argsort()[:-(topK+2):-1]
        # print(related_docs_indices)
        # print(cosine_similarities[related_docs_indices])
        print('查詢關鍵字: {}\n'.format(txt))
        print('前 {} 個最佳符合的回饋內容:\n'.format(topK))
        for idx, (docu_idx, score) in enumerate(zip(related_docs_indices[1:], cosine_similarities[related_docs_indices[1:]])):
            print('{}. {} 相似程度: {:.2f}\n'.format(idx+1, corpus[docu_idx], score))

In [None]:
extractor= info_extractor(df_clean)

In [None]:
cut_results= extractor.keyword_extraction(63, pos= True)

In [None]:
## neg ok 364 362 353

-----

# 文本查詢

In [None]:
txt = '教學品質'
extractor.document_matching(txt, topK= 5)