## 參考文件
* pandas ：https://pandas.pydata.org/docs/user_guide/index.html
* jieba ：https://github.com/fxsjy/jieba
* pickle : https://www.datacamp.com/community/tutorials/pickle-python-tutorial
* TF-IDF : https://medium.com/datamixcontent-lab/%E6%96%87%E6%9C%AC%E5%88%86%E6%9E%90%E5%85%A5%E9%96%80-%E6%A6%82%E5%BF%B5%E7%AF%87-%E7%B5%A6%E6%88%91%E4%B8%80%E6%AE%B5%E8%A9%B1-%E6%88%91%E5%91%8A%E8%A8%B4%E4%BD%A0%E9%87%8D%E9%BB%9E%E5%9C%A8%E5%93%AA-%E5%B0%8D%E6%96%87%E6%9C%AC%E9%87%8D%E9%BB%9E%E5%AD%97%E8%A9%9E%E5%8A%A0%E6%AC%8A%E7%9A%84tf-idf%E6%96%B9%E6%B3%95-f6a2790b4991
* TextRank : https://tan800630.medium.com/textrank-%E6%BC%94%E7%AE%97%E6%B3%95%E4%BB%8B%E7%B4%B9-e73b44679bce

# 匯入套件

In [None]:
import pandas as pd
import pickle
import jieba
import jieba.analyse as analyse
import jieba.posseg as pseg

# Pickle

In [None]:
# Sample objects for the pickle demo: a short list and a small dict.
x1 = list(range(1, 4))

x2 = {str(number): number for number in (1, 2)}

In [None]:
# Write: serialize x1 to lawtech-test.pkl.
# The with-block closes the file even if pickle.dump raises, so the handle
# is never leaked (the manual open()/close() pair skipped close() on error).
with open('lawtech-test.pkl', 'wb') as outfile:
    pickle.dump(x1, outfile)

In [None]:
# Read: load the pickled object back from lawtech-test.pkl.
# The file was previously opened and never closed; the with-block releases
# the handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code, so only load trusted files.
with open("lawtech-test.pkl", "rb") as outfile:  # open in binary mode
    data = pickle.load(outfile)  # deserialize the stored object
print(data)

# Pandas

### 建立 DataFrame

In [None]:
# Record-oriented sample data: one dict per student.
dataset1 = [
    dict(std_name='wyne', std_no='06170171'),
    dict(std_name='nick', std_no='06170271'),
]

# Column-oriented sample data: one list per column.
dataset2 = dict(
    std_name=['wyne', 'nick'],
    std_no=['06170171', '06170271'],
)

In [None]:
# Build the DataFrame from the column-oriented dict, with an explicit
# column order.
columns = ['std_name', 'std_no']
df = pd.DataFrame(data=dataset2, columns=columns)

In [None]:
# Preview the first rows of the DataFrame.
df.head()

### 新增資料

In [None]:
# Add a column: create is_std, assign it the scalar 1, then preview the result.
df['is_std'] = 1
df.head()

In [None]:
# Add a row.
dataset3 = {
    'std_name': 'Alex',
    'std_no': '06170371',
    'is_std': 1,
}
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat. The row appended is dataset3; the earlier code appended dataset1,
# which duplicated the existing wyne/nick rows and left dataset3 unused.
# Wrapping the dict in a list turns it into a one-row frame, and
# ignore_index=True renumbers the combined index.
df = pd.concat([df, pd.DataFrame([dataset3])], ignore_index=True)
df.head()

In [None]:
# Set a single cell: row label 2, column is_std.
# One .loc call writes into df itself. The chained form df['is_std'][2] = 1
# assigns into an intermediate Series, which triggers SettingWithCopyWarning
# and does not update df when pandas Copy-on-Write is enabled.
df.loc[2, 'is_std'] = 1

### 查詢資料

In [None]:
# Select a single column (all rows) as a Series.
df.loc[:, 'std_name']

In [None]:
# Convert a single column to a plain Python list.
name_column = df['std_name']
name_column.values.tolist()

In [None]:
# Select a single row by position (position 1, all columns).
df.iloc[1, :]

In [None]:
# Convert a single row (position 1) to a plain Python list.
second_row = df.iloc[1]
second_row.values.tolist()

In [None]:
# Filter rows on one condition using a boolean mask.
has_target_no = df['std_no'] == '06170171'
df[has_target_no]

In [None]:
# Filter rows on several conditions: build one mask per condition and
# combine them with the element-wise & operator.
name_matches = df['std_name'] == 'wyne'
number_matches = df['std_no'] == '06170171'
df[name_matches & number_matches]

In [None]:
# Filter rows on one condition and return only the std_name column.
has_target_no = df['std_no'] == '06170171'
df.loc[has_target_no, 'std_name']

In [None]:
# Filter rows on one condition and return the std_name values as a list.
has_target_no = df['std_no'] == '06170171'
matching_names = df.loc[has_target_no, 'std_name']
matching_names.values.tolist()

### 刪除資料

In [None]:
# Drop a column by name. The result is only displayed here; it is not
# assigned back to df.
df.drop(columns=['is_std'])

In [None]:
# Drop a row by index label. The result is only displayed here; it is not
# assigned back to df.
df.drop(index=[2])

### 特殊功能

In [None]:
# Count how many times each distinct std_name occurs.
name_column = df['std_name']
name_column.value_counts()

In [None]:
# Dimensions of the DataFrame: total number of rows and columns.
df.shape

In [None]:
# Inspect the DataFrame object by calling its info() summary.
df.info()

In [None]:
# Export the DataFrame to a UTF-8 encoded CSV file.
csv_path = 'test_lawtech.csv'
df.to_csv(csv_path, encoding='utf-8')

In [None]:
# Export the DataFrame to an Excel workbook (.xlsx).
# The `encoding` keyword is no longer passed: it was an unused argument,
# deprecated in pandas 1.5 and removed in 2.0, where passing it raises
# TypeError.
df.to_excel('test_lawtech.xlsx')

In [None]:
# Python basics 1 - lambda: an anonymous function bound to a name; it returns
# its argument plus one.
output = lambda x: x + 1

In [None]:
# Apply an inline lambda to every value of the is_std column.
is_std_column = df['is_std']
is_std_column.apply(lambda x: x + 1)

In [None]:
# Python basics 2 - named function
def add(x):
    """Return ``x`` incremented by one."""
    incremented = x + 1
    return incremented

In [None]:
# Apply the named function add to every value of the is_std column.
is_std_column = df['is_std']
is_std_column.apply(add)

In [None]:
# Count missing (NaN) versus present values in the is_std column.
is_missing = df['is_std'].isna()
is_missing.value_counts()

In [None]:
# Reset the index, passing drop=True. The result is only displayed here; it
# is not assigned back to df.
df.reset_index(drop=True)

In [None]:
# Time series: store the date as a plain string in every row, then check the
# Python type of the value at row label 0.
df['time'] = '2019-06-02'
type(df.loc[0, 'time'])

In [None]:
# Convert the time column with pd.to_datetime, then check the type of the
# value at row label 0 again.
parsed_time = pd.to_datetime(df['time'])
df['time'] = parsed_time
type(df.loc[0, 'time'])

In [None]:
# HTML scraping (only works for content inside <table> tags).
bus_fare_url = "https://www.ubus.com.tw/Booking/FareInquiry"
bus_table = pd.read_html(bus_fare_url)
print(bus_table[0])

In [None]:
# Scrape the tables from a judicial search-result page and display them.
law_query_url = "https://law.judicial.gov.tw/FJUD/qryresultlst.aspx?ty=JUDBOOK&q=67eba8f847ad242311f991eabce86127"
law_table = pd.read_html(law_query_url)
law_table

# jieba

In [None]:
# jieba.cut
## Parameter:
### cut_all (True = full mode / False [default] = accurate mode)
text = "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年，实现营业收入0万元，实现净利润-139.13万元。"
seg_list = jieba.cut(text,cut_all=True)
print(list(seg_list))

In [None]:
# jieba.analyse keyword extraction based on TF-IDF: print each keyword
# followed by its weight.
for keyword, weight in analyse.extract_tags(text, withWeight=True):
    print(f'{keyword} {weight}')

In [None]:
# jieba.analyse keyword extraction based on TextRank: print each keyword
# followed by its weight.
for keyword, weight in analyse.textrank(text, withWeight=True):
    print(f'{keyword} {weight}')

In [None]:
# Part-of-speech tagging: print each token followed by its POS flag.
# NOTE(review): use_paddle=True is passed, but no jieba.enable_paddle() call
# is visible in this file -- confirm whether paddle mode is actually active.
words = pseg.cut(text, use_paddle=True)  # paddle mode
for token, pos_flag in words:
    print(f'{token} {pos_flag}')