In [1]:
import os
import numpy as np
import pandas as pd
import re,random
import jieba
import time
from sklearn import metrics
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn import tree
#from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [3]:
# Stop words used by preprocess(): one entry per line of the file, with
# surrounding whitespace (including the trailing newline) stripped.
stop_list = []
with open('./stopword.txt','r',encoding='utf-8') as f:
    stop_list.extend(entry.strip() for entry in f)

In [4]:
def preprocess(data,all_data,category):
    """Clean, segment and label raw texts, appending the results to ``all_data``.

    For every text the function strips URLs, removes every non-word
    character, ASCII letters and digits, and characters in the
    U+FF01-U+FF5A range, segments what is left with ``jieba.lcut``, and keeps
    only tokens longer than one character that are not in the module-level
    ``stop_list``.

    Parameters
    ----------
    data : iterable of str
        Raw texts.
    all_data : list
        Accumulator that receives one ``(tokens_joined_by_space, label)``
        tuple per text. It is mutated in place.
    category : iterable
        Labels, paired with ``data`` by position. If the two inputs differ in
        length, the extra items of the longer one are ignored.

    Returns
    -------
    list
        The same ``all_data`` object that was passed in.
    """
    # Build the lookup set once; testing membership against the list itself
    # costs a full scan for every token.
    stop_words = set(stop_list)
    # zip pairs texts and labels positionally, so a pandas Series whose index
    # is not 0..n-1 is still matched to the right label.
    for line, label in zip(data, category):
        # URLs must go first: the substitutions below delete ':', '/', '.' and
        # all ASCII letters, after which this pattern could never match. The
        # trailing '+' makes it consume the whole URL, not a single character.
        line = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','',line)
        line = re.sub(r'[^\w]','',line)
        line = re.sub(r'[A-Za-z0-9]','',line)
        line = re.sub(u'[\uFF01-\uFF5A]','',line)
        tokens = [tok for tok in jieba.lcut(line)
                  if len(tok) > 1 and tok not in stop_words]
        all_data.append( (' '.join(tokens), label) )
    return all_data

In [5]:
def load_data(fileName, filePath="label_result/"):
    """Load the positive and negative CSVs for ``fileName`` and preprocess them.

    Reads ``<filePath><fileName>.csv_P.csv`` and ``<filePath><fileName>.csv_N.csv``,
    stacks them, concatenates each row's ``title`` and ``content`` (missing
    values treated as empty strings) and passes the texts together with the
    ``label`` column to ``preprocess``.

    Parameters
    ----------
    fileName : str
        Base name of the CSV pair, without the ``.csv_P.csv`` / ``.csv_N.csv``
        suffix.
    filePath : str, optional
        Prefix prepended verbatim to the file names; include the trailing
        separator. Defaults to ``"label_result/"``.

    Returns
    -------
    list
        Whatever ``preprocess`` returns for the loaded texts and labels,
        starting from an empty accumulator.
    """
    pos = pd.read_csv(filePath + fileName + ".csv_P.csv")
    neg = pd.read_csv(filePath + fileName + ".csv_N.csv")
    # ignore_index gives the stacked frame a fresh 0..n-1 index instead of
    # repeating the row labels of the two source files.
    pre_data = pd.concat([pos,neg],axis=0,ignore_index=True)
    text = pd.DataFrame({
        "text": pre_data["title"].fillna("")+pre_data["content"].fillna(""),
        "label": pre_data["label"],
    })
    return preprocess(text["text"],[],text["label"])

In [6]:
fileName = '台積電'
# One seed for both the shuffle and the split, so the train/test partition
# (and every score computed from it) is the same on each run.
SEED = 666
data = load_data(fileName)
# A dedicated Random instance keeps the shuffle reproducible without touching
# the global random state.
random.Random(SEED).shuffle(data)
x,y = zip(*data)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=SEED)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\xiaoy\AppData\Local\Temp\jieba.cache
Loading model cost 0.588 seconds.
Prefix dict has been built successfully.


In [9]:
# Raw-count features: word-level 2- to 6-grams, vocabulary capped at 8000.
vec = CountVectorizer(
    max_features=8000,
    ngram_range=(2,6),
    analyzer='word'
).fit(x_train)

# TF-IDF features with the same n-gram range and vocabulary cap. Both
# vectorizers are fitted on the training split only.
tvec = TfidfVectorizer(
    max_features=8000,
    ngram_range=(2,6),
    analyzer='word'
)
tvec.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=8000,
                min_df=1, ngram_range=(2, 6), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [14]:
# Baseline: decision tree on the TF-IDF n-gram features.
# random_state is fixed so that the fitted tree, and therefore the accuracy
# shown below, is the same on every run; 666 matches the train/test split.
dt = tree.DecisionTreeClassifier(random_state=666)
dt.fit(tvec.transform(x_train),y_train)
# Mean accuracy on the held-out split.
dt.score(tvec.transform(x_test),y_test)

0.567223963775688

In [16]:
# Gradient-boosted trees with default hyper-parameters on the same TF-IDF
# features as the baseline.
# NOTE(review): the recorded predictions show -1/1 labels; some XGBoost
# releases require classes encoded as 0..n_classes-1 - confirm against the
# installed version.
xgboost = XGBClassifier().fit(tvec.transform(x_train), y_train)
# Score on the held-out split.
xgboost.score(tvec.transform(x_test), y_test)

0.584813653779171

In [18]:
# Sanity check: run one pre-segmented headline through both classifiers and
# print each predicted label (decision tree first, then XGBoost).
for _clf in (dt, xgboost):
    print(_clf.predict(tvec.transform(['資本 美元 資挺鴻海 目標 外資'])))

[-1.]
[1.]
