In [1]:
import pandas as pd
import numpy as np

In [73]:
filename = "./data/text_classify_1.csv"

In [74]:
df = pd.read_csv(filename)

In [75]:
df.head()

Unnamed: 0,target,context
0,买房,房子还在吗
1,买房,什么时候可以看房？
2,买房,您好，请问这套房子还在吗？
3,买房,你好，我想在浦东新场附近买个商住两用的房子
4,买房,我是想全款买套房，560左右，想找个不满二税费高的房子，这样我买就很划算


In [102]:
print("在 target 列中总共有 %d 个空值." % df['target'].isnull().sum())
print("在 context 列中总共有 %d 个空值." % df['context'].isnull().sum())
df[df.isnull().values==True]
df = df[pd.notnull(df['context'])]

在 target 列中总共有 0 个空值.
在 context 列中总共有 0 个空值.


In [103]:
import jieba
import re
jieba.load_userdict('./data/dict.txt')
#定义删除除字母,数字，汉字以外的所有符号的函数
def remove_punctuation(line):
    line = str(line)
    if line.strip()=='':
        return ''
    rule = re.compile(u"[^a-zA-Z0-9\u4E00-\u9FA5]")
    line = rule.sub('',line)
    return line
 
def stopwordslist(filepath):  
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]  
    return stopwords  
 
#加载停用词
stopwords = stopwordslist("./data/stop_words.txt")

In [104]:
#删除除字母,数字，汉字以外的所有符号
df['clean_content'] = df['context'].apply(remove_punctuation)
df.sample(10)

Unnamed: 0,target,context,clean_content,cut_context,target_id
1057,咨询,通过个人网上业务平台登录时，“普通用户”忘记密码如何处理？,通过个人网上业务平台登录时普通用户忘记密码如何处理,网上 业务 平台 登录 时 普通用户 忘记 密码,2
776,租赁,计划租一个大约7000块钱上下次卧地公寓,计划租一个大约7000块钱上下次卧地公寓,计划 租 7000 块钱 下次 卧地 公寓,1
151,买房,找一个maifang大约700萬毛坯,找一个maifang大约700萬毛坯,找 maifang 700 萬 毛坯,0
678,租赁,看看海淀安宁庄6000的房子,看看海淀安宁庄6000的房子,海淀 安宁 庄 6000,1
811,咨询,房地产开发商哪家好？该如何选择开发商？,房地产开发商哪家好该如何选择开发商,房地产 开发商 哪家 好该 选择 开发商,2
688,租赁,是不是整租,是不是整租,整租,1
1159,咨询,如何开通住房补贴个人网上业务系统使用权限？,如何开通住房补贴个人网上业务系统使用权限,开通 住房补贴 网上 业务 系统 权限,2
360,买房,我想要mai大概950W以里精装的房子,我想要mai大概950W以里精装的房子,想要 mai 950W 以里 精装,0
275,买房,我想要大概400WRMB以内精装的屋子,我想要大概400WRMB以内精装的屋子,想要 400WRMB 以内 精装 屋子,0
1022,咨询,外籍、港澳台在京工作人员离职如何提取住房公积金？,外籍港澳台在京工作人员离职如何提取住房公积金,外籍 港澳台 京 工作人员 离职 提取 住房 公积金,2


In [105]:
#分词，并过滤停用词
df['cut_context'] = df['clean_content'].apply(lambda x: " ".join([w for w in list(jieba.cut(x)) if w not in stopwords]))
df.tail()

Unnamed: 0,target,context,clean_content,cut_context,target_id
1182,咨询,你的城市房租收入比是多少？,你的城市房租收入比是多少,城市 房租 收入 比是,2
1183,咨询,北京老人能投靠子女落户吗？怎么办理？,北京老人能投靠子女落户吗怎么办理,北京 老人 投靠 子女 落户 办理,2
1184,咨询,北京能办理夫妻投靠落户吗？怎么办理？,北京能办理夫妻投靠落户吗怎么办理,北京 办理 夫妻 投靠 落户 办理,2
1185,咨询,2017年环京限购政策一览,2017年环京限购政策一览,2017 年 环京 限购 政策 一览,2
1186,咨询,买商铺需要什么资格，要注意什么？,买商铺需要什么资格要注意什么,买 商铺 资格,2


In [328]:
from gensim.models import Word2Vec

word2vec_model = Word2Vec(df['cut_context'], size=100, window=2, min_count=5, workers=4)
#model.save("word2vec.model")

In [329]:
##特征工程

In [330]:
import time
def getVector_v1(cutWords, model_w):
        count = 0
        article_vector = np.zeros(100)
        for cutWord in cutWords:
            try:
                if cutWord in model_w:
                        article_vector += model_w[cutWord]
                        count += 1
            except KeyError:
                continue
        if count != 0:
            return article_vector / count
        else:
            return article_vector
 
startTime = time.time()
vector_list = []
for cutWords in df['cut_context']:
    #print(cutWords)
    vector_list.append(getVector_v1(cutWords, word2vec_model) )
X = np.array(vector_list)

print('Total Time You Need To Get X:%.2f秒' % (time.time() - startTime) )

Total Time You Need To Get X:0.17秒


  import sys
  


In [331]:
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
y = labelEncoder.fit_transform(df['target'])

In [332]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [333]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
lr_model = LogisticRegression(C=1.0, 
                            class_weight='balanced', 
                            dual=False, 
                            fit_intercept=False,
                            max_iter=10000,
                            multi_class='ovr', 
                            n_jobs=-1, 
                            penalty='l2', 
                            solver='lbfgs',
                            random_state=2019, 
                            tol=0.0001)
lr_model.fit(train_X,train_y)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=False, intercept_scaling=1, l1_ratio=None,
                   max_iter=10000, multi_class='ovr', n_jobs=-1, penalty='l2',
                   random_state=2019, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [334]:
lr_y_pred = lr_model.predict(test_X)

In [335]:
test_X

array([[ 1.74846313e-01,  1.37848028e-01,  1.26552732e-02, ...,
        -1.73304025e-01,  1.45268401e-01,  1.12070800e-01],
       [ 1.98234639e-01,  1.59262179e-01,  1.70744178e-03, ...,
        -2.15147700e-01,  1.74750092e-01,  1.27009476e-01],
       [ 2.06257808e-01,  1.65660626e-01, -1.13691510e-04, ...,
        -2.27007715e-01,  1.85515044e-01,  1.30936066e-01],
       ...,
       [ 1.84460615e-01,  1.45165396e-01,  1.11325172e-02, ...,
        -1.85311531e-01,  1.59023134e-01,  1.15954757e-01],
       [ 1.89842030e-01,  1.50663805e-01,  1.16109594e-02, ...,
        -1.97987235e-01,  1.60188174e-01,  1.18930071e-01],
       [ 1.90394097e-01,  1.54418047e-01,  2.56339446e-03, ...,
        -2.06731212e-01,  1.68761785e-01,  1.19188345e-01]])

In [336]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [337]:
print('accuracy %s' % accuracy_score(lr_y_pred, test_y))
print(labelEncoder.inverse_transform([[x] for x in range(3)]))
print(classification_report(test_y, lr_y_pred))

accuracy 0.6764705882352942
['买房' '咨询' '租赁']
              precision    recall  f1-score   support

           0       0.87      0.31      0.46        84
           1       0.76      1.00      0.86        76
           2       0.55      0.76      0.63        78

    accuracy                           0.68       238
   macro avg       0.72      0.69      0.65       238
weighted avg       0.73      0.68      0.64       238



  y = column_or_1d(y, warn=True)


In [338]:
def getVectorMatrix(text):
    return np.array([getVector_v1(text, word2vec_model)])
def lr_myPredict(text_pre):
    format_sec=" ".join([w for w in list(jieba.cut(remove_punctuation(text_pre))) if w not in stopwords])
    return lr_model.predict(getVectorMatrix(format_sec))[0]

In [339]:
featureMatrix.shape

(1, 100)

In [340]:
lr_myPredict("租房")

  import sys
  


1

In [341]:
sv_clf = LinearSVC(C=0.1,                                     
                   class_weight=None,       
                   fit_intercept=False,       
                   loss='squared_hinge',    
                   max_iter=10000,
                   multi_class='ovr',       
                   penalty='l2',        
                   random_state=2019, 
                   tol=0.0001)

In [342]:
sv_clf.fit(train_X, train_y)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=False,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=2019, tol=0.0001,
          verbose=0)

In [343]:
sv_y_pred = lr_model.predict(test_X)

In [344]:
print('accuracy %s' % accuracy_score(sv_y_pred, test_y))
print(labelEncoder.inverse_transform([[x] for x in range(3)]))
print(classification_report(test_y, sv_y_pred))

accuracy 0.6764705882352942
['买房' '咨询' '租赁']
              precision    recall  f1-score   support

           0       0.87      0.31      0.46        84
           1       0.76      1.00      0.86        76
           2       0.55      0.76      0.63        78

    accuracy                           0.68       238
   macro avg       0.72      0.69      0.65       238
weighted avg       0.73      0.68      0.64       238



  y = column_or_1d(y, warn=True)


In [345]:
from xgboost import XGBClassifier
xgb_clf = XGBClassifier(learning_rate=0.01,        #学习率
                        n_estimators=1000,          # 树的个数--1000棵树建立xgboost
                        max_depth=6,             # 树的深度
                        min_child_weight=1,        # 叶子节点最小权重
                        gamma=0.,                  # 惩罚项中叶子结点个数前的参数
                        subsample=0.8,             # 随机选择80%样本建立决策树
                        colsample_btree=0.8,       # 随机选择80%特征建立决策树
                        objective='multi:softmax', # 指定损失函数
                        colsample_bynode=1,        #每个节点（拆分）的列的子样本比率
                        colsample_bylevel=1,       #构建每一层时，列采样率 
                        nthread=20,                #并行线程数
                        random_state=2019)

In [346]:
xgb_clf.fit(train_X,train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_btree=0.8,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0.0, learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=20, objective='multi:softprob', random_state=2019,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, verbosity=1)

In [347]:
xgb_y_pred = xgb_clf.predict(test_X)

In [348]:
print('accuracy %s' % accuracy_score(xgb_y_pred, test_y))
print(labelEncoder.inverse_transform([[x] for x in range(3)]))
print(classification_report(test_y, xgb_y_pred))

accuracy 0.9243697478991597
['买房' '咨询' '租赁']
              precision    recall  f1-score   support

           0       0.95      0.89      0.92        84
           1       0.94      0.96      0.95        76
           2       0.89      0.92      0.91        78

    accuracy                           0.92       238
   macro avg       0.92      0.93      0.92       238
weighted avg       0.93      0.92      0.92       238



In [354]:
def getVectorMatrix(text):
    return np.array([getVector_v1(text, word2vec_model)])
def xgb_myPredict(text_pre):
    format_sec=" ".join([w for w in list(jieba.cut(remove_punctuation(text_pre))) if w not in stopwords])
    c = xgb_clf.predict(getVectorMatrix(format_sec))[0]
    if c == 0:
        return "买房"
    elif c == 1:
        return "咨询"
    else:
        return "租赁"

In [355]:
xgb_myPredict("租三室一厅")

  import sys
  


'租赁'