In [1]:
import random
import json
import jieba
import numpy as np
from random import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split
from config import *


# Number of NLP feature columns per sample; the feature tables loaded further
# down print as (n, 16). Not referenced by any other visible cell.
total_nlp_features = 16


# Patterns used by filter(); compiled once instead of on every call.
# Numeric user-name prefix such as "10086::" at the very start of the message.
_USERNAME_RE = re.compile(r"^\d+::")
# URLs. [a-zA-Z0-9] is used instead of \w so that Chinese text is never swallowed.
_URL_RE = re.compile(r"""
        (https?://)?
        ([a-zA-Z0-9]+)
        (\.[a-zA-Z0-9]+)
        (\.[a-zA-Z0-9]+)*
        (/[a-zA-Z0-9]+)*
    """, re.VERBOSE|re.IGNORECASE)
# Date words: weekday names (Monday-Sunday) and the year / month / day markers.
# Weekdays come first so that "周日" is removed as a whole instead of leaving "周".
_DATE_RE = re.compile(r"周[一二三四五六日]|[年月日]")
# Digit runs that are not the tail of an alphanumeric token: "123" and "我123"
# lose their digits, while "iPhone12" is left intact. The lookbehind consumes
# nothing, so the character in front of the digits is never deleted.
_DECIMAL_RE = re.compile(r"(?<![a-zA-Z0-9])\d+")
# Any whitespace.
_SPACE_RE = re.compile(r"\s+")


def filter(line):
    """Clean one raw message before word segmentation.

    Removes, in this order: the leading ``<digits>::`` user-name prefix, URLs,
    date words, digit runs that do not follow a Latin letter or digit, and all
    whitespace.

    Note: the name shadows the built-in ``filter``; it is kept because other
    cells call it by this name.

    Args:
        line: the raw message text.

    Returns:
        The cleaned text as a single string without whitespace.
    """
    for pattern in (_USERNAME_RE, _URL_RE, _DATE_RE, _DECIMAL_RE, _SPACE_RE):
        line = pattern.sub("", line)
    return line

def stopwordslist(filepath):
    """Read a UTF-8 stop-word file and return its lines as a list.

    Each line is stripped of surrounding whitespace; blank lines are kept as
    empty strings. The file is closed as soon as it has been read.

    Args:
        filepath: path of the stop-word file, one word per line.

    Returns:
        A list of stripped lines in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]

# Stop words already loaded, keyed by file path, so the file is read once
# instead of once per sentence.
_STOPWORD_CACHE = {}


def _stopword_set(filepath):
    """Return the stop words in `filepath` as a set, loading the file on first use only."""
    if filepath not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[filepath] = set(stopwordslist(filepath))
    return _STOPWORD_CACHE[filepath]


def trans(sentence,cutwordslist = None):
    """Clean and segment a sentence into a space-separated token string.

    The sentence is passed through filter(), cut with jieba in accurate mode
    (cut_all=False), and every token that is a stop word or a tab is dropped.

    Args:
        sentence: the raw text.
        cutwordslist: optional list; when given, the kept tokens are appended
            to it in place so the caller can collect vocabulary statistics.

    Returns:
        The kept tokens, each followed by a single space (so a non-empty
        result ends with a trailing space; no kept tokens gives "").
    """
    stopwords = _stopword_set('data/stopwords.txt')  # stop-word file path
    kept = [
        word
        for word in jieba.cut(filter(sentence), cut_all=False)
        if word not in stopwords and word != '\t'
    ]
    if cutwordslist is not None:
        cutwordslist.extend(kept)
    return "".join(word + " " for word in kept)

# Load the text triples: every line of the input file is a JSON object whose
# "A", "B" and "C" fields become one row of X.
X = []
# The with-block closes the file once all lines are read.
with open("../input/input.txt", "r", encoding="utf8") as f:
    for line in f:
        x = json.loads(line)
        X.append([x["A"],x["B"],x["C"]])

X = np.array(X)
# Split the row positions alongside the rows so that other per-row tables can
# be split the same way later.
indices = np.arange(X.shape[0])
X_train, X_valid,indices_train,indices_valid = train_test_split(X,indices, test_size = 0.1, random_state = 1996)

### TF-IDF similarity features

In [2]:
# Flatten the training triples row by row into one flat list of texts.
rows, cols = X_train.shape
se = [X_train[i][j] for i in range(rows) for j in range(cols)]

In [3]:
def tfidf_pair_scores(triples, model):
    """Compute TF-IDF dot-product scores for every (d1, d2, d3) text triple.

    Each text is tokenised with trans() and vectorised with `model`; the score
    of a pair is the dot product of the two TF-IDF row vectors.

    Args:
        triples: iterable of rows holding exactly three raw texts each.
        model: a fitted vectoriser exposing ``transform``.

    Returns:
        A tuple ``(scores_12, scores_13)`` of arrays with shape (n, 1): the
        d1·d2 score and the d1·d3 score of each row.
    """
    scores_12 = []
    scores_13 = []
    for d1, d2, d3 in triples:
        # Only three rows are densified at a time, so memory stays small.
        vectors = model.transform([trans(d1), trans(d2), trans(d3)]).toarray()
        scores_12.append(np.dot(vectors[0], vectors[1]))
        scores_13.append(np.dot(vectors[0], vectors[2]))
    return (
        np.array(scores_12).reshape(-1, 1),
        np.array(scores_13).reshape(-1, 1),
    )


# Tokenise the flattened training texts. A new list is built so that `se`
# keeps the raw texts and re-running this cell gives the same result.
cutwordslist = []
data = [trans(text, cutwordslist) for text in se]

# Fit the vocabulary / IDF weights on the training texts only. The token
# pattern also keeps single-character tokens.
tfidf_model = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b").fit(data)
sparse_result = tfidf_model.transform(data)

valid_tfidf_1, valid_tfidf_2 = tfidf_pair_scores(X_valid, tfidf_model)
train_tfidf_1, train_tfidf_2 = tfidf_pair_scores(X_train, tfidf_model)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/8j/p8231m8944s1l43bpl8h5t780000gn/T/jieba.cache
Loading model cost 0.678 seconds.
Prefix dict has been built succesfully.


In [4]:
# Report the shape of each TF-IDF score column: training first, then validation.
for scores in (train_tfidf_1, train_tfidf_2, valid_tfidf_1, valid_tfidf_2):
    print(scores.shape)

(450, 1)
(450, 1)
(50, 1)
(50, 1)


### NLP features

In [5]:
# Read the two feature tables from CSV (FEATURE_TRAIN_1 / FEATURE_TRAIN_2 are
# presumably provided by `from config import *` — verify) and split their rows
# with the same train/validation indices as the text triples.
nlp_feature_train_1, nlp_feature_train_2 = (
    pd.read_csv(csv_path).values
    for csv_path in (FEATURE_TRAIN_1, FEATURE_TRAIN_2)
)

train_1, valid_1 = nlp_feature_train_1[indices_train], nlp_feature_train_1[indices_valid]
train_2, valid_2 = nlp_feature_train_2[indices_train], nlp_feature_train_2[indices_valid]

for features in (train_1, train_2, valid_1, valid_2):
    print(features.shape)


(450, 16)
(450, 16)
(50, 16)
(50, 16)


In [6]:
# Append the matching TF-IDF score as one extra trailing column of each
# feature matrix.
new_train_1 = np.hstack((train_1, train_tfidf_1))
new_train_2 = np.hstack((train_2, train_tfidf_2))
new_valid_1 = np.hstack((valid_1, valid_tfidf_1))
new_valid_2 = np.hstack((valid_2, valid_tfidf_2))

In [7]:
# Report the shape of each combined feature matrix: training first, then validation.
for features in (new_train_1, new_train_2, new_valid_1, new_valid_2):
    print(features.shape)

(450, 17)
(450, 17)
(50, 17)
(50, 17)


### Feature normalization

In [9]:
# Normalize the features to zero mean / unit variance per column.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# Fit on the training rows only: including the validation rows would leak
# their statistics into preprocessing. The validation matrices are scaled
# with the training mean and variance.
ss.fit(np.vstack((new_train_1, new_train_2)))
new_train_1 = ss.transform(new_train_1)
new_train_2 = ss.transform(new_train_2)
new_valid_1 = ss.transform(new_valid_1)
new_valid_2 = ss.transform(new_valid_2)

In [11]:
# Confirm that normalization left the shape of every feature matrix unchanged.
for features in (new_train_1, new_train_2, new_valid_1, new_valid_2):
    print(features.shape)

(450, 17)
(450, 17)
(50, 17)
(50, 17)


In [13]:
# Shuffle the training rows. Both matrices are reordered with the same
# permutation, so row i of each still belongs to the same sample.
# The row count comes from the data instead of a hard-coded 450, and a
# dedicated seeded generator makes the permutation reproducible.
index = list(range(len(new_train_1)))
random.Random(1996).shuffle(index)
new_train_1 = new_train_1[index]
new_train_2 = new_train_2[index]

In [14]:
# Shuffle the validation rows. Both matrices are reordered with the same
# permutation, so row i of each still belongs to the same sample.
# The row count comes from the data instead of a hard-coded 50, and a
# dedicated seeded generator makes the permutation reproducible.
index = list(range(len(new_valid_1)))
random.Random(1996).shuffle(index)
new_valid_1 = new_valid_1[index]
new_valid_2 = new_valid_2[index]

In [15]:
import torch 
from torch.autograd import Variable 
import torch.nn.functional as F

# Network class: a one-hidden-layer encoder trained with a cosine-similarity margin loss.
class Net(torch.nn.Module): # inherits from torch.nn.Module
    """Encode three inputs with shared weights and return a triplet margin loss.

    Each input goes through the same Linear -> ReLU -> Linear encoder. The
    loss pushes the cosine similarity of (d1, d2) above that of (d1, d3) by
    at least ``margin``.

    Args:
        n_feature: width of each input row.
        n_hidden: number of hidden units.
        n_output: width of the encoding.
    """

    def __init__(self, n_feature = 17, n_hidden = 10, n_output = 5):
        super(Net, self).__init__() # run the parent class constructor
        # The layers are attributes so that torch registers their parameters.
        self.hidden = torch.nn.Linear(n_feature, n_hidden) # hidden layer
        self.predict = torch.nn.Linear(n_hidden, n_output) # output layer
        # Required gap between the positive and the negative similarity.
        self.margin = 0.05

    def doc_encoding(self, d):
        """Encode a batch of rows; returns a tensor of shape (batch, n_output)."""
        d = F.relu(self.hidden(d)) # ReLU on the hidden layer output
        return self.predict(d)

    # Forward pass: encode the three inputs and compute the margin loss.
    def forward(self,d1,d2,d3):
        """Return the mean margin loss for a batch of (d1, d2, d3) triples.

        Args:
            d1: anchor batch, shape (batch, n_feature).
            d2: batch that should be similar to d1.
            d3: batch that should be less similar to d1 than d2 is.

        Returns:
            A scalar tensor: mean over the batch of
            ``margin - cos(d1, d2) + cos(d1, d3)``, clamped below at 1e-6.
        """
        d1 = self.doc_encoding(d1)
        d2 = self.doc_encoding(d2)
        d3 = self.doc_encoding(d3)

        pos_sim = F.cosine_similarity(d1, d2)
        neg_sim = F.cosine_similarity(d1, d3)

        # Returned as a plain scalar tensor so callers can call .backward() on it.
        loss = (self.margin - pos_sim + neg_sim).clamp(min=1e-6).mean()
        return loss

In [None]:
# Build the network (17 input columns, 10 hidden units, 5 output dimensions)
# and a plain SGD optimizer over its parameters with learning rate 0.5.
net = Net(n_feature=17, n_hidden=10, n_output=5)
optimizer = torch.optim.SGD(params=net.parameters(), lr=0.5)

In [None]:
# Training loop: 300 steps of forward pass, zero_grad, backward, optimizer step.
# NOTE(review): as written this cell cannot run against the Net class defined
# in this notebook:
#   - Net.forward takes three inputs (d1, d2, d3) and computes the loss itself,
#     but here the network is called with a single `x`;
#   - `loss_function` is not defined in any visible cell;
#   - no visible cell prepares `x` / `y` as training tensors.
# TODO confirm which tensors are meant to be fed to the network.
for t in range(300): 
    prediction = net(x) # feed x to the network and take its output
    loss = loss_function(prediction, y) # error between the two; argument order matters
    optimizer.zero_grad() # clear the gradients left over from the previous step
    loss.backward() # back-propagate the error to compute the gradients
    optimizer.step() # apply the computed update to net.parameters()
    print('loss:', loss)