# Project 1: 使用Numpy进行文本的四分类问题

## 步骤1: 实现数据读取

### 导入pandas读取

In [3]:
import pandas as pd 

### 读入数据

In [4]:
train_data = pd.read_csv("./train.tsv",header=0,delimiter="\t")
test_data = pd.read_csv("./test.tsv",header=0,delimiter="\t")

### 数据内容

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


### 数据标题

In [6]:
train_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


### 计算数据比例

In [7]:
print(train_data.Sentiment.value_counts()/train_data.Sentiment.count())

2    0.509945
3    0.210989
1    0.174760
4    0.058990
0    0.045316
Name: Sentiment, dtype: float64


In [8]:
X = train_data['Phrase']
Y = train_data['Sentiment']
test_X = test_data['Phrase']
data_train = list(X)
label_train = list(Y)

In [9]:
len(data_train)

156060

### 实现分词

In [10]:
## 实现文本的分词
def get_word(text):
    return [word.lower() for word in text.split(' ')]

def get_whole_word(data):
    return[get_word(text) for text in data]

word_list = get_whole_word(data_train)
print(word_list[0])

['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.']


### 实现词袋(N-gram)
实现二元语法。

In [11]:
def get_2gram(word_list):
    word_bag = []
    for words in word_list:
        if len(words)==1:
            word_bag.append(words)
        else:
            ngram = [(a+' '+b) for a,b in zip(words[:-1],words[1:])]
            word_bag.append(ngram)
    return word_bag
word_bag = get_2gram(word_list)
print(word_bag[0])

['a series', 'series of', 'of escapades', 'escapades demonstrating', 'demonstrating the', 'the adage', 'adage that', 'that what', 'what is', 'is good', 'good for', 'for the', 'the goose', 'goose is', 'is also', 'also good', 'good for', 'for the', 'the gander', 'gander ,', ', some', 'some of', 'of which', 'which occasionally', 'occasionally amuses', 'amuses but', 'but none', 'none of', 'of which', 'which amounts', 'amounts to', 'to much', 'much of', 'of a', 'a story', 'story .']


### 实现可迭代的对象
`idx_to_char`：一个列表，实现的是从迭代对象中抽取词袋作为一个总列表。

`char_to_idx`：一个字典，实现的是词袋和出现的位置的对应（标签号）.

In [12]:
import collections
def get_vocab(word_bag):
    counter = collections.Counter([x for sublist in word_bag for x in sublist])
    to_char = [item[0] for item in counter.items()]
    to_idx = dict([(char,idx)for idx,char in enumerate(to_char)])
    return to_char,to_idx
idx_to_char,char_to_idx = get_vocab(word_bag)
print(len(idx_to_char))
print(char_to_idx['jumbled fantasy'])

100664
83497


### 实现数据集的切割
按照`测试集`和`训练集`为1:6的比例进行拆分。

In [13]:
import random # 实现随机打乱原始的dict
original_dict = list(zip(data_train,label_train))
random.shuffle(original_dict)
data_train[:],label_train[:] = zip(*original_dict)
len_train = int(len(original_dict) * 0.83)
train_phrase = data_train[:len_train]
train_label = label_train[:len_train]
test_phrase = data_train[len_train:]
test_label = label_train[len_train:]
print(len(train_phrase),len(train_label),len(test_phrase),len(test_label))

129529 129529 26531 26531


### 实现测试集和训练集的ngram词袋

In [14]:
train_2gram = get_2gram(get_whole_word(train_phrase))
test_2gram =  get_2gram(get_whole_word(test_phrase))

In [15]:
def sentence2idx(sentence,idx_list):
    try:
        return[idx_list[token] for token in sentence]
    except (KeyError,TypeError):
        # print(sentence)
        count = 0

### 加载数据

In [55]:
def load_data(train_phrase,train_label,batch_size):
    final_data = []
    batch_num = len(train_phrase)//batch_size
    for i in range(batch_num):
        batch_phrase = train_phrase[max(0,i*batch_size):min((i+1)*batch_size,len(train_phrase))]
        batch_label = train_label[max(0,i*batch_size):min((i+1)*batch_size,len(train_phrase))]
        ngram = []
        for sentence in batch_phrase:
            ngram.append(sentence2idx(sentence,char_to_idx))
        final_data.append((ngram,batch_label))
    return final_data

In [58]:
batch_size = 16
train_iter = load_data(train_2gram,train_label,batch_size)
test_iter = load_data(test_2gram,test_label,batch_size)

In [59]:
for x,y in train_iter:
    print(x)
    break

[[76575], [85321, 22589, 10453, 30372, 85322, 85323], [11720, 11721, 11722, 11723, 1465, 11724, 11725, 11563, 11726, 11727, 10866, 11728, 11729, 11730, 11731, 11732, 11733, 11734], [1467, 1986, 6410, 77163, 77164, 1497, 35714, 35715, 1467, 4000, 1442, 36873, 25313, 32607, 77165], [21186], [12753, 78711, 13594, 16390, 970, 971, 3446, 1843, 1863, 65514, 43679, 72102, 27332, 78712, 78713], [7228], [30, 79055, 89745, 89746, 89747, 89748], [38813, 4147, 70977, 70978, 70979, 2813, 70980], [14939, 14940, 14941], [27620, 27621, 27622, 27623, 27624, 643, 27625, 27626, 211, 24242, 2194, 27627, 27628, 27629, 21536, 12434, 27630, 27631, 27632, 27633], [259, 86083], [81938], [3691, 59029, 42118], [81349, 81350, 81351, 81352], [81706]]


## 建模

In [60]:
import numpy as np 
from tqdm import tqdm 
import time

### Softmax实现

In [61]:
def softmax(x_input):
    x_input_exp = np.exp(x_input)
    partion = np.sum(x_input_exp,axis=1,keepdims=True)
    return x_input_exp/partion

### 小批量初始化

In [62]:
def feature(x):
    batch_size=len(x)
    feature_size=len(idx_to_char)
    inputs=np.zeros((batch_size,feature_size))
    for b,i in enumerate(x):
        for idx in i:
            inputs[b][idx]=1
    return inputs

### 反向传播

In [63]:
def backward(x,probability,y):
    probability[range(probability.shape[0]), y]-=1
    dw=x.T.dot(probability)/batch_size #feature_size*n_class
    db=np.sum(probability,axis=0)/batch_size #n_class
    return dw,db

### 评估函数

In [64]:
def evaluate(test_iter,W,b):
    right=0.0
    n=0.0
    for x,y in test_iter:
        n+=batch_size
        x=feature(x)
        probability=softmax(np.matmul(x,W)+b)
        right+=np.sum(np.argmax(probability,axis=1)==y)
    return right/n

### 训练函数

In [65]:
def train(train_data,test_data,lr,num_epoch,W,b,batch_size):
    for epoch in range(num_epoch):
        l_sum,start,n=0.0,time.time(),0
        train_iter=iter(train_data)
        test_iter=iter(test_data)
        for x,y in tqdm(train_iter):
            x=feature(x) #[batch_size,feature]
            probability=softmax(np.matmul(x,W)+b) #[batch_size,n_class]
            loss= np.sum(-np.log(probability[range(probability.shape[0]), y]))
            grad_w,grad_b=backward(x,probability,y)
            #print(x.shape,probability.shape,loss.shape,grad_w.shape,grad_b.shape)
            #print(grad_w,grad_b)
            W=W-lr*grad_w
            b=b-lr*grad_b
            l_sum+=loss
            n+=1
            #print(loss)
        print("epoch %d ,loss %.3f ,test_acc %.2f,time %.2f"%(epoch+1,l_sum/n,evaluate(test_iter,W,b),time.time()-start))
    return W,b

## 训练

In [66]:
batch_size=64
train_iter=load_data(train_2gram,train_label,batch_size)
test_iter=load_data(test_2gram,test_label,batch_size)
feature_size=len(idx_to_char)
n_class=5
W=np.random.normal(0,0.01,(feature_size,n_class))
b=np.zeros(n_class)
lr,num_epoch=0.01,10
print(train_iter[0])

([[76575], [85321, 22589, 10453, 30372, 85322, 85323], [11720, 11721, 11722, 11723, 1465, 11724, 11725, 11563, 11726, 11727, 10866, 11728, 11729, 11730, 11731, 11732, 11733, 11734], [1467, 1986, 6410, 77163, 77164, 1497, 35714, 35715, 1467, 4000, 1442, 36873, 25313, 32607, 77165], [21186], [12753, 78711, 13594, 16390, 970, 971, 3446, 1843, 1863, 65514, 43679, 72102, 27332, 78712, 78713], [7228], [30, 79055, 89745, 89746, 89747, 89748], [38813, 4147, 70977, 70978, 70979, 2813, 70980], [14939, 14940, 14941], [27620, 27621, 27622, 27623, 27624, 643, 27625, 27626, 211, 24242, 2194, 27627, 27628, 27629, 21536, 12434, 27630, 27631, 27632, 27633], [259, 86083], [81938], [3691, 59029, 42118], [81349, 81350, 81351, 81352], [81706], [70052, 4713, 15543, 13348, 70053, 70054], [54065, 11166, 43292], [713, 18276], [86273, 86274], [316, 20494, 27212], [85448], [47854, 2965, 58441, 5646, 2251, 58442, 58443, 58444, 58445, 58446, 58447, 58448, 511, 9312, 58449, 58450, 58451, 58452, 58453, 58454], [1673

In [67]:
train(train_iter,test_iter,lr,num_epoch,W,b,batch_size)

2023it [00:38, 52.36it/s]


epoch 1 ,loss 84.909 ,test_acc 0.51,time 42.48


2023it [00:38, 52.04it/s]


epoch 2 ,loss 81.760 ,test_acc 0.51,time 42.71


2023it [00:38, 52.26it/s]


epoch 3 ,loss 81.317 ,test_acc 0.51,time 42.58


2023it [00:40, 50.15it/s]


epoch 4 ,loss 80.964 ,test_acc 0.51,time 44.20


2023it [00:38, 52.37it/s]


epoch 5 ,loss 80.649 ,test_acc 0.52,time 42.49


2023it [00:38, 52.09it/s]


epoch 6 ,loss 80.362 ,test_acc 0.52,time 42.65


2023it [00:39, 51.01it/s]


epoch 7 ,loss 80.095 ,test_acc 0.52,time 43.49


2023it [00:38, 52.15it/s]


epoch 8 ,loss 79.846 ,test_acc 0.52,time 42.63


2023it [00:38, 52.02it/s]


epoch 9 ,loss 79.610 ,test_acc 0.52,time 42.72


2023it [00:39, 51.84it/s]


epoch 10 ,loss 79.386 ,test_acc 0.52,time 42.89


(array([[ 0.00481624, -0.01876387,  0.00425946, -0.0020165 ,  0.00392172],
        [ 0.01699159,  0.00447008, -0.00694894,  0.0004948 , -0.00068289],
        [ 0.01479213, -0.02134954, -0.02270377, -0.01470759,  0.00015227],
        ...,
        [-0.01166653,  0.0151503 , -0.01169591, -0.02102606,  0.00221335],
        [-0.0101952 ,  0.00664214,  0.00657163, -0.00285486, -0.00502911],
        [ 0.01231343,  0.01513464, -0.01066177, -0.00928344, -0.01183595]]),
 array([-1.13175303,  0.19544835,  1.42866685,  0.39233987, -0.88470205]))