# 航空公司评价数据集

In [1]:
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np 
import pandas as pd 

In [65]:
data = pd.read_csv('Tweets.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [66]:
data = data[['airline_sentiment', 'text']]
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [67]:
# 统计标签数
data.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [68]:
data_p = data[data.airline_sentiment == 'positive']
data_n = data[data.airline_sentiment == 'negative']

In [69]:
# 平衡正负评价比例
data_n = data_n.iloc[:len(data_p)]
len(data_n), len(data_p)

(2363, 2363)

In [70]:
data = pd.concat([data_n, data_p])
data.head()

Unnamed: 0,airline_sentiment,text
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
15,negative,@VirginAmerica SFO-PDX schedule is still MIA.
17,negative,@VirginAmerica I flew from NYC to SFO last we...


In [71]:
# 乱序
data.sample(len(data))

Unnamed: 0,airline_sentiment,text
3245,negative,@united 50 minute wait...still at the gate wit...
3102,negative,@united still awaiting a reply from ur custome...
6820,positive,@JetBlue glad you like it. Feel free to steal it.
3660,negative,@united My feedback and concerns via your site...
5059,positive,@SouthwestAir great example of customer servic...
...,...,...
1394,negative,@united This is the 2nd time I was rebooked (w...
1591,negative,@united this airline is a joke my friends been...
3464,negative,@united please explain why I need to pay bag f...
475,positive,@VirginAmerica Dad on Segway is the best part ...


In [72]:
# 将评价数据转换成数值类型
data['review'] = (data.airline_sentiment == 'positive').astype(int)

In [73]:
data.head()

Unnamed: 0,airline_sentiment,text,review
3,negative,@VirginAmerica it's really aggressive to blast...,0
4,negative,@VirginAmerica and it's a really big bad thing...,0
5,negative,@VirginAmerica seriously would pay $30 a fligh...,0
15,negative,@VirginAmerica SFO-PDX schedule is still MIA.,0
17,negative,@VirginAmerica I flew from NYC to SFO last we...,0


In [74]:
del data['airline_sentiment']
data.head()

Unnamed: 0,text,review
3,@VirginAmerica it's really aggressive to blast...,0
4,@VirginAmerica and it's a really big bad thing...,0
5,@VirginAmerica seriously would pay $30 a fligh...,0
15,@VirginAmerica SFO-PDX schedule is still MIA.,0
17,@VirginAmerica I flew from NYC to SFO last we...,0


In [75]:
# 利用正则表达式去掉刚开始的特殊符号
import re

In [76]:
# 只保留这些字符，其他的都去掉
token = re.compile('[A-Za-z]+|[!?.,()]')

In [77]:
def reg_text(text):
    new_text = token.findall(text)
    new_text = [word.lower() for word in new_text]
    return new_text

In [78]:
data['text'] = data.text.apply(reg_text)

In [79]:
data.head()

Unnamed: 0,text,review
3,"[virginamerica, it, s, really, aggressive, to,...",0
4,"[virginamerica, and, it, s, a, really, big, ba...",0
5,"[virginamerica, seriously, would, pay, a, flig...",0
15,"[virginamerica, sfo, pdx, schedule, is, still,...",0
17,"[virginamerica, i, flew, from, nyc, to, sfo, l...",0


In [80]:
# 建立set数据结构，取出重复数据的将所有数据填充进去
word_set = set()
for text in data.text:
    for word in text:
        word_set.add(word)
len(word_set)

7100

In [81]:
# 给 dict 数据建立标签
word_list = list(word_set)

In [82]:
word_list.index('spending')

3132

In [83]:
# 单词填充的时候一般从0开始，所以单词编码一般从 1 开始
word_index = dict((word, word_list.index(word) + 1) for word in word_list)

In [84]:
# word_index

In [86]:
# 将每一个文本变成一个整数索引列表
data_ok = data.text.apply(lambda x: [word_index.get(word, 0) for word in x])
data_ok[:10]

3     [6764, 255, 5349, 3644, 5487, 2288, 6518, 6871...
4     [6764, 1837, 255, 5349, 508, 3644, 5630, 5504,...
5     [6764, 3918, 3723, 1409, 508, 1555, 4594, 3156...
15       [6764, 3972, 6034, 2412, 179, 6819, 390, 6485]
17    [6764, 709, 5751, 4714, 2039, 2288, 3972, 360,...
20    [6764, 4212, 2956, 2271, 5160, 4967, 6469, 166...
24    [6764, 4593, 4153, 2076, 6424, 3442, 3616, 648...
25    [6764, 458, 3373, 4379, 6485, 709, 4933, 1837,...
26    [6764, 4882, 6388, 4685, 4241, 4580, 4043, 428...
28    [6764, 2296, 2288, 6689, 5231, 5293, 2681, 553...
Name: text, dtype: object

In [87]:
len(data_ok.iloc[2]), len(data_ok.iloc[20])   # (25, 28)  评价的句子长度是不一样的，所以要进行填充
maxlen = max(len(x) for x in data_ok)   # 找到最大的句子长度

(25, 28)

In [34]:
# 最大单词数目 + 1 
max_word = len(word_set) + 1
max_word

7101

In [35]:
# 填充
data_ok = keras.preprocessing.sequence.pad_sequences(data_ok.values, maxlen=maxlen)
# 输入数据：batch，句子的长度(一个句子有多长)，句子维度(embedding)
data_ok.shape

(4726, 40)

In [36]:
# 标签数据
data.review.values

array([0, 0, 0, ..., 1, 1, 1])

# create Rnn

In [57]:
model = keras.Sequential()

In [58]:
# 把文本映射成一个密集向量，与one-hot编码相对应
# 参数：输入文本数，映射成的密集向量的长度(自定义)，输入文本长度
model.add(tf.keras.layers.Embedding(max_word, 50, input_length=maxlen))

In [59]:
model.add(layers.LSTM(64))
model.add(layers.Dense(1, activation='sigmoid'))

In [60]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 40, 50)            355050    
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                29440     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 384,555
Trainable params: 384,555
Non-trainable params: 0
_________________________________________________________________


In [61]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc']
)

In [62]:
model.fit(data_ok, data.review.values, epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c9236fa6c8>