In [1]:
# xml.etree.cElementTree was removed in Python 3.9; ElementTree exposes the same
# API under the same alias and uses the C accelerator automatically.
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

# XML 解析

## restaurants

In [8]:
# Parse the restaurants training XML and keep its root element for the cells below.
path = 'data/restaurants-train.xml'
tree = ET.parse(path)
root = tree.getroot()
# Show the root tag as a quick sanity check that the expected document was loaded.
root.tag

'sentences'

In [9]:
# Tokenize the text of the first child of the first sentence element as a smoke test.
first_text = root[0][0].text
token = word_tokenize(first_text)
token

['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.']

In [10]:
from nltk.corpus import stopwords

# Tokens to discard: punctuation marks plus a few stray suffix/markup fragments.
stop_words = ['!', ',', '.', '?', '-s', '-ly', '</s>', 's', '(', ')', ' ']
# Apply the filter to the sample tokens from the previous cell to check the result.
train = [tok for tok in token if tok not in stop_words]
train

['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us']

In [11]:
# Flatten the XML into one (text, category, polarity) record per aspect-category
# annotation; a sentence with several annotations yields several records.
SEED = 42  # fixed so the shuffle (and the later train/test split) is reproducible

data = []
for sentence in root.findall('sentence'):
    text = sentence.find('text').text
    aspectCategories = sentence.find('aspectCategories')
    if aspectCategories is None:
        # No annotation container on this sentence: nothing to emit for it.
        continue
    for aspectCategory in aspectCategories.findall('aspectCategory'):
        category = aspectCategory.get('category')
        polarity = aspectCategory.get('polarity')
        data.append((text, category, polarity))

# Shuffle the records along the first axis; the result is a 2-D NumPy array
# with one row per record.
data = np.random.RandomState(SEED).permutation(data)

In [12]:
# Build the labelled frame from the shuffled records.
df = pd.DataFrame(data, columns=['text', 'aspect', 'polarity'])
# Keep only the three sentiment classes; any other label is dropped.
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the original (avoids SettingWithCopyWarning).
df = df[df['polarity'].isin(['positive', 'negative', 'neutral'])].copy()
# Replace the slash in the category name with a space.
df['aspect'] = df['aspect'].replace('anecdotes/miscellaneous', 'anecdotes miscellaneous')
# Encode the polarity labels as integers.
df['polarity'] = df['polarity'].map(
    {'positive': 1, 'neutral': 0, 'negative': -1})
# Last expression of the cell, so the per-aspect counts are actually displayed.
df.groupby('aspect')['aspect'].count()

In [13]:
# Persist the labelled frame as a space-separated file without the index column,
# then display the frame.
df.to_csv('data/data.csv', sep=' ', index=False)
df

Unnamed: 0,text,aspect,polarity
0,This is the kind of place you'd like to take a...,anecdotes miscellaneous,1
1,"Food was okay, nothing great.",food,0
2,"Fabulous service, fantastic food, and a chille...",food,1
3,I highly recommend the Sophia pizza.,anecdotes miscellaneous,1
4,All conveniently delivered right to the door.,service,1
...,...,...,...
3709,Just because it's cheap does NOT mean the port...,price,1
3710,"Everything is delicious, though, my gf and I c...",food,1
3711,Decent wine at reasonable prices.,price,1
3712,Any if you have a reservation you'll wait for ...,service,1


In [126]:
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [127]:
# Reload the space-separated file written by the earlier df.to_csv('data/data.csv', sep=' ') cell.
dataset = pd.read_csv('data/data.csv', sep=' ')

In [128]:
# Display the reloaded frame to confirm the columns were read back as expected.
dataset

Unnamed: 0,text,aspect,polarity
0,Out of the hundreds of Italian restaurants in ...,anecdotes miscellaneous,1
1,I go twice a month!,anecdotes miscellaneous,0
2,My fiance took me to Scopa last week for my bi...,anecdotes miscellaneous,0
3,Incredible food at a very agreable price bring...,price,1
4,When you're sitting in their main dining room ...,ambience,1
...,...,...,...
3513,"The red curry is weak and tasteless, the pad t...",food,-1
3514,We usually just get some of the dinner special...,food,1
3515,"We recently decided to try this location, and ...",ambience,1
3516,"The Thali was small, thoroughly unremarkable, ...",price,-1


In [129]:
# Expand English contractions.
# Replacements are plain substring substitutions applied in this order on the
# lower-cased text (e.g. "she's" is already rewritten by the "he's" rule).
CONTRACTIONS = [
    ("it's", "it is"), ("i'm", "i am"), ("he's", "he is"), ("she's", "she is"),
    ("we're", "we are"), ("they're", "they are"), ("you're", "you are"), ("that's", "that is"),
    ("this's", "this is"), ("can't", "can not"), ("don't", "do not"), ("doesn't", "does not"),
    # "i've" previously expanded to " i have" with a stray leading space.
    ("we've", "we have"), ("i've", "i have"), ("isn't", "is not"), ("won't", "will not"),
    ("hasn't", "has not"), ("wasn't", "was not"), ("weren't", "were not"), ("let's", "let us"),
    ("couldn't", "could not"),
]

text_abbreviation = []
for item in dataset['text'].tolist():
    item = item.lower()
    for short_form, long_form in CONTRACTIONS:
        item = item.replace(short_form, long_form)
    text_abbreviation.append(item)

In [130]:
# Remove punctuation, digits and every other non-letter character, then collapse
# the resulting runs of whitespace into single spaces.
non_letters = re.compile("[^a-zA-Z]")
text_clear = [
    ' '.join(non_letters.sub(" ", sentence).split())
    for sentence in text_abbreviation
]

In [131]:
# Tokenize each cleaned sentence, drop the tokens listed in `stop_words`, and
# re-join the rest with single spaces. No lemmatization or stemming is applied.
processed_text = [
    ' '.join(tok for tok in word_tokenize(sentence) if tok not in stop_words)
    for sentence in text_clear
]

In [145]:
# Pair the processed sentences with the aspect and polarity columns of the
# reloaded dataset.
processed_columns = {
    'text': processed_text,
    'aspect': dataset['aspect'],
    'polarity': dataset['polarity'],
}
df = pd.DataFrame(processed_columns)
df.head()

Unnamed: 0,text,aspect,polarity
0,out of the hundreds of italian restaurants in ...,anecdotes miscellaneous,1
1,i go twice a month,anecdotes miscellaneous,0
2,my fiance took me to scopa last week for my bi...,anecdotes miscellaneous,0
3,incredible food at a very agreable price bring...,price,1
4,when you are sitting in their main dining room...,ambience,1


In [148]:
# Add a sequential integer id as the first column.
# Guarded because DataFrame.insert raises ValueError when the column already
# exists, which would happen if this cell is run a second time.
if 'id' not in df.columns:
    df.insert(0, 'id', list(range(df.shape[0])))
df.head()

Unnamed: 0,id,text,aspect,polarity
0,0,out of the hundreds of italian restaurants in ...,anecdotes miscellaneous,1
1,1,i go twice a month,anecdotes miscellaneous,0
2,2,my fiance took me to scopa last week for my bi...,anecdotes miscellaneous,0
3,3,incredible food at a very agreable price bring...,price,1
4,4,when you are sitting in their main dining room...,ambience,1


In [149]:
# 70/30 positional train/test split of the processed frame.
# The cut point is derived from len(df), the frame actually being split; the
# earlier code used len(data), the raw record array built before filtering.
split_at = int(0.7 * len(df))
train = df.iloc[:split_at]
test = df.iloc[split_at:]
train.to_csv('data/train_laptop.tsv', sep='\t', index=False)
# NOTE(review): 'test_lapop.tsv' looks like a typo for 'test_laptop.tsv', and the
# source file is the restaurants set. Name kept so anything that already reads
# this path keeps working; confirm before renaming.
test.to_csv('data/test_lapop.tsv', sep='\t', index=False)

In [150]:
# Write the processed sentences to disk, one per row, with no header and no index.
corpus = pd.DataFrame(processed_text)
corpus.to_csv('data/corpus.csv', header=False, index=False)

# 词向量

In [None]:
from gensim.models import word2vec 

In [None]:
# Iterate over the corpus file written above and train a Word2Vec model on it.
sentences = word2vec.LineSentence('data/corpus.csv')
# min_count=1 is a provisional choice (original note: "to be considered").
model = word2vec.Word2Vec(sentences=sentences, min_count=1)

In [None]:
# Spot-check the trained vectors by printing the similarity of a few word pairs.
pairs = [('food', 'exceptional'), ('fair', 'kitchen')]
for left, right in pairs:
    score = model.wv.similarity(left, right)
    print('%r\t%r\t%.2f' % (left, right, score))

In [None]:
# Export the trained word vectors in word2vec format; binary=False selects the text variant.
model.wv.save_word2vec_format('data/myvector.vector', binary=False)

1 数据处理
打乱 删除conflict 属性
