In [12]:
import pandas as pd
import os

# Build ../data/coffee.csv from the raw review dump, unless it has already been
# generated. Reading the cached CSV is done by the following cell.
if not os.path.isfile('../data/coffee.csv'):
    # Load the raw reviews (JSON-lines file) from the ignore folder.
    # Author's note from the original run: 114386 rows in total.
    df = pd.read_json('../ignore/Grocery_and_Gourmet_Food_5.json', lines=True)

    # Keep reviews whose body or title mentions "coffee" (case-insensitive).
    # na=False treats a missing reviewText/summary as "no match"; without it the
    # mask contains NaN and boolean indexing raises.
    mentions_coffee = (
        df['reviewText'].str.contains('coffee', case=False, na=False)
        | df['summary'].str.contains('coffee', case=False, na=False)
    )
    # Author's notes from the original run: 99183 matching rows, 9527 distinct asin values.
    df_coffee = df[mentions_coffee]

    # Persist only the columns used downstream: reviewText / summary / overall /
    # reviewTime / reviewerName / asin.
    # The file is written to ../data so that the existence check above and the
    # later pd.read_csv('../data/coffee.csv') calls actually find it.
    os.makedirs('../data', exist_ok=True)
    df_coffee[['reviewText', 'summary', 'overall', 'reviewTime', 'reviewerName', 'asin']].to_csv(
        '../data/coffee.csv', index=False
    )

In [14]:
# Load the filtered coffee reviews from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='../data/coffee.csv')
df.head(n=5)

Unnamed: 0,reviewText,summary,overall,reviewTime,reviewerName,asin
0,Best tea for my single cup coffee maker. I pur...,Best tea for my single cup coffee maker,5,"08 28, 2016",cowgirl,4639725043
1,This tea looks like coffee grounds. Brewed it ...,does not look anything like tea.,1,"03 11, 2016",shopAholicr,4639725043
2,Half yellow label and half black tea brewed in...,... black tea brewed in a French coffee pot wo...,5,"08 2, 2015",Thomas B. miller,4639725043
3,The best coffee ever I love it,So good,5,"10 1, 2016",juanita m.,5463213682
4,been using this coffee for years and love the ...,Five Stars,5,"08 9, 2016",GRE,5463213682


In [38]:
# 首先尝试有监督学习，通过reviewText/summary预测overall
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

# 下载stopwords资源
# nltk.download('stopwords')

# Load the filtered coffee reviews.
df = pd.read_csv('../data/coffee.csv')
# Work on the first 5000 reviews only. .copy() makes this an independent frame,
# so the column assignments below do not write into a slice of the loaded data.
df = df.head(5000).copy()

# --- Text preprocessing ---
# Merge review body and title into one document per row. Missing values are
# replaced with '' first: NaN + str yields NaN, which a later astype(str) would
# turn into the literal token "nan".
df['review'] = df['reviewText'].fillna('').astype(str) + ' ' + df['summary'].fillna('').astype(str)
df['review'] = df['review'].str.lower()  # lower-case everything
# Strip punctuation. regex=True must be explicit: from pandas 2.0 the default is
# regex=False, which would search for the literal text "[^\w\s]" and remove nothing.
df['review'] = df['review'].str.replace(r'[^\w\s]', '', regex=True)

# Remove English stop words. The list is loaded once and stored as a set;
# calling stopwords.words('english') per word re-reads the corpus every time
# and makes each membership test a linear scan.
english_stopwords = set(stopwords.words('english'))
df['review'] = df['review'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in english_stopwords)
)

# Reduce each remaining word to its Porter stem.
ps = PorterStemmer()
df['review'] = df['review'].apply(lambda text: ' '.join(ps.stem(word) for word in text.split()))

# --- Bag-of-words features ---
# Vocabulary is capped at 10000 terms; the matrix is densified with .toarray().
cv = CountVectorizer(max_features = 10000)
X = cv.fit_transform(df['review']).toarray()
# Star rating is the label; selected by name so the column order of the CSV
# does not matter.
y = df['overall'].values

print(X.shape)
print(y.shape)
print(X[0])
print(y)

# Hold out 20% of the rows for testing; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

(5000, 10000)
(5000,)
[0 0 0 ... 0 0 0]
[5 1 5 ... 3 5 4]


In [39]:
# Baseline: Gaussian Naive Bayes fitted on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
classifier = GaussianNB().fit(X_train, y_train)

# Predict ratings for the held-out rows.
y_pred = classifier.predict(X_test)

# Report the share of exactly matching ratings.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.492
