In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans

In [2]:
# Load the dataset from the CSV file that sits next to the notebook.
data = pd.read_csv('./data2.csv')
# Preview the raw price column; it is read as text (object dtype) because
# some entries are ranges such as '45-90' rather than a single number.
data['price']

0        138
1      45-90
2        188
3         98
4       29.9
       ...  
132     11.9
133     28.9
134     35.5
135     29.5
136     32.5
Name: price, Length: 137, dtype: object

In [3]:
# Dimensions of the loaded table as (rows, columns).
data.shape

(137, 7)

In [4]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             137 non-null    int64 
 1   title          137 non-null    object
 2   weight         137 non-null    int64 
 3   price          137 non-null    object
 4   month_num      137 non-null    int64 
 5   pinglun_num    137 non-null    int64 
 6   shouchang_num  137 non-null    int64 
dtypes: int64(5), object(2)
memory usage: 7.6+ KB


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,id,weight,month_num,pinglun_num,shouchang_num
count,137.0,137.0,137.0,137.0,137.0
mean,69.0,400.218978,37375.562044,238097.3,209046.6
std,39.692569,453.197216,61195.192477,283803.8,419857.9
min,1.0,36.0,631.0,2997.0,1946.0
25%,35.0,154.0,8955.0,43927.0,30088.0
50%,69.0,212.0,15608.0,128489.0,77252.0
75%,103.0,520.0,44588.0,274148.0,189590.0
max,137.0,2764.0,444591.0,1092616.0,3217214.0


In [6]:
# Number of missing values in each column.
data.isna().sum()

id               0
title            0
weight           0
price            0
month_num        0
pinglun_num      0
shouchang_num    0
dtype: int64

In [26]:
def _parse_price_range(raw_price):
    """Return ``(low, high)`` for a price given as one number or a '-'-separated range.

    A single value such as ``'29.9'`` yields ``(29.9, 29.9)``; a range such as
    ``'45-90'`` yields ``(45.0, 90.0)``. The bounds are the smallest and largest
    segment, so a reversed range still produces ``low <= high``.

    Raises:
        ValueError: if any segment cannot be converted to ``float``.
    """
    # str() lets the helper accept values pandas already parsed as numbers,
    # which have no .split() method.
    bounds = [float(segment) for segment in str(raw_price).split('-')]
    return min(bounds), max(bounds)


# Derive numeric lower/upper price bounds from the textual price column.
price_bounds = [_parse_price_range(raw_price) for raw_price in data['price']]
min_price = [low for low, _ in price_bounds]
max_price = [high for _, high in price_bounds]
data['max_price'] = max_price
data['min_price'] = min_price
data

Unnamed: 0,id,title,weight,price,month_num,pinglun_num,shouchang_num,max_price,min_price
0,1,预售【巨型萌宠大礼包/1804g】休闲零食网红食品吃货,1804,138,42802,831591,563166,138.0,138.0
1,2,满减【夏威夷果160g】干货零食坚果干果散装袋装奶油味,160,45-90,55719,676830,363020,90.0,45.0
2,3,【巨型零食大礼包/30袋装】休闲零食网红食品吃货送女友,2764,188,51386,388037,805650,188.0,188.0
3,4,【坚果大礼包1463g/8件】零食端午礼盒每日坚果混合送礼,1463,98,69195,1019082,2490660,98.0,98.0
4,5,【乳酸菌小伴侣520g/整箱】营养早餐蛋糕面包代餐点心,520,29.9,46429,864396,640866,29.9,29.9
...,...,...,...,...,...,...,...,...,...
132,133,【兰花豆205g】休闲零食干果坚果炒货蚕豆豌豆牛肉味,205,11.9,14991,108127,66192,11.9,11.9
133,134,【猪肉脯210g】麻辣零食风干熟食小吃肉脯肉干小包装,120,28.9,10702,9554,6388,28.9,28.9
134,135,满减【奶香华夫饼248g】休闲零食面包蛋糕早餐代餐网红,248,35.5,15181,19822,6524,35.5,35.5
135,136,满减【夹心海苔36g】即食海味儿童零食紫菜芝麻/巴旦木,36,29.5,3494,107607,30088,29.5,29.5


In [8]:
# Sanity check: is any parsed lower price bound zero or negative?
data['min_price'].le(0).any()

False

In [9]:
# Sanity check: is any month_num value zero or negative?
data['month_num'].le(0).any()

False

In [10]:
# Sanity check: is any title blank? The title column holds strings, so a
# comparison against the integer 0 can never match; test for empty or
# whitespace-only text instead.
data['title'].str.strip().eq('').any()

False

In [11]:
# Preview the raw titles: an optional leading tag, a bracketed 【...】 section,
# then free text after the closing bracket.
data['title']

0        预售【巨型萌宠大礼包/1804g】休闲零食网红食品吃货
1        满减【夏威夷果160g】干货零食坚果干果散装袋装奶油味
2        【巨型零食大礼包/30袋装】休闲零食网红食品吃货送女友
3      【坚果大礼包1463g/8件】零食端午礼盒每日坚果混合送礼
4        【乳酸菌小伴侣520g/整箱】营养早餐蛋糕面包代餐点心
                   ...              
132       【兰花豆205g】休闲零食干果坚果炒货蚕豆豌豆牛肉味
133       【猪肉脯210g】麻辣零食风干熟食小吃肉脯肉干小包装
134      满减【奶香华夫饼248g】休闲零食面包蛋糕早餐代餐网红
135      满减【夹心海苔36g】即食海味儿童零食紫菜芝麻/巴旦木
136      满减【Q弹鸡蛋干240g】休闲零食蛋制品类豆干类豆制品
Name: title, Length: 137, dtype: object

In [35]:
def _split_title(title):
    """Split a title shaped like ``tag【desc】sub_title`` into its three parts.

    Returns:
        A ``(tag, desc, sub_title)`` tuple of strings.
        * ``tag`` is the text before the first '【', with one trailing '_'
          removed if present; it is '' when the title starts with '【'.
        * ``desc`` is the text between the first '【' and the next '】'.
        * ``sub_title`` is everything after that '】'.

    Titles that do not follow the pattern are handled explicitly instead of
    being sliced with a -1 "not found" index:
        * no '【' at all -> ``('', '', title)``
        * '【' without a closing '】' -> the rest of the title becomes ``desc``
          and ``sub_title`` is ''.
    """
    prefix, opener, remainder = title.partition('【')
    if not opener:
        # No bracketed section: keep the whole text as the sub-title.
        return '', '', title
    if prefix.endswith('_'):
        # Some tags are joined to the bracket with an underscore, e.g. '推荐_【...'.
        prefix = prefix[:-1]
    inner, _closer, trailing = remainder.partition('】')
    # If '】' is missing, partition leaves `trailing` empty and `inner` holds
    # everything after the opening bracket.
    return prefix, inner, trailing


# Break every title into tag / bracketed description / trailing sub-title.
title_parts = [_split_title(title) for title in data['title']]
tag = [parts[0] for parts in title_parts]
desc = [parts[1] for parts in title_parts]
sub_title = [parts[2] for parts in title_parts]
data['tag'] = tag
data['desc'] = desc
data['sub_title'] = sub_title
data

Unnamed: 0,id,title,weight,price,month_num,pinglun_num,shouchang_num,max_price,min_price,tag,desc,sub_title
0,1,预售【巨型萌宠大礼包/1804g】休闲零食网红食品吃货,1804,138,42802,831591,563166,138.0,138.0,预售,巨型萌宠大礼包/1804g,休闲零食网红食品吃货
1,2,满减【夏威夷果160g】干货零食坚果干果散装袋装奶油味,160,45-90,55719,676830,363020,90.0,45.0,满减,夏威夷果160g,干货零食坚果干果散装袋装奶油味
2,3,【巨型零食大礼包/30袋装】休闲零食网红食品吃货送女友,2764,188,51386,388037,805650,188.0,188.0,,巨型零食大礼包/30袋装,休闲零食网红食品吃货送女友
3,4,【坚果大礼包1463g/8件】零食端午礼盒每日坚果混合送礼,1463,98,69195,1019082,2490660,98.0,98.0,,坚果大礼包1463g/8件,零食端午礼盒每日坚果混合送礼
4,5,【乳酸菌小伴侣520g/整箱】营养早餐蛋糕面包代餐点心,520,29.9,46429,864396,640866,29.9,29.9,,乳酸菌小伴侣520g/整箱,营养早餐蛋糕面包代餐点心
...,...,...,...,...,...,...,...,...,...,...,...,...
132,133,【兰花豆205g】休闲零食干果坚果炒货蚕豆豌豆牛肉味,205,11.9,14991,108127,66192,11.9,11.9,,兰花豆205g,休闲零食干果坚果炒货蚕豆豌豆牛肉味
133,134,【猪肉脯210g】麻辣零食风干熟食小吃肉脯肉干小包装,120,28.9,10702,9554,6388,28.9,28.9,,猪肉脯210g,麻辣零食风干熟食小吃肉脯肉干小包装
134,135,满减【奶香华夫饼248g】休闲零食面包蛋糕早餐代餐网红,248,35.5,15181,19822,6524,35.5,35.5,满减,奶香华夫饼248g,休闲零食面包蛋糕早餐代餐网红
135,136,满减【夹心海苔36g】即食海味儿童零食紫菜芝麻/巴旦木,36,29.5,3494,107607,30088,29.5,29.5,满减,夹心海苔36g,即食海味儿童零食紫菜芝麻/巴旦木


In [39]:
# The ten rows with the largest month_num, highest first.
data.sort_values(by='month_num', ascending=False).iloc[:10]

Unnamed: 0,id,title,weight,price,month_num,pinglun_num,shouchang_num,max_price,min_price,tag,desc,sub_title
49,50,【手撕面包1kg】全麦蛋糕吐司早餐代餐食品整箱糕点零食,1000,29.9,444591,1063381,1613308,29.9,29.9,,手撕面包1kg,全麦蛋糕吐司早餐代餐食品整箱糕点零食
50,51,【售罄_零食大礼包】休闲食品零食小吃抖音爆款,581,44.9,320393,678440,232994,44.9,44.9,,售罄_零食大礼包,休闲食品零食小吃抖音爆款
8,9,【每日坚果750g/30包】零食吃货大礼包干果混合孕妇礼盒,750,138,298445,1037960,1533564,138.0,138.0,,每日坚果750g/30包,零食吃货大礼包干果混合孕妇礼盒
6,7,【麻辣零食大礼包】网红卤味鸭脖小零食充饥夜宵整箱,390,39.8,267903,1062649,656568,39.8,39.8,,麻辣零食大礼包,网红卤味鸭脖小零食充饥夜宵整箱
25,26,推荐_【岩烧乳酪吐司520g/整箱】面包早餐零食营养食品,520,29.9,189138,251573,263496,29.9,29.9,推荐,岩烧乳酪吐司520g/整箱,面包早餐零食营养食品
16,17,满减【猪肉脯100g】麻辣零食熟食风干靖江特产小吃肉干,100,29-45.9,143241,803331,298054,45.9,29.0,满减,猪肉脯100g,麻辣零食熟食风干靖江特产小吃肉干
47,48,【零食大礼包】休闲零食网红吃货食品饼干箱装吃货充饥,927,59.9,123044,1092616,3217214,59.9,59.9,,零食大礼包,休闲零食网红吃货食品饼干箱装吃货充饥
20,21,满减【蜀香牛肉】休闲麻辣零食小吃肉脯特产牛肉干美食,100,38.9,115698,522270,200836,38.9,38.9,满减,蜀香牛肉,休闲麻辣零食小吃肉脯特产牛肉干美食
83,84,推荐_【BIG大辣片230gx1袋】网红老式大辣条童年零食,230,11.9,110289,138235,77252,11.9,11.9,推荐,BIG大辣片230gx1袋,网红老式大辣条童年零食
12,13,【芒果干116gx3】零食小吃蜜饯水果干网红休闲食品果脯,348,29.9,101583,1020113,1043230,29.9,29.9,,芒果干116gx3,零食小吃蜜饯水果干网红休闲食品果脯


In [40]:
# Number of rows that exactly repeat an earlier row (all columns compared).
data.duplicated().sum()

0

In [43]:
# Frequency of each distinct value, most frequent first.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated since pandas 2.1.
# NOTE: max_price / min_price are rebound here from the lists built in the
# price-parsing cell to Series of counts.
max_price = data['max_price'].value_counts()
min_price = data['min_price'].value_counts()
month_num = data['month_num'].value_counts()