# 折扣店销售

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

plt.style.use('seaborn-v0_8')

path = r'C:\Users\user\Desktop\折扣店销售数据.csv'
df = pd.read_csv(path, encoding='gbk')

df.head()

Unnamed: 0,商品编号,商品重量,是否低脂肪,货架上此商品总展示区域占比,商品类型,商品最高售价,折扣店编号,折扣店开办时间,折扣店规模,折扣店所在城市类型,折扣店类型,商品销售量
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   商品编号           8523 non-null   object 
 1   商品重量           7060 non-null   float64
 2   是否低脂肪          8523 non-null   object 
 3   货架上此商品总展示区域占比  8523 non-null   float64
 4   商品类型           8523 non-null   object 
 5   商品最高售价         8523 non-null   float64
 6   折扣店编号          8523 non-null   object 
 7   折扣店开办时间        8523 non-null   int64  
 8   折扣店规模          6113 non-null   object 
 9   折扣店所在城市类型      8523 non-null   object 
 10  折扣店类型          8523 non-null   object 
 11  商品销售量          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


# 数据预处理

In [3]:
# 去除不需要的特征
df.drop(['商品编号', '折扣店编号', '折扣店开办时间'], axis=1, inplace=True)

# 去除缺失的数据
df.dropna(subset=['折扣店规模'], inplace=True)


# 对缺失值用0填充
def fill_weight(df):
    return 0


df['商品重量'] = df[['商品重量']].apply(fill_weight, axis=1)

df['商品重量'] = df['商品重量'].astype('float64')
train_df = pd.get_dummies(df, drop_first=True)

# 模型训练

In [4]:
X = train_df.drop('商品销售量', axis=1)
y = train_df['商品销售量']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

In [5]:
LR = LinearRegression()
LR.fit(X_train, y_train)
preds = LR.predict(X_test)

In [6]:
def metrics(y_test, preds):
    rmse = mean_squared_error(y_test, preds, squared=False)
    r2 = r2_score(y_test, preds)

    print("RMSE:", round(rmse))
    print("R2 Score:", round(r2, 2))


metrics(y_test, preds)  # 输出指标

RMSE: 1171
R2 Score: 0.57


# 结论
---
## 结果
- RMSE:1171
- R2:0.57
## 模型提升
- 对于缺失值可以采用其他方式进行填充，而不是直接用0
- 可以和其他模型对比分析