In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import mean_squared_error as MSE

day = 'day'
product_id = 'product_id'
cat_name = 'cat_name'
leaf_cat_id = 'leaf_cat_id'
live_rate = 'live_rate'
product_sat_score = 'product_sat_score'
product_sat_count = 'product_sat_count'
amazing_day_rate = 'amazing_day_rate'
amazing_price_segment = 'amazing_price_segment'
amazing_discount = 'amazing_discount' 
amazing_order_limit = 'amazing_order_limit' 
amazing_proposed_stock = 'amazing_proposed_stock' 
promotion_day_rate = 'promotion_day_rate' 
promotion_price_segment = 'promotion_price_segment' 
promotion_discount = 'promotion_discount' 
promotion_order_limit = 'promotion_order_limit' 
promotion_proposed_stock = 'promotion_proposed_stock' 
comments_counts_7 = 'comments_counts_7' 
rec_comments_count_7 = 'rec_comments_count_7' 
dkp_amazing_notif_count_7 = 'dkp_amazing_notif_count_7' 
normal_order_limit = 'normal_order_limit' 
normal_price_segment = 'normal_price_segment' 
normal_discount = 'normal_discount' 
holiday = 'holiday' 
warehouse1_share = 'warehouse1_share' 
warehouse2_share = 'warehouse2_share' 
amazing_sold_count = 'amazing_sold_count' 
promotion_sold_count = 'promotion_sold_count' 
normal_sold_count = 'normal_sold_count'


trainAddress = "DIGIKALA_TRAIN_DATASET.csv"
testAddress = "DIGIKALA_TEST_DATASET.csv"

targets = [warehouse1_share,
                warehouse2_share,
                amazing_sold_count,
                promotion_sold_count,
                normal_sold_count]

AmazingList = [amazing_day_rate,
                            amazing_discount,
                            amazing_order_limit,
                            amazing_price_segment,
                            amazing_proposed_stock,
                            dkp_amazing_notif_count_7]

PromotionList = [promotion_day_rate,
                            promotion_discount,
                            promotion_order_limit,
                            promotion_price_segment,
                            promotion_proposed_stock]

NormalList = [normal_order_limit,
              normal_price_segment,
              normal_discount]

commonFeatures = [leaf_cat_id,
                            live_rate,
                            product_sat_score,
                            product_sat_count,
                            comments_counts_7,
                            rec_comments_count_7,
                            holiday,
                            'CB',
                            'CH',
                            'CP',
                            'FD',
                            'FF']

# Load Data

In [2]:
def loadData(address):
    df = pd.read_csv(address)
    return df

train = loadData(trainAddress)
test = loadData(testAddress)

def deletUnneededCol(data):
    del data['Unnamed: 0']
    del data[warehouse2_share]
    del data[product_id]
    return data

train = deletUnneededCol(train)
test = deletUnneededCol(test)

## Dealing with noises

In [3]:
def checkScories(data):
    data['product_sat_score'] = data['product_sat_score'].fillna(-1)
    data = data.drop(data[(data[product_sat_count] == 0) & (data[product_sat_score] != -1)].index)
    data = data.drop(data[(data[product_sat_count] != 0) & (data[product_sat_score] == -1)].index)
    return data

train = checkScories(train)
test = checkScories(test)

In [4]:
def amazing_promotionNoiseDetection(train, checkList):
    deletingIndex = []
    for index, row in train.iterrows():
        values = []
        for i in range(5):
            values.append(row[checkList[i]])
        isOneNan = False
        areAllNan = True
        for value in values:
            isNaN = np.isnan(value)
            isOneNan = isOneNan or isNaN
            areAllNan = areAllNan and isNaN
        isOneZero = 0 in values
        OneSitu = isOneZero or isOneNan
        areAllZero = all(v == 0 for v in values)
        AllSitu = areAllNan or areAllZero
        if (OneSitu) and (not AllSitu):
            deletingIndex.append(index)

    for noiseIndex in deletingIndex:
        train = train.drop(noiseIndex)
    
        
amazing_promotionNoiseDetection(train, PromotionList)
amazing_promotionNoiseDetection(train, AmazingList)

amazing_promotionNoiseDetection(test, PromotionList)
amazing_promotionNoiseDetection(test, AmazingList)

# Dealing with missing values

In [5]:
def handleMissingValues(data):
    ZeroToNanFeatures = [day, cat_name, leaf_cat_id, live_rate, normal_price_segment, holiday]
    for feature in ZeroToNanFeatures:
        data[feature] = data[feature].replace(0, np.nan)

    ZeroToNanFeaturesAFTER = [product_sat_score, comments_counts_7]    
    for feature in ZeroToNanFeaturesAFTER:
        data[feature] = data[feature].replace(0, -100)

    NantoZeroFeatures = [product_sat_score, comments_counts_7, rec_comments_count_7, dkp_amazing_notif_count_7]
    for feature in NantoZeroFeatures:
        data[feature] = data[feature].fillna(0)

    for feature in ZeroToNanFeaturesAFTER:
        data[feature] = data[feature].replace(-100, np.nan)

    featureNaNReplace = [leaf_cat_id, live_rate, 
                            product_sat_count, product_sat_score,
                            comments_counts_7, rec_comments_count_7,
                            dkp_amazing_notif_count_7, holiday]

    for feature in featureNaNReplace:
        newVal = data[feature].median()
        data[feature] = data[feature].fillna(newVal)

    cat_name_mode = data[cat_name].mode()
    data[cat_name] = data[cat_name].fillna(cat_name_mode)
    return data

train = handleMissingValues(train)
test = handleMissingValues(test)

# Categorical variables to numerical

In [6]:
def fixCatName(data):
    dummy = pd.get_dummies(data[cat_name])
    data = pd.concat([data, dummy], axis = 1)
    del data[cat_name]
    return data
    
train = fixCatName(train)
test = fixCatName(test)

In [7]:
def fixHoliday(data):
    data[holiday] = data[holiday].replace(1, 3)
    data[holiday] = data[holiday].replace(0.2, 2)
    data[holiday] = data[holiday].replace(0.1, 1)
    return data

train = fixHoliday(train)
test = fixHoliday(test)

#              Features normalization

In [8]:
def scaleData(df, columns):
    for column in columns:
        scaler = RobustScaler()
        unscaledData = np.array(df[column]).reshape(-1, 1)
        scaledData = scaler.fit_transform(unscaledData)
        df[column] = scaledData
        
features = list(train.columns.values)
featuresSet = set(features)
targetsSet = set(targets)
features = list(featuresSet - targetsSet)
features.remove(day)

scaleData(train, features)
scaleData(test, features)

# Features selection

### Split Data to 3 Datas
#### 1. Normal
#### 2. Amazing
#### 3. Promotion

In [9]:
from sklearn import decomposition

PCAcomponents = 4

def PCAFeature(train):
    features = list(train.columns.values)
    featuresSet = set(features)
    targetsSet = set(targets)
    features = list(featuresSet - targetsSet)
    X = train[features].copy()
    pca = decomposition.PCA(n_components=PCAcomponents)
    pca.fit(X)
    X = pca.transform(X)

    return X

In [10]:
def NormalPartData(train):
    trainNormal = train.copy()
    for feature in AmazingList:
        del trainNormal[feature]
    for feature in PromotionList:
        del trainNormal[feature]
    del trainNormal[promotion_sold_count]
    del trainNormal[amazing_sold_count]
    trainNormal = trainNormal[trainNormal[normal_order_limit].notna()]
    trainNormal.reset_index(inplace=True)
    del trainNormal['index']
    X_Normal = PCAFeature(trainNormal)
    X_Normal = pd.DataFrame(X_Normal)
    trainNormal = pd.concat([trainNormal, X_Normal], axis=1)
    for feature in commonFeatures:
        del trainNormal[feature]
    for feature in NormalList:
        del trainNormal[feature]
    return trainNormal

trainNormal = NormalPartData(train)
testNormal = NormalPartData(test)

In [11]:
def PromotionPartData(train):
    trainPromotion = train.copy()
    for feature in AmazingList:
        del trainPromotion[feature]
    for feature in NormalList:
        del trainPromotion[feature]
    del trainPromotion[normal_sold_count]
    del trainPromotion[amazing_sold_count]
    # trainPromotion = trainPromotion[trainPromotion[promotion_day_rate].notna()]
    # trainPromotion = trainPromotion[trainPromotion[promotion_order_limit].notna()]
    trainPromotion = trainPromotion.dropna()
    trainPromotion.reset_index(inplace=True)
    del trainPromotion['index']
    X_Promotion = PCAFeature(trainPromotion)
    X_Promotion = pd.DataFrame(X_Promotion)
    trainPromotion = pd.concat([trainPromotion, X_Promotion], axis=1)
    for feature in commonFeatures:
        del trainPromotion[feature]
    for feature in PromotionList:
        del trainPromotion[feature]
    return trainPromotion
        
trainPromotion = PromotionPartData(train)
testPromotion = PromotionPartData(test)

In [12]:
def AmazingPartData(train):
    trainAmazing = train.copy()
    for feature in PromotionList:
        del trainAmazing[feature]
    for feature in NormalList:
        del trainAmazing[feature]
    del trainAmazing[normal_sold_count]
    del trainAmazing[promotion_sold_count]
    # trainAmazing = trainAmazing[trainAmazing[amazing_day_rate].notna()]
    # trainAmazing = trainAmazing[trainAmazing[amazing_order_limit].notna()]
    trainAmazing = trainAmazing.dropna()
    trainAmazing.reset_index(inplace=True)
    del trainAmazing['index']
    X_Amazing = PCAFeature(trainAmazing)
    X_Amazing = pd.DataFrame(X_Amazing)
    trainAmazing = pd.concat([trainAmazing, X_Amazing], axis=1)
    for feature in commonFeatures:
        del trainAmazing[feature]
    for feature in AmazingList:
        del trainAmazing[feature]
    return trainAmazing

trainAmazing = AmazingPartData(train)
testAmazing = AmazingPartData(test)

# Building the Model

In [13]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
# from sklearn.metrics import mean_absolute_percentage_error as MAPE

In [14]:
def splitData(train, targetNames):
    attributesNames = [0, 1, 2, 3, day]
    testSize = 0.25
    xTrain, xTest, yTrain, yTest = train_test_split(train[attributesNames], train[targetNames], test_size=testSize)
    return xTrain, xTest, yTrain, yTest

In [15]:
def TrainAndTest(targetName, train, saleType):
    targets = [warehouse1_share, targetName]
    xTrain, xTest, yTrain, yTest = splitData(train, targets)

    estimator = KNeighborsRegressor(n_neighbors= 12)
    estimator.fit(xTrain, yTrain)
    yPred = estimator.predict(xTest)
    mse = MSE(yTest, yPred)
    mae = MAE(yTest, yPred)
    print('MSE for predection on', saleType,':' ,mse)
    print('MAE for predection on', saleType,':' ,mae)

In [17]:
TrainAndTest(normal_sold_count, trainNormal, 'normal')

MSE for predection on normal : 81.45992969482306
MAE for predection on normal : 2.763539131579966


In [18]:
TrainAndTest(amazing_sold_count, trainAmazing, 'Amazing')

MSE for predection on Amazing : 35155.50725276049
MAE for predection on Amazing : 68.22476759908074


In [19]:
TrainAndTest(promotion_sold_count, trainPromotion, 'Promotion')

MSE for predection on Promotion : 211.64899974686674
MAE for predection on Promotion : 4.505521957319434


## Predicting Test

In [39]:
targetNames = [warehouse1_share, normal_sold_count]
attributesNames = [0, 1, 2, 3, day]
                                                    
xTrain = trainNormal[attributesNames]
xTest = testNormal[attributesNames]
yTrain = trainNormal[targetNames]
yTest = testNormal[targetNames]    

estimator = KNeighborsRegressor(n_neighbors= 12)
estimator.fit(xTrain, yTrain)
yPred = estimator.predict(xTest)
yPredDFNormal = pd.DataFrame(yPred, columns=[warehouse1_share, normal_sold_count])

In [40]:
yPredDFNormal

Unnamed: 0,warehouse1_share,normal_sold_count
0,0.078273,31.833333
1,0.214519,13.333333
2,0.125000,5.166667
3,0.333333,2.333333
4,0.392858,8.750000
...,...,...
29979,0.620876,15.750000
29980,0.394444,16.583333
29981,0.861111,2.250000
29982,0.663194,4.833333


In [42]:
targetNames = [warehouse1_share, promotion_sold_count]
attributesNames = [0, 1, 2, 3, day]
                                                    
xTrain = trainPromotion[attributesNames]
xTest = testPromotion[attributesNames]
yTrain = trainPromotion[targetNames]
yTest = testPromotion[targetNames]    

estimator = KNeighborsRegressor(n_neighbors= 12)
estimator.fit(xTrain, yTrain)
yPred = estimator.predict(xTest)
yPredDFPromotion = pd.DataFrame(yPred, columns=[warehouse1_share, promotion_sold_count])

In [43]:
yPredDFPromotion

Unnamed: 0,warehouse1_share,promotion_sold_count
0,0.225000,4.416667
1,0.291667,4.166667
2,0.087962,5.166667
3,0.291667,4.166667
4,0.160462,8.500000
...,...,...
13664,0.291666,2.833333
13665,0.809058,14.000000
13666,0.625000,1.916667
13667,0.666733,8.583333


In [44]:
targetNames = [warehouse1_share, amazing_sold_count]
attributesNames = [0, 1, 2, 3, day]
                                                    
xTrain = trainAmazing[attributesNames]
xTest = testAmazing[attributesNames]
yTrain = trainAmazing[targetNames]
yTest = testAmazing[targetNames]    

estimator = KNeighborsRegressor(n_neighbors= 12)
estimator.fit(xTrain, yTrain)
yPred = estimator.predict(xTest)
yPredDFAmazing = pd.DataFrame(yPred, columns=[warehouse1_share, amazing_sold_count])

In [45]:
yPredDFAmazing

Unnamed: 0,warehouse1_share,amazing_sold_count
0,0.399697,38.416667
1,0.318329,180.833333
2,0.251067,204.000000
3,0.101986,282.416667
4,0.206189,239.916667
...,...,...
2055,0.273180,190.416667
2056,0.261997,344.333333
2057,0.336390,138.250000
2058,0.401797,349.666667
