In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('churn.csv')
df.drop(index=0, inplace=True)
print(df.shape)

df.head()

(3810, 9)


Unnamed: 0,GroupId,Province,Material Name,month,cur_volume,next_month,next_volume,volume_diff,Churn
1,ERW_A1DS2023040002,Bangkok,M SUP AIO PROTECTION TBS 5W-30 4X7L/CAR,5,12544,9.0,8120.0,-4424.0,Yes
2,ERW_A1DS2023040002,Bangkok,M SUP AIO PROTECTION TBS 5W-30 4X7L/CAR,9,8120,11.0,5040.0,-3080.0,Yes
3,ERW_A1DS2023040002,Bangkok,M SUP AIO Protection 5W-30 4x4+1L/CAR,5,7160,10.0,4580.0,-2580.0,Yes
4,ERW_A1DS2023040005,Udon Thani,M DEL LEGEND CH-4 15W-40 H.D. CTN4X7L:TH,11,16436,12.0,5908.0,-10528.0,Yes
5,ERW_A1DS2023040005,Udon Thani,"M DEL LEGEND CH-4 15W-40 H.D.,1X12L CTN",6,7800,10.0,10348.0,2548.0,No


In [3]:
from unidecode import unidecode

for ind in df.index:
    mat = df.loc[ind, 'Material Name']
    df.loc[ind, 'Material Name'] = unidecode(mat)

In [4]:
df['oil_type'] = pd.NA

# Keywords for Product with qty >= 500
# ATF, DELVAC, MOBILUBE, SUPER, MOBIL 1
for mat in ['ATF', 'DELVAC', 'MOBILUBE', 'MOBIL 1']:
    df.loc[df['Material Name'].str.contains(mat), 'oil_type'] = mat
    
# special condition
df.loc[df['Material Name'].str.contains('MOBIL SUPER'), 'oil_type'] = 'SUPER'
df.loc[df['Material Name'].str.contains('M SUP'), 'oil_type'] = 'SUPER'
df.loc[df['Material Name'].str.contains('M-Super'), 'oil_type'] = 'SUPER'

df.loc[df['Material Name'].str.contains('DEL'), 'oil_type'] = 'DELVAC'
df.loc[df['Material Name'].str.contains('M-Del'), 'oil_type'] = 'DELVAC'

df.loc[df['Material Name'].str.contains('M1'), 'oil_type'] = 'MOBIL 1'

# cleansing for product with qty < 100
df.fillna('Other', inplace=True)

In [5]:
pd.crosstab(df['oil_type'], columns='N', normalize=True).sort_values('N', ascending=False)

col_0,N
oil_type,Unnamed: 1_level_1
SUPER,0.345932
DELVAC,0.345407
MOBILUBE,0.138058
ATF,0.068766
MOBIL 1,0.066929
Other,0.034908


In [6]:
df.isnull().any()

GroupId          False
Province         False
Material Name    False
month            False
cur_volume       False
next_month       False
next_volume      False
volume_diff      False
Churn            False
oil_type         False
dtype: bool

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

In [8]:
sales = df[ ['Province', 'Material Name', 'month', 'cur_volume', 'next_month', 'next_volume']]
sales.head()

Unnamed: 0,Province,Material Name,month,cur_volume,next_month,next_volume
1,Bangkok,M SUP AIO PROTECTION TBS 5W-30 4X7L/CAR,5,12544,9.0,8120.0
2,Bangkok,M SUP AIO PROTECTION TBS 5W-30 4X7L/CAR,9,8120,11.0,5040.0
3,Bangkok,M SUP AIO Protection 5W-30 4x4+1L/CAR,5,7160,10.0,4580.0
4,Udon Thani,M DEL LEGEND CH-4 15W-40 H.D. CTN4X7L:TH,11,16436,12.0,5908.0
5,Udon Thani,"M DEL LEGEND CH-4 15W-40 H.D.,1X12L CTN",6,7800,10.0,10348.0


In [9]:
pd.options.mode.copy_on_write = True

sales['next_month'] = sales['next_month'].convert_dtypes('int')
sales['next_volume'] = sales['next_volume'].convert_dtypes('int')

In [10]:
le = LabelEncoder()
le.fit_transform(sales['Material Name'])

sales.loc[:, 'Material Name'] = le.fit_transform(sales['Material Name'])
sales.loc[:, 'Province'] = le.fit_transform(sales['Province'])

sales['Province'] = sales['Province'].astype('int')
sales['Material Name'] = sales['Material Name'].astype('int')

sales.head()

Unnamed: 0,Province,Material Name,month,cur_volume,next_month,next_volume
1,1,16,5,12544,9,8120
2,1,16,9,8120,11,5040
3,1,18,5,7160,10,4580
4,59,4,11,16436,12,5908
5,59,6,6,7800,10,10348


In [11]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3810 entries, 1 to 3810
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Province       3810 non-null   int32
 1   Material Name  3810 non-null   int32
 2   month          3810 non-null   int64
 3   cur_volume     3810 non-null   int64
 4   next_month     3810 non-null   Int64
 5   next_volume    3810 non-null   Int64
dtypes: Int64(2), int32(2), int64(2)
memory usage: 156.4 KB


## Time-series Forecast

In [12]:
X_train, X_test, y_train, y_test = train_test_split(sales.drop(columns=['next_month', 'next_volume']),
                                                    sales['next_month'],
                                                    test_size=0.2)
print(X_train.shape, y_train.shape)

(3048, 4) (3048,)


In [13]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [14]:
predict = pd.DataFrame(y_pred.round(0), columns=['month_pred'])
pre_cat = pd.concat([X_test, y_test], axis=1)
result = pd.concat([pre_cat.reset_index(drop=True), predict], axis=1)

## Churn prediction

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [16]:
churn = pd.concat([sales, df['Churn']], axis=1)
churn.head()

Unnamed: 0,Province,Material Name,month,cur_volume,next_month,next_volume,Churn
1,1,16,5,12544,9,8120,Yes
2,1,16,9,8120,11,5040,Yes
3,1,18,5,7160,10,4580,Yes
4,59,4,11,16436,12,5908,Yes
5,59,6,6,7800,10,10348,No


Feature Engineering

In [17]:
churn.loc[:, 'Churn'] = churn['Churn'].map({'Yes': 1, 'No': 0})
churn['Churn'] = churn['Churn'].astype('int')
churn.head()

Unnamed: 0,Province,Material Name,month,cur_volume,next_month,next_volume,Churn
1,1,16,5,12544,9,8120,1
2,1,16,9,8120,11,5040,1
3,1,18,5,7160,10,4580,1
4,59,4,11,16436,12,5908,1
5,59,6,6,7800,10,10348,0


In [18]:
cur_vol = churn['cur_volume']
churn.loc[:, 'cur_volume'] = (cur_vol - cur_vol.mean()) / cur_vol.std()

next_vol = churn['next_volume']
churn.loc[:, 'next_volume'] = (next_vol - next_vol.mean()) / next_vol.std()

churn.head()

Unnamed: 0,Province,Material Name,month,cur_volume,next_month,next_volume,Churn
1,1,16,5,1.389148,9,0.420011,1
2,1,16,9,0.437082,11,-0.245232,1
3,1,18,5,0.230486,10,-0.344587,1
4,59,4,11,2.226725,12,-0.057754,1
5,59,6,6,0.368217,10,0.901233,0


In [19]:
X_train, X_test, y_train, y_test = train_test_split(churn.drop(columns='Churn'), churn['Churn'],
                                                    test_size=0.2)
print(X_train.shape, y_train.shape)

(3048, 6) (3048,)


In [20]:
pd.crosstab(y_train, 'N', normalize=True)

col_0,N
Churn,Unnamed: 1_level_1
0,0.371063
1,0.628937


### Machine Learning

In [21]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89       286
           1       0.94      0.92      0.93       476

    accuracy                           0.92       762
   macro avg       0.91      0.91      0.91       762
weighted avg       0.92      0.92      0.92       762



In [22]:
pd.DataFrame({
    'column': X_train.columns,
    'Feature_Importance': model.feature_importances_
}).sort_values('Feature_Importance', ascending=False)

Unnamed: 0,column,Feature_Importance
5,next_volume,0.359651
3,cur_volume,0.28148
2,month,0.171314
4,next_month,0.141318
1,Material Name,0.03213
0,Province,0.014108


In [23]:
from sklearn import tree

tree.export_graphviz(model, out_file='churn_mobil.dot',
                    feature_names=X_train.columns,
                    class_names=churn['Churn'].map({1: 'Yes', 0: 'No'}).unique(),
                     label='all',
                    rounded=True,
                    filled=True)

Random Forest

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

print(classification_report(y_test, rf.predict(X_test)))

In [None]:
pd.DataFrame({
    'Column': X_train.columns,
    'Features_importance': rf.feature_importances_
})

XGBoost

In [None]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

print(classification_report(y_test, xgb.predict(X_test)))

In [None]:
pd.DataFrame({
    'Column': X_train.columns,
    'Features_importance': rf.feature_importances_
}).sort_values('Features_importance', ascending=False)