# Code required for CRISP-DM

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from matplotlib import font_manager, rc

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.svm import SVR
from sklearn.pipeline import Pipeline

import joblib

In [None]:
def load_data(filepath='data/201901-202003.csv'):
    '''Loads the raw data from a CSV file and returns it as a dataframe.

    Args:
        filepath: path of the CSV file to read. Defaults to the location
            that used to be hard-coded, so calls without arguments behave
            as before.

    Returns:
        pandas.DataFrame holding the contents of the CSV file.
    '''
    
    return pd.read_csv(filepath)

In [None]:
def create_year(col):
    '''Extracts the year part of a YYYYMM value.

    The value is converted to text and its first four characters are
    returned as an int.
    '''
    
    return int(str(col)[:4])

def create_month(col):
    '''Extracts the month part of a YYYYMM value.

    The value is converted to text and everything after its first four
    characters is returned as an int.
    '''
    
    return int(str(col)[4:])

In [None]:
def clean_dataframe(df):
    '''Returns a new dataframe with year and month columns instead of REG_YYMM.

    The input dataframe is not modified: the year and month are derived from
    REG_YYMM and attached to a copy from which REG_YYMM has been removed.

    Args:
        df: dataframe with a REG_YYMM column holding values of the form YYYYMM.

    Returns:
        A dataframe without REG_YYMM and with integer `year` and `month`
        columns appended after the remaining columns.
    '''
    
    # Work on the text form so that the first four characters are the year
    # and the rest is the month, whatever the stored type of REG_YYMM is.
    yyyymm = df['REG_YYMM'].astype(str)
    
    return df.drop(columns=['REG_YYMM']).assign(
        year=yyyymm.str[:4].astype('int64'),
        month=yyyymm.str[4:].astype('int64'),
    )

In [None]:
def plot_distribution(df, font_path="c:/Windows/Fonts/malgun.ttf"):
    '''Plots 4 distributions from the dataframe: CARD_SIDO_NM, STD_CLSS_NM, AGE, FLC

    Each panel is a bar chart of the value counts of one column, laid out on a
    2x2 grid in the order listed above.

    Args:
        df: dataframe containing the four columns named above.
        font_path: path of a font file able to render Korean labels. Defaults
            to the Windows "Malgun Gothic" font that used to be hard-coded.
            When the file does not exist the current matplotlib font is kept
            and a warning is issued instead of failing.
    '''
    import os
    import warnings
    
    # Make Korean language work in plots
    plt.rcParams['axes.unicode_minus'] = False
    if os.path.exists(font_path):
        font_name = font_manager.FontProperties(fname=font_path).get_name()
        rc('font', family=font_name)
    else:
        warnings.warn("Font file '{}' not found; Korean labels may not "
                      "render correctly.".format(font_path))
    
    cols = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'FLC']
    
    fig, axes = plt.subplots(2, 2, figsize=(20, 40))
    for ax, col in zip(axes.flat, cols):
        # Count once per column and reuse the result for labels and heights
        counts = df[col].value_counts()
        ax.bar(counts.index, counts.values)
        ax.set_title(col)
    
    # Only the STD_CLSS_NM panel gets vertical tick labels, as before
    axes.flat[1].tick_params(axis='x', labelrotation=90)
    plt.show()

In [None]:
def prepare_data(df):
    '''Returns dataframe aggregated over the grouping columns, keeping only numeric sums.

    Rows are grouped by CARD_SIDO_NM, STD_CLSS_NM, HOM_SIDO_NM, AGE,
    SEX_CTGO_CD, FLC, year and month, and the remaining numeric columns are
    summed per group. Non-numeric columns that are not grouping keys are
    left out of the result.

    Args:
        df: dataframe containing the grouping columns listed above.

    Returns:
        A dataframe with one row per group: the grouping columns followed by
        the summed numeric columns.
    '''
    
    cols = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD',
            'FLC', 'year', 'month']
    
    # numeric_only=True drops the leftover text columns explicitly; recent
    # pandas versions no longer discard them silently when summing.
    df = df.groupby(cols).sum(numeric_only=True).reset_index(drop=False)
    
    return df

In [None]:
def encode_label(df):
    '''Label-encodes every object-typed column of the dataframe in place.

    One LabelEncoder is fitted per object column first; afterwards each of
    those columns is overwritten with its encoded values.

    Returns:
        A tuple of the (modified) dataframe and a dict mapping each encoded
        column name to its fitted LabelEncoder.
    '''
    
    column_types = df.dtypes
    
    # Phase 1: fit an encoder for each column whose dtype is 'object'
    label_encoders = {
        name: LabelEncoder().fit(df[name])
        for name in df.columns
        if str(column_types[name]) == 'object'
    }
    
    # Phase 2: replace the original values with their encoded labels
    for name, fitted in label_encoders.items():
        df[name] = fitted.transform(df[name])
        
    return df, label_encoders

In [None]:
def engineer_features(df):
    '''Separates the model inputs from the target variable.

    Returns:
        A tuple (features, target): features is the dataframe without the
        AMT and CNT columns, target is the natural logarithm of AMT.
    '''
    
    log_amount = df['AMT'].map(lambda amount: np.log(amount))
    predictors = df.drop(columns=['AMT', 'CNT'])
    return predictors, log_amount

In [None]:
def plot_flc_graph(df):
    '''Plots the total purchase volume (AMT) per FLC value on a log10 scale.

    The per-FLC sums are printed before the bar chart is drawn.

    Args:
        df: dataframe with FLC and AMT columns.
    '''
    
    # Select AMT before aggregating so that only that column is summed
    flc_sum_amt = df.groupby('FLC')['AMT'].sum()

    print(flc_sum_amt)
    plt.bar(flc_sum_amt.index, np.log10(flc_sum_amt.values))
    plt.xlabel('FLC')
    plt.ylabel('log10(sum of AMT)')
    plt.show()

In [None]:
def plot_clss_graph(df):
    '''Plots the total purchase volume (AMT) per industry domain on a log10 scale.

    Industry domains (STD_CLSS_NM) are ordered from the largest to the
    smallest total. The sums are printed before the bar chart is drawn.

    Args:
        df: dataframe with STD_CLSS_NM and AMT columns.
    '''
    
    # Select AMT before aggregating so that only that column is summed
    clss_sum_amt = df.groupby('STD_CLSS_NM')['AMT'].sum() \
                     .sort_values(ascending=False)

    print(clss_sum_amt)
    plt.figure(figsize=(20,10))
    plt.bar(clss_sum_amt.index, np.log10(clss_sum_amt.values))
    plt.xticks(rotation='vertical')
    plt.xlabel('STD_CLSS_NM')
    plt.ylabel('log10(sum of AMT)')
    plt.show()

In [None]:
def plot_month_graph(df):
    '''Plots the median purchase volume (AMT) for each month on a log10 scale.

    The per-month medians are printed before the bar chart is drawn.

    Args:
        df: dataframe with month and AMT columns.
    '''
    
    # Select AMT before aggregating: taking the median of every column
    # fails on text columns in recent pandas versions.
    month_median_amt = df.groupby('month')['AMT'].median()
    
    print(month_median_amt)
    plt.bar(month_median_amt.index, np.log10(month_median_amt.values))
    plt.xticks(month_median_amt.index)
    plt.xlabel('month')
    plt.ylabel('log10(median of AMT)')
    plt.show()

In [None]:
def svr_pipeline(features, target):
    '''Grid-searches an SVR on a train split and reports its test-set MSE and R^2.

    The data is split with test_size=0.33 and random_state=8. GridSearchCV
    (imported from sklearn.model_selection in the imports cell) then searches
    over the kernel, C and tol values listed below.

    Args:
        features: feature matrix passed to train_test_split.
        target: target values passed to train_test_split.

    Returns:
        The fitted GridSearchCV object.
    '''
    x_train, x_test, y_train, y_test = train_test_split(features, target,
                                                        test_size =0.33, random_state=8)
    
    # Candidate hyper-parameters; every combination is evaluated
    parameters = {'kernel':('linear', 'rbf'),
                  'C':[1, 10],
                  'tol':[0.0001, 0.001, 0.01]}
    
    svr = SVR(cache_size=8192)
    
    reg = GridSearchCV(svr, parameters)
    reg.fit(x_train, y_train)
    
    y_pred = reg.predict(x_test)
    print('Calculated MSE is {}'.format(mean_squared_error(y_test, y_pred)))
    
    r2 = reg.score(x_test, y_test)
    print('Calculated R^2 is {}'.format(r2))
    
    return reg

In [None]:
def split_model_evaluate(features, target):
    '''Fits a random forest on a train split and reports its test-set MSE and R^2.

    The data is split with test_size=0.33 and random_state=8, and a
    RandomForestRegressor(random_state=0) is trained on the training part.

    Returns:
        The fitted RandomForestRegressor.
    '''
    
    # Hold out a third of the rows for evaluation
    train_x, test_x, train_y, test_y = train_test_split(
        features, target, test_size=0.33, random_state=8)
    
    # Train
    forest = RandomForestRegressor(random_state=0).fit(train_x, train_y)
    
    # Report the error on the held-out rows
    mse = mean_squared_error(test_y, forest.predict(test_x))
    print('Calculated MSE is {}'.format(mse))
    
    print('Calculated R^2 is {}'.format(forest.score(test_x, test_y)))
    
    return forest

In [None]:
def plot_feature_importance(features, model):
    '''Plots the feature importances of the fitted model and prints them.

    Args:
        features: dataframe whose column names label the bars.
        model: fitted estimator exposing `feature_importances_`.
    '''
    
    plt.bar(features.columns, model.feature_importances_)
    plt.xticks(rotation='vertical')
    plt.ylabel('feature importance')
    # Call show() so the chart is rendered before the values are printed
    plt.show()
    print(model.feature_importances_)

In [None]:
def save_model(model, filepath):
    '''Writes the model to `filepath` with joblib and prints a confirmation'''
    
    joblib.dump(model, filepath)
    
    confirmation = "{} is saved.".format(model)
    print(confirmation)

# CRISP-DM

In this notebook, most of the results are handled roughly. For further information, please see the following blog post.   https://medium.com/@lim.andrew1/trying-to-open-business-in-korea-here-is-the-market-research-ee03d616970b

### Business Understanding

**Q1. How much is the purchase volume across customers' Family Life Cycle (FLC), in the given period (2019.01.~2020.03.)?**

**Q2. How much is the purchase volume across the industry sectors?**

**Q3. How much is the median purchase volume in each month, and does the purchase volume increase in the peak season?**

### Data Understanding

In [None]:
# Load the raw CSV through load_data() and bind it to `df`
df = load_data()

In [None]:
# Count the null values in each column of the dataset
df.isnull().sum()

In [None]:
# Show the rows flagged by df.duplicated(); an empty result means no duplicated rows
df[df.duplicated()]

The dataset has no duplicated rows. There are some null values in CARD_CCG_NM and HOM_CCG_NM, but this is not a hurdle because these columns are unnecessary for the current research.

### Data Wrangling

In [None]:
# Preview the first rows of the dataset
df.head()

In [None]:
# Split REG_YYMM into year/month columns. `df` is rebound to the result, which no
# longer has REG_YYMM, so re-running this cell without reloading raises a KeyError.
df = clean_dataframe(df)

For convenience in handling the time-frame data, the REG_YYMM column is separated into year and month columns, and the original REG_YYMM column is then dropped. Missing values aren't treated yet, but will be treated after plotting graphs.

### Data Analysis with Visualization

For the interpretation of visualizations and the analysis, please move to the blog post.  
https://medium.com/@lim.andrew1/trying-to-open-business-in-korea-here-is-the-market-research-ee03d616970b

In [None]:
# Bar charts of the value counts of CARD_SIDO_NM, STD_CLSS_NM, AGE and FLC
plot_distribution(df)

In [None]:
# Q1: log10 of the total AMT for each FLC value
plot_flc_graph(df)

In [None]:
# Q2: log10 of the total AMT for each STD_CLSS_NM, largest total first
plot_clss_graph(df)

In [None]:
# Q3: log10 of the median AMT for each month
plot_month_graph(df)

In [None]:
# Correlation heatmap of the numeric columns. The text columns are excluded
# explicitly because recent pandas versions raise when corr() meets them.
plt.figure(figsize=(15, 15))
sns.heatmap(data=df.select_dtypes(include='number').corr(), annot=True,
            fmt= '.2f', linewidths=.5, cmap='Blues');

### Prepare Data

In [None]:
# Aggregate by the grouping columns, then label-encode the object columns.
# `df` is rebound to the encoded frame; the fitted encoders are kept per column.
df, label_encoders = encode_label(prepare_data(df))

In [None]:
# features: every column except AMT and CNT; target: natural log of AMT
features, target = engineer_features(df)

In [None]:
# Preview the first rows of the feature matrix
features.head()

### Data Modeling & Evaluating the Results

**Random Forest**

In [None]:
# Fit a RandomForestRegressor on the train split and print test-set MSE and R^2
model = split_model_evaluate(features, target)

In [None]:
# Bar chart (and printout) of the fitted model's feature_importances_
plot_feature_importance(features, model)

In [None]:
# Persist the random-forest model to model.pkl with joblib
save_model(model, "model.pkl")

**SVM Regression**

In [None]:
# Grid-search an SVR over kernel, C and tol, then print test-set MSE and R^2
reg = svr_pipeline(features, target)

In [None]:
# Persist the fitted grid-search object to reg.pkl with joblib
save_model(reg, "reg.pkl")

### Deploy

The results of the research are shared in a blog post. The link is given below.  
https://medium.com/@lim.andrew1/trying-to-open-business-in-korea-here-is-the-market-research-ee03d616970b