1). Import Library

In [11]:
import pandas as pd #dataframe library
import numpy as np #numeric library
import datetime as dt

#plot library
import matplotlib.pyplot as plt 
import seaborn as sns 

#evaluation library
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report,recall_score,precision_score

#label encoder library
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# for min_max scaling (Kaggle)
from mlxtend.preprocessing import minmax_scaling

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# for changing datatime type
from sklearn.preprocessing import StandardScaler

# for Box-Cox Transformation
from scipy import stats

# set seed for reproducibility
np.random.seed(0)

2). Load Dataset

In [9]:
training_url = "https://docs.google.com/spreadsheets/d/1Lii15km8Q91IWQKrhY6I0f92ZR2o4st7/edit#gid=560237263"
training_dataset = training_url.replace('/edit#gid=', '/export?format=csv&gid=')

In [10]:
data = pd.read_csv(training_dataset)

  data = pd.read_csv(training_dataset)


3). Make a copy

In [12]:
data = data.copy()

4). Dropping Un-needed Column

In [13]:
data = data.drop(['subs_key', 'b_no','event_action_code', 'destination_poc'], axis = 1)

5). Preprocessing (Change order and name)

In [14]:
print(data.columns)

Index(['day_no', 'time_no', 'service_filter', 'payment_cat', 'subs_no',
       'prefix_dom_sk_id', 'originating_cell_id', 'home_poc', 'city_name',
       'physical_poc', 'roam_zone', 'rem_bal', 'rtd_amt', 'free_unit',
       'usg_unit', 'sid'],
      dtype='object')


In [15]:
col_name = [
    'date',
    'date_time',
    'service_type',
    'payment_type',
    'subs_id',
    'prefix_domain',
    'cell_id',
    'home_poc',
    'city_name',
    'physical_poc',
    'roam_zone',
    'remaining_balance',
    'amount_of_trx',
    'free_unit',
    'usage_unit',
    'sid'
]

data.columns = col_name
print(data.columns)

Index(['date', 'date_time', 'service_type', 'payment_type', 'subs_id',
       'prefix_domain', 'cell_id', 'home_poc', 'city_name', 'physical_poc',
       'roam_zone', 'remaining_balance', 'amount_of_trx', 'free_unit',
       'usage_unit', 'sid'],
      dtype='object')


In [16]:
order = [
    'date',
    'date_time',
    'service_type',
    'payment_type',
    'subs_id',
    'prefix_domain',
    'cell_id',
    'home_poc',
    'city_name',
    'physical_poc',
    'roam_zone',
    #'destination_poc',
    'sid',
    'remaining_balance',
    'amount_of_trx',
    'free_unit',
    'usage_unit'
]

data = data[order]


In [17]:
data.head()

Unnamed: 0,date,date_time,service_type,payment_type,subs_id,prefix_domain,cell_id,home_poc,city_name,physical_poc,roam_zone,sid,remaining_balance,amount_of_trx,free_unit,usage_unit
0,1/9/2021,1/9/2021 21:24,SMSBAS,PRE,2106705274,5636,510.11.34314.18692,JK0,JAKARTA,KA0,LOCXLM,SILVER,673,50,0.0,1
1,1/9/2021,1/9/2021 21:29,SMSBAS,PRE,2106705274,5636,510.11.34314.38401,JK0,JAKARTA,KA0,LOCXLM,SILVER,623,50,0.0,1
2,1/9/2021,1/9/2021 21:36,SMSBAS,PRE,2106705274,5636,510.11.34314.18693,JK0,JAKARTA,KA0,LOCXLM,SILVER,573,50,0.0,1
3,1/9/2021,1/9/2021 7:43,VASOCC,PRE,1075588229,-1,0.0.0.0,NOPOC,JAKARTA,B,LOCXLM,SILVER,7220,5500,0.0,1
4,1/9/2021,1/9/2021 4:24,CONTEN,PRE,1075588229,-1,510.11.411042.3,JK0,JAKARTA,JK0,NATZ10,TRIBE,101720,55,,1048576


5.2). Missing Values

In [18]:
missing_values_count = data.isnull().sum() + data.isin(['', ' ','00000000', None, np.nan]).sum()
missing_values_count

date                     0
date_time                0
service_type             0
payment_type             0
subs_id                  0
prefix_domain            0
cell_id                  0
home_poc                 0
city_name                0
physical_poc             0
roam_zone                0
sid                      0
remaining_balance        0
amount_of_trx            0
free_unit            29292
usage_unit               0
dtype: int64

In [19]:
data = data.fillna(0)
data = data.replace(['', ' ', None], 0)

In [20]:
missing_values_count = data.isnull().sum() + data.isin(['', ' ', None]).sum()
missing_values_count

date                 0
date_time            0
service_type         0
payment_type         0
subs_id              0
prefix_domain        0
cell_id              0
home_poc             0
city_name            0
physical_poc         0
roam_zone            0
sid                  0
remaining_balance    0
amount_of_trx        0
free_unit            0
usage_unit           0
dtype: int64

6). Data type conversion for numerical

In [21]:
data[['remaining_balance', 'amount_of_trx', 'free_unit', 'usage_unit']].dtypes

remaining_balance     object
amount_of_trx          int64
free_unit            float64
usage_unit             int64
dtype: object

In [22]:
def non_numeric(x):
    non_numeric_data = pd.DataFrame(data[data[x].astype(str).str.contains('[^0-9.-]')])
    return non_numeric_data

In [23]:
non_numeric('remaining_balance')

Unnamed: 0,date,date_time,service_type,payment_type,subs_id,prefix_domain,cell_id,home_poc,city_name,physical_poc,roam_zone,sid,remaining_balance,amount_of_trx,free_unit,usage_unit
1108,8/9/2021,8/9/2021 14:36,CONTEN,PRE,545592898,-1,510.11.220267.9,BD0,BANDUNG,MD0,NATZ10,CATCHALL,2425:06:00,35,0.0,3993600
1109,8/9/2021,8/9/2021 14:36,CONTEN,PRE,545592898,-1,510.11.220267.9,BD0,BANDUNG,MD0,NATZ10,VOLDUO,2423:06:00,2,0.0,102400
1110,8/9/2021,8/9/2021 14:36,CONTEN,PRE,545592898,-1,510.11.220267.9,BD0,BANDUNG,MD0,NATZ10,VOLVIKI,2421:06:00,2,0.0,102400
1111,8/9/2021,8/9/2021 14:36,CONTEN,PRE,545592898,-1,510.11.220267.9,BD0,BANDUNG,MD0,NATZ10,VOLIG,2419:06:00,2,0.0,102400
1112,8/9/2021,8/9/2021 14:36,CONTEN,PRE,545592898,-1,510.11.220267.9,BD0,BANDUNG,MD0,NATZ10,NETWORK,2417:06:00,2,0.0,102400
1113,8/9/2021,8/9/2021 14:36,CONTEN,PRE,545592898,-1,510.11.220267.9,BD0,BANDUNG,MD0,NATZ10,VOLMFISH,2415:06:00,2,0.0,102400


In [24]:
def non_numeric(x):
    non_numeric_mask = data[x].astype(str).str.contains('[^0-9.-]')
    data.drop(data[non_numeric_mask].index, inplace=True)
    return data

non_numeric('remaining_balance')

Unnamed: 0,date,date_time,service_type,payment_type,subs_id,prefix_domain,cell_id,home_poc,city_name,physical_poc,roam_zone,sid,remaining_balance,amount_of_trx,free_unit,usage_unit
0,1/9/2021,1/9/2021 21:24,SMSBAS,PRE,2106705274,5636,510.11.34314.18692,JK0,JAKARTA,KA0,LOCXLM,SILVER,673,50,0.0,1
1,1/9/2021,1/9/2021 21:29,SMSBAS,PRE,2106705274,5636,510.11.34314.38401,JK0,JAKARTA,KA0,LOCXLM,SILVER,623,50,0.0,1
2,1/9/2021,1/9/2021 21:36,SMSBAS,PRE,2106705274,5636,510.11.34314.18693,JK0,JAKARTA,KA0,LOCXLM,SILVER,573,50,0.0,1
3,1/9/2021,1/9/2021 7:43,VASOCC,PRE,1075588229,-1,0.0.0.0,NOPOC,JAKARTA,B,LOCXLM,SILVER,7220,5500,0.0,1
4,1/9/2021,1/9/2021 4:24,CONTEN,PRE,1075588229,-1,510.11.411042.3,JK0,JAKARTA,JK0,NATZ10,TRIBE,101720,55,0.0,1048576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43942,30/4/2022,4/30/22 18:22,NATNAL,PRE,712325998,3207,510.11.25190.31588,JK0,JAKARTA,BD6,NGBXLM,SILVER,2206.0,3950,0.0,2460
43943,30/4/2022,4/30/22 7:49,NATNAL,PRE,712325998,3207,510.11.25190.31588,JK0,JAKARTA,BD6,NGBXLM,SILVER,6788.0,346,0.0,470
43944,30/4/2022,4/30/22 20:39,CONTEN,PRE,1182349800,-1,510.11.540155.1,JK0,JAKARTA,SB0,NATZ10,VOLYOUTUBE,1567.0,668,0.0,34201600
43945,30/4/2022,4/30/22 11:27,CONTEN,PRE,1182349800,-1,510.11.540155.1,JK0,JAKARTA,SB0,NATZ10,VOLYOUTUBE,5415.0,6,0.0,307200


In [25]:
#non_numeric('remaining_balance')

def non_numeric(x):
    non_numeric_data = pd.DataFrame(data[data[x].astype(str).str.contains('[^0-9.-]')])
    return non_numeric_data

In [26]:
non_numeric('amount_of_trx')

Unnamed: 0,date,date_time,service_type,payment_type,subs_id,prefix_domain,cell_id,home_poc,city_name,physical_poc,roam_zone,sid,remaining_balance,amount_of_trx,free_unit,usage_unit


In [27]:
non_numeric('free_unit')

Unnamed: 0,date,date_time,service_type,payment_type,subs_id,prefix_domain,cell_id,home_poc,city_name,physical_poc,roam_zone,sid,remaining_balance,amount_of_trx,free_unit,usage_unit


In [28]:
non_numeric('usage_unit')

Unnamed: 0,date,date_time,service_type,payment_type,subs_id,prefix_domain,cell_id,home_poc,city_name,physical_poc,roam_zone,sid,remaining_balance,amount_of_trx,free_unit,usage_unit


In [29]:
# show data types from each column
print(data.dtypes)

date                  object
date_time             object
service_type          object
payment_type          object
subs_id                int64
prefix_domain          int64
cell_id               object
home_poc              object
city_name             object
physical_poc          object
roam_zone             object
sid                   object
remaining_balance     object
amount_of_trx          int64
free_unit            float64
usage_unit             int64
dtype: object


In [30]:
data['amount_of_trx'] = data['amount_of_trx'].astype(float)
data['remaining_balance'] = data['remaining_balance'].astype(float)
data['usage_unit'] = data['usage_unit'].astype(float)

In [31]:
# show data types from each column
print(data.dtypes)

date                  object
date_time             object
service_type          object
payment_type          object
subs_id                int64
prefix_domain          int64
cell_id               object
home_poc              object
city_name             object
physical_poc          object
roam_zone             object
sid                   object
remaining_balance    float64
amount_of_trx        float64
free_unit            float64
usage_unit           float64
dtype: object


7). Split Time from 'data_time' cloumn

In [32]:
print(data.dtypes)

date                  object
date_time             object
service_type          object
payment_type          object
subs_id                int64
prefix_domain          int64
cell_id               object
home_poc              object
city_name             object
physical_poc          object
roam_zone             object
sid                   object
remaining_balance    float64
amount_of_trx        float64
free_unit            float64
usage_unit           float64
dtype: object


In [33]:
#drop the date column
data = data.drop('date', axis=1)