In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler #you can use minmax scaler too
from sklearn.ensemble import GradientBoostingClassifier
#Import other necessary model libraries, for this example, using Logistic Regression

### Importing the Dataset

In [2]:
train_data = pd.read_csv("data/train_dataset.csv")

In [3]:
train_data.tail()

Unnamed: 0,primary_identifier,device_type,device_category,gender,district_name,age_group,network_stay,average_monthly_bill_amount,dusage_sum,dusage_min,...,vusage_offnet_max,vusage_offnet_min,vusage_offnet_days,vusage_offnet_avg,vusage_offnet_stddev,number_of_fixed_bb_accounts,number_of_iptv_accounts,add_on_tot_rental,add_on_count,next_month_plan
10495,1438655,4G,Smartphone,MALE,Kalutara,40-50,64,2088.13,479.0,0.346959,...,29.866667,0.0,7.0,14.571429,8.017054,0.0,0.0,,,PKG6
10496,7827264,4G,Smartphone,FEMALE,Colombo,<20,6,2053.5,23168.0,0.219985,...,25.366667,0.0,3.0,10.861111,5.151029,0.0,0.0,580.0,3.0,PKG1
10497,1433957,4G,Smartphone,FEMALE,Colombo,30-40,69,2827.33,4553.0,4.831711,...,0.0,0.0,0.0,,0.0,0.0,1.0,,,PKG6
10498,8494507,4G,Smartphone,MALE,Kandy,60-70,233,645.43,2069.0,4.722031,...,18.65,0.0,18.0,5.887963,4.956822,0.0,1.0,,,PKG1
10499,3569439,3G,Smartphone,MALE,Puttalam,40-50,33,498.2,536.0,0.003048,...,4.366667,0.0,7.0,2.228571,1.096541,0.0,0.0,,,PKG1


### Label Encoding the y variable

In [4]:
train_data['next_month_plan'] = train_data['next_month_plan'].astype('category')

In [5]:
train_data['next_month_plan'].unique()

['PKG2', 'PKG1', 'PKG6', 'PKG4', 'PKG5', 'PKG3', 'PKG8', 'PKG7']
Categories (8, object): ['PKG2', 'PKG1', 'PKG6', 'PKG4', 'PKG5', 'PKG3', 'PKG8', 'PKG7']

In [6]:
labels_x = ['PKG0','PKG1', 'PKG2', 'PKG3', 'PKG4', 'PKG5', 'PKG6', 'PKG7', 'PKG8']

In [7]:
le = preprocessing.LabelEncoder()

In [8]:
le.fit(labels_x)

LabelEncoder()

In [9]:
le.classes_

array(['PKG0', 'PKG1', 'PKG2', 'PKG3', 'PKG4', 'PKG5', 'PKG6', 'PKG7',
       'PKG8'], dtype='<U4')

In [10]:
train_data['encoded_class_labels'] = le.transform(train_data['next_month_plan'])

In [12]:
train_data.head()

Unnamed: 0,primary_identifier,device_type,device_category,gender,district_name,age_group,network_stay,average_monthly_bill_amount,dusage_sum,dusage_min,...,vusage_offnet_min,vusage_offnet_days,vusage_offnet_avg,vusage_offnet_stddev,number_of_fixed_bb_accounts,number_of_iptv_accounts,add_on_tot_rental,add_on_count,next_month_plan,encoded_class_labels
0,6875768,4G,Smartphone,MALE,Kegalle,40-50,219,761.62,6317.0,2.773958,...,0.0,27.0,25.099383,34.759221,0.0,1.0,,,PKG2,2
1,6406277,4G,Smartphone,MALE,Galle,40-50,36,2482.52,28365.0,3.079438,...,0.0,15.0,5.071111,6.779644,0.0,0.0,600.0,2.0,PKG1,1
2,3563570,4G,Smartphone,MALE,Kandy,30-40,186,1565.55,18581.0,48.193713,...,0.0,0.0,,0.0,0.0,0.0,0.0,1.0,PKG6,6
3,1853666,4G,Smartphone,FEMALE,Colombo,40-50,24,2397.49,91071.0,22.437842,...,0.0,0.0,,0.0,0.0,0.0,1110.0,2.0,PKG4,4
4,2794331,4G,Smartphone,FEMALE,Kandy,50-60,37,2501.75,19736.0,53.302092,...,0.0,23.0,11.256522,16.224071,0.0,1.0,1350.0,2.0,PKG4,4


### Handling Missing values

In [13]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   primary_identifier           10500 non-null  int64   
 1   device_type                  10477 non-null  object  
 2   device_category              10470 non-null  object  
 3   gender                       10365 non-null  object  
 4   district_name                10500 non-null  object  
 5   age_group                    10365 non-null  object  
 6   network_stay                 10500 non-null  int64   
 7   average_monthly_bill_amount  10500 non-null  float64 
 8   dusage_sum                   10500 non-null  float64 
 9   dusage_min                   10500 non-null  float64 
 10  dusage_max                   10500 non-null  float64 
 11  dusage_avg                   9642 non-null   float64 
 12  dusage_days                  10500 non-null  float64 
 13  d

#### Removing the missing values in categorical variables

In [14]:
train_data = train_data.dropna(subset = ['device_type', 'device_category', 'gender', 'age_group'], axis = 0)

In [15]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10337 entries, 0 to 10499
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   primary_identifier           10337 non-null  int64   
 1   device_type                  10337 non-null  object  
 2   device_category              10337 non-null  object  
 3   gender                       10337 non-null  object  
 4   district_name                10337 non-null  object  
 5   age_group                    10337 non-null  object  
 6   network_stay                 10337 non-null  int64   
 7   average_monthly_bill_amount  10337 non-null  float64 
 8   dusage_sum                   10337 non-null  float64 
 9   dusage_min                   10337 non-null  float64 
 10  dusage_max                   10337 non-null  float64 
 11  dusage_avg                   9513 non-null   float64 
 12  dusage_days                  10337 non-null  float64 
 13  d

#### Adding zeros to the quantitative variables

In [16]:
train_data = train_data.drop(columns = ['next_month_plan', 'vusage_onnet_avg'])

In [18]:
train_data.isnull().sum()

primary_identifier                0
device_type                       0
device_category                   0
gender                            0
district_name                     0
age_group                         0
network_stay                      0
average_monthly_bill_amount       0
dusage_sum                        0
dusage_min                        0
dusage_max                        0
dusage_avg                      824
dusage_days                       0
dusage_stddev                     0
vusage_onnet_sum                  0
vusage_onnet_max                  0
vusage_onnet_min                  0
vusage_onnet_days                 0
vusage_onnet_stddev               0
vusage_offnet_sum                 0
vusage_offnet_max                 0
vusage_offnet_min                 0
vusage_offnet_days                0
vusage_offnet_avg              3218
vusage_offnet_stddev              0
number_of_fixed_bb_accounts       0
number_of_iptv_accounts           0
add_on_tot_rental           

In [19]:
train_data['dusage_avg'] = train_data['dusage_avg'].fillna(value = 0)
train_data['vusage_offnet_avg'] = train_data['vusage_offnet_avg'].fillna(value = 0)
train_data['add_on_tot_rental'] = train_data['add_on_tot_rental'].fillna(value = 0)
train_data['add_on_count'] = train_data['add_on_count'].fillna(value = 0)

In [23]:
train_data.isnull().sum()

primary_identifier             0
device_type                    0
device_category                0
gender                         0
district_name                  0
age_group                      0
network_stay                   0
average_monthly_bill_amount    0
dusage_sum                     0
dusage_min                     0
dusage_max                     0
dusage_avg                     0
dusage_days                    0
dusage_stddev                  0
vusage_onnet_sum               0
vusage_onnet_max               0
vusage_onnet_min               0
vusage_onnet_days              0
vusage_onnet_stddev            0
vusage_offnet_sum              0
vusage_offnet_max              0
vusage_offnet_min              0
vusage_offnet_days             0
vusage_offnet_avg              0
vusage_offnet_stddev           0
number_of_fixed_bb_accounts    0
number_of_iptv_accounts        0
add_on_tot_rental              0
add_on_count                   0
encoded_class_labels           0
dtype: int

In [24]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10337 entries, 0 to 10499
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   primary_identifier           10337 non-null  int64  
 1   device_type                  10337 non-null  object 
 2   device_category              10337 non-null  object 
 3   gender                       10337 non-null  object 
 4   district_name                10337 non-null  object 
 5   age_group                    10337 non-null  object 
 6   network_stay                 10337 non-null  int64  
 7   average_monthly_bill_amount  10337 non-null  float64
 8   dusage_sum                   10337 non-null  float64
 9   dusage_min                   10337 non-null  float64
 10  dusage_max                   10337 non-null  float64
 11  dusage_avg                   10337 non-null  float64
 12  dusage_days                  10337 non-null  float64
 13  dusage_stddev   

### One-Hot encoding the Categorical variables

In [25]:
categorical_variables = ['device_type', 'device_category', 'gender', 'age_group', 'district_name']

In [26]:
for var in categorical_variables:
    cat_list='var'+'_'+var
    cat_list = pd.get_dummies(train_data[var], prefix=var)
    train_data_1=train_data.join(cat_list)
    train_data=train_data_1

In [27]:
train_data_vars=train_data.columns.values.tolist()
to_keep=[i for i in train_data_vars if i not in categorical_variables]

In [28]:
train_data_final=train_data[to_keep]
train_data_final.columns.values

array(['primary_identifier', 'network_stay',
       'average_monthly_bill_amount', 'dusage_sum', 'dusage_min',
       'dusage_max', 'dusage_avg', 'dusage_days', 'dusage_stddev',
       'vusage_onnet_sum', 'vusage_onnet_max', 'vusage_onnet_min',
       'vusage_onnet_days', 'vusage_onnet_stddev', 'vusage_offnet_sum',
       'vusage_offnet_max', 'vusage_offnet_min', 'vusage_offnet_days',
       'vusage_offnet_avg', 'vusage_offnet_stddev',
       'number_of_fixed_bb_accounts', 'number_of_iptv_accounts',
       'add_on_tot_rental', 'add_on_count', 'encoded_class_labels',
       'device_type_2G', 'device_type_3G', 'device_type_4G',
       'device_category_Basic', 'device_category_Feature phone',
       'device_category_Modem', 'device_category_Pluggable card',
       'device_category_Smartphone', 'device_category_Tablet',
       'gender_FEMALE', 'gender_MALE', 'age_group_20-30',
       'age_group_30-40', 'age_group_40-50', 'age_group_50-60',
       'age_group_60-70', 'age_group_<20', 'age_gr

In [29]:
train_data_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10337 entries, 0 to 10499
Data columns (total 69 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   primary_identifier              10337 non-null  int64  
 1   network_stay                    10337 non-null  int64  
 2   average_monthly_bill_amount     10337 non-null  float64
 3   dusage_sum                      10337 non-null  float64
 4   dusage_min                      10337 non-null  float64
 5   dusage_max                      10337 non-null  float64
 6   dusage_avg                      10337 non-null  float64
 7   dusage_days                     10337 non-null  float64
 8   dusage_stddev                   10337 non-null  float64
 9   vusage_onnet_sum                10337 non-null  float64
 10  vusage_onnet_max                10337 non-null  float64
 11  vusage_onnet_min                10337 non-null  float64
 12  vusage_onnet_days               

### Selecting X and y

In [74]:
train_data_final.to_csv("updated_train_data.csv")

In [30]:
y = train_data_final['encoded_class_labels']

In [31]:
X = train_data_final.drop(columns=['encoded_class_labels', 'primary_identifier'])

In [66]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=10, test_size = 0.05, stratify = y)

### Scaling the variables

In [67]:
standard_X = StandardScaler()
X_train = standard_X.fit_transform(X_train)
X_test = standard_X.transform(X_test)

In [68]:
X_test.shape

(517, 67)

### Fitting X and y to a model

In [69]:
model = GradientBoostingClassifier()

In [70]:
model.fit(X_train, Y_train) #training the model

GradientBoostingClassifier()

### Test set predictions and classification report

In [71]:
y_pred = model.predict(X_test)

In [72]:
# Printing confusion matrix
confusion_matrix(Y_test, y_pred)

array([[146,   1,  25,   3,   5,   1,   0,   0],
       [ 39,   7,  19,   2,   0,   0,   0,   0],
       [ 16,   1,  63,   5,   0,   0,   0,   0],
       [  2,   2,  26,   3,   3,   0,   0,   0],
       [  4,   2,   7,   2,  54,   7,   0,   1],
       [  0,   0,   1,   2,  18,  12,   0,   3],
       [  1,   0,   1,   0,   5,   3,   4,   4],
       [  0,   0,   0,   0,   0,   6,   2,   9]])

In [73]:
# Getting f1 score
from sklearn.metrics import classification_report
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           1       0.70      0.81      0.75       181
           2       0.54      0.10      0.18        67
           3       0.44      0.74      0.56        85
           4       0.18      0.08      0.11        36
           5       0.64      0.70      0.67        77
           6       0.41      0.33      0.37        36
           7       0.67      0.22      0.33        18
           8       0.53      0.53      0.53        17

    accuracy                           0.58       517
   macro avg       0.51      0.44      0.44       517
weighted avg       0.56      0.58      0.54       517



In [53]:
from collections import Counter
from imblearn.over_sampling import SMOTE

In [54]:
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({1: 3622, 3: 1695, 5: 1539, 2: 1342, 4: 725, 6: 719, 7: 350, 8: 345})


In [55]:
sm = SMOTE(random_state=42)

In [56]:
X_res, y_res = sm.fit_resample(X, y)

In [57]:
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({2: 3622, 1: 3622, 6: 3622, 4: 3622, 5: 3622, 3: 3622, 8: 3622, 7: 3622})


In [58]:
X_train_res, X_test_res, Y_train_res, Y_test_res = train_test_split(X_res, y_res, random_state=10, test_size = 0.05)

In [59]:
X_test_res.shape

(1449, 67)

In [60]:
model_res = GradientBoostingClassifier()

In [61]:
model_res.fit(X_train_res, Y_train_res) #training the model

GradientBoostingClassifier()

In [63]:
y_pred_res = model.predict(X_test_res)

In [64]:
# Printing confusion matrix
confusion_matrix(Y_test_res, y_pred_res)

array([[  0,   1,  11,  13,   0,   3,   1, 165],
       [  0,   0,   8,   2,   0,   3,   0, 179],
       [  0,   1,   2,   6,   0,   2,   0, 150],
       [  0,   0,   0,   3,   0,   0,   0, 187],
       [  0,   0,   6,   3,   0,   2,   0, 175],
       [  0,   2,   2,   1,   0,   1,   2, 163],
       [  0,   1,   2,   0,   0,   1,   0, 172],
       [  0,   0,   1,   1,   0,   1,   1, 175]])

In [65]:
# Getting f1 score
from sklearn.metrics import classification_report
print(classification_report(Y_test_res, y_pred_res))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       194
           2       0.00      0.00      0.00       192
           3       0.06      0.01      0.02       161
           4       0.10      0.02      0.03       190
           5       0.00      0.00      0.00       186
           6       0.08      0.01      0.01       171
           7       0.00      0.00      0.00       176
           8       0.13      0.98      0.23       179

    accuracy                           0.12      1449
   macro avg       0.05      0.13      0.04      1449
weighted avg       0.05      0.12      0.04      1449



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

#Other files
.DS_Store
