## Import the needed libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import xgboost as xgb
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import TomekLinks
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
#Import other necessary model libraries, for this example, using Logistic Regression

## Import the training data

In [2]:
train_data = pd.read_csv("../data/train_dataset_v2.csv")

In [3]:
train_data.tail()

Unnamed: 0,primary_identifier,device_type,device_category,gender,district_name,age_group,network_stay,average_monthly_bill_amount,dusage_sum,dusage_min,...,vusage_offnet_min,vusage_offnet_diff,vusage_offnet_days,vusage_offnet_avg,vusage_offnet_stddev,number_of_fixed_bb_accounts,number_of_iptv_accounts,add_on_tot_rental,add_on_count,next_month_plan
10495,1438655,4G,Smartphone,MALE,Kalutara,40-50,64,2088.13,479,0.346959,...,0.0,29.866667,7,14.571429,8.017054,0,0,,,PKG6
10496,7827264,4G,Smartphone,FEMALE,Colombo,<20,6,2053.5,23168,0.219985,...,0.0,25.366667,3,10.861111,5.151029,0,0,580.0,3.0,PKG1
10497,1433957,4G,Smartphone,FEMALE,Colombo,30-40,69,2827.33,4553,4.831711,...,0.0,0.0,0,,0.0,0,1,,,PKG6
10498,8494507,4G,Smartphone,MALE,Kandy,60-70,233,645.43,2069,4.722031,...,0.0,18.65,18,5.887963,4.956822,0,1,,,PKG1
10499,3569439,3G,Smartphone,MALE,Puttalam,40-50,33,498.2,536,0.003048,...,0.0,4.366667,7,2.228571,1.096541,0,0,,,PKG1


## Encoding Labels

In [4]:
train_data['next_month_plan'] = train_data['next_month_plan'].astype('category')

In [5]:
train_data['next_month_plan'].unique()

['PKG2', 'PKG1', 'PKG6', 'PKG4', 'PKG5', 'PKG3', 'PKG8', 'PKG7']
Categories (8, object): ['PKG2', 'PKG1', 'PKG6', 'PKG4', 'PKG5', 'PKG3', 'PKG8', 'PKG7']

In [6]:
labels_x = ['PKG0','PKG1', 'PKG2', 'PKG3', 'PKG4', 'PKG5', 'PKG6', 'PKG7', 'PKG8']

In [7]:
le = preprocessing.LabelEncoder()

In [8]:
le.fit(labels_x)

LabelEncoder()

In [9]:
le.classes_

array(['PKG0', 'PKG1', 'PKG2', 'PKG3', 'PKG4', 'PKG5', 'PKG6', 'PKG7',
       'PKG8'], dtype='<U4')

In [10]:
train_data['encoded_class_labels'] = le.transform(train_data['next_month_plan'])

In [11]:
train_data.head(20)

Unnamed: 0,primary_identifier,device_type,device_category,gender,district_name,age_group,network_stay,average_monthly_bill_amount,dusage_sum,dusage_min,...,vusage_offnet_diff,vusage_offnet_days,vusage_offnet_avg,vusage_offnet_stddev,number_of_fixed_bb_accounts,number_of_iptv_accounts,add_on_tot_rental,add_on_count,next_month_plan,encoded_class_labels
0,6875768,4G,Smartphone,MALE,Kegalle,40-50,219,761.62,6317,2.773958,...,152.866667,27,25.099383,34.759221,0,1,,,PKG2,2
1,6406277,4G,Smartphone,MALE,Galle,40-50,36,2482.52,28365,3.079438,...,35.433333,15,5.071111,6.779644,0,0,600.0,2.0,PKG1,1
2,3563570,4G,Smartphone,MALE,Kandy,30-40,186,1565.55,18581,48.193713,...,0.0,0,,0.0,0,0,0.0,1.0,PKG6,6
3,1853666,4G,Smartphone,FEMALE,Colombo,40-50,24,2397.49,91071,22.437842,...,0.0,0,,0.0,0,0,1110.0,2.0,PKG4,4
4,2794331,4G,Smartphone,FEMALE,Kandy,50-60,37,2501.75,19736,53.302092,...,87.616667,23,11.256522,16.224071,0,1,1350.0,2.0,PKG4,4
5,3619017,2G,Basic,MALE,Galle,40-50,30,495.54,0,0.001531,...,1.066667,3,0.7,0.233432,0,0,,,PKG1,1
6,7633257,4G,Smartphone,MALE,Kegalle,30-40,35,1977.19,14782,205.394299,...,0.0,0,,0.0,0,0,110.0,1.0,PKG5,5
7,5507217,4G,Smartphone,MALE,Gampaha,30-40,170,2925.0,3473,6.426329,...,0.0,0,,0.0,0,0,650.0,1.0,PKG5,5
8,7566055,4G,Smartphone,MALE,Kalutara,20-30,48,473.59,3870,0.0,...,22.883333,3,11.338889,4.661264,0,0,110.0,1.0,PKG2,2
9,7324903,4G,Smartphone,MALE,Colombo,20-30,81,1324.6,20876,3.450117,...,6.816667,6,1.922222,1.310881,0,0,360.0,2.0,PKG2,2


## Handling Missing Values

In [12]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   primary_identifier           10500 non-null  int64   
 1   device_type                  10477 non-null  object  
 2   device_category              10470 non-null  object  
 3   gender                       10365 non-null  object  
 4   district_name                10500 non-null  object  
 5   age_group                    10365 non-null  object  
 6   network_stay                 10500 non-null  int64   
 7   average_monthly_bill_amount  10500 non-null  float64 
 8   dusage_sum                   10500 non-null  int64   
 9   dusage_min                   10500 non-null  float64 
 10  dusage_max                   10500 non-null  float64 
 11  dusage_diff                  10500 non-null  float64 
 12  dusage_avg                   9642 non-null   float64 
 13  d

In [13]:
train_data['device_category'].unique()

array(['Smartphone', 'Basic', 'Feature phone', 'Pluggable card', 'Tablet',
       nan, 'Modem'], dtype=object)

In [14]:
train_data['device_type'] = train_data['device_type'].fillna(value = 'Unknown')
train_data['device_category'] = train_data['device_category'].fillna(value = 'Unknown')
train_data['gender'] = train_data['gender'].fillna(value = 'Unknown')
train_data['age_group'] = train_data['age_group'].fillna(value = 'Unknown')

In [15]:
train_data = train_data.drop(columns = ['next_month_plan'])

In [16]:
train_data['dusage_avg'] = train_data['dusage_avg'].fillna(value = 0)
train_data['vusage_offnet_avg'] = train_data['vusage_offnet_avg'].fillna(value = 0)
train_data['add_on_tot_rental'] = train_data['add_on_tot_rental'].fillna(value = -1)
train_data['add_on_count'] = train_data['add_on_count'].fillna(value = 0)
train_data['vusage_onnet_avg'] = train_data['vusage_onnet_avg'].fillna(value = 0)

In [17]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   primary_identifier           10500 non-null  int64  
 1   device_type                  10500 non-null  object 
 2   device_category              10500 non-null  object 
 3   gender                       10500 non-null  object 
 4   district_name                10500 non-null  object 
 5   age_group                    10500 non-null  object 
 6   network_stay                 10500 non-null  int64  
 7   average_monthly_bill_amount  10500 non-null  float64
 8   dusage_sum                   10500 non-null  int64  
 9   dusage_min                   10500 non-null  float64
 10  dusage_max                   10500 non-null  float64
 11  dusage_diff                  10500 non-null  float64
 12  dusage_avg                   10500 non-null  float64
 13  dusage_days     

## Creating pre-processing pipeline

In [18]:
train_data.columns

Index(['primary_identifier', 'device_type', 'device_category', 'gender',
       'district_name', 'age_group', 'network_stay',
       'average_monthly_bill_amount', 'dusage_sum', 'dusage_min', 'dusage_max',
       'dusage_diff', 'dusage_avg', 'dusage_days', 'dusage_stddev',
       'vusage_onnet_sum', 'vusage_onnet_max', 'vusage_onnet_min',
       'vusage_onnet_diff', 'vusage_onnet_days', 'vusage_onnet_avg',
       'vusage_onnet_stddev', 'vusage_offnet_sum', 'vusage_offnet_max',
       'vusage_offnet_min', 'vusage_offnet_diff', 'vusage_offnet_days',
       'vusage_offnet_avg', 'vusage_offnet_stddev',
       'number_of_fixed_bb_accounts', 'number_of_iptv_accounts',
       'add_on_tot_rental', 'add_on_count', 'encoded_class_labels'],
      dtype='object')

In [19]:
def get_col_types(dataframe):
    data_cat_cols = []
    data_quan_cols = []
    
    for col in np.array(dataframe.columns):
        if dataframe[col].dtype == 'int64' or dataframe[col].dtype == 'float64':
            data_quan_cols.append(col)
        elif dataframe[col].dtype == 'O':
            data_cat_cols.append(col)
            
    return data_cat_cols, data_quan_cols

In [21]:
X = train_data.drop(columns=['encoded_class_labels', 'primary_identifier'])

In [22]:
data_cat_cols, data_quan_cols = get_col_types(X)

In [23]:
data_cat_cols

['device_type', 'device_category', 'gender', 'district_name', 'age_group']

In [24]:
data_quan_cols

['network_stay',
 'average_monthly_bill_amount',
 'dusage_sum',
 'dusage_min',
 'dusage_max',
 'dusage_diff',
 'dusage_avg',
 'dusage_days',
 'dusage_stddev',
 'vusage_onnet_sum',
 'vusage_onnet_max',
 'vusage_onnet_min',
 'vusage_onnet_diff',
 'vusage_onnet_days',
 'vusage_onnet_avg',
 'vusage_onnet_stddev',
 'vusage_offnet_sum',
 'vusage_offnet_max',
 'vusage_offnet_min',
 'vusage_offnet_diff',
 'vusage_offnet_days',
 'vusage_offnet_avg',
 'vusage_offnet_stddev',
 'number_of_fixed_bb_accounts',
 'number_of_iptv_accounts',
 'add_on_tot_rental',
 'add_on_count']

In [25]:
y = train_data['encoded_class_labels']

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=10, test_size = 0.05, stratify = y)

In [27]:
X_train.head(20)

Unnamed: 0,device_type,device_category,gender,district_name,age_group,network_stay,average_monthly_bill_amount,dusage_sum,dusage_min,dusage_max,...,vusage_offnet_max,vusage_offnet_min,vusage_offnet_diff,vusage_offnet_days,vusage_offnet_avg,vusage_offnet_stddev,number_of_fixed_bb_accounts,number_of_iptv_accounts,add_on_tot_rental,add_on_count
6155,4G,Smartphone,MALE,Colombo,60-70,39,831.92,34164,425.700622,1789.25268,...,3.6,0.0,3.6,5,1.103333,0.672225,1,2,360.0,2.0
2288,4G,Smartphone,MALE,Kandy,20-30,93,2677.35,11984,2.27036,1055.490074,...,0.0,0.0,0.0,0,0.0,0.0,0,0,-1.0,0.0
9714,4G,Smartphone,MALE,Colombo,40-50,92,2627.77,19830,114.69436,1129.623393,...,0.0,0.0,0.0,0,0.0,0.0,0,0,-1.0,0.0
7157,4G,Smartphone,MALE,Colombo,40-50,158,4795.76,8498,13.75556,725.783259,...,17.05,0.0,17.05,22,5.197727,4.52221,0,0,360.0,2.0
738,4G,Smartphone,MALE,Colombo,50-60,51,3978.4,36917,660.364806,2735.215541,...,0.0,0.0,0.0,0,0.0,0.0,0,0,1500.0,1.0
490,4G,Smartphone,FEMALE,Colombo,40-50,133,2582.98,6820,16.4799,766.500066,...,66.483333,0.0,66.483333,18,18.367593,18.339099,0,1,650.0,1.0
467,4G,Smartphone,MALE,Kalutara,40-50,9,906.76,7652,0.043341,958.006304,...,17.7,0.0,17.7,27,3.873457,4.288491,0,0,110.0,2.0
6161,4G,Smartphone,MALE,Colombo,30-40,32,2148.93,10470,24.44814,1649.124297,...,21.683333,0.0,21.683333,9,5.916667,4.704471,0,0,-1.0,0.0
1343,4G,Smartphone,MALE,Gampaha,40-50,10,2759.05,15901,102.143398,1233.143407,...,8.533333,0.0,8.533333,11,4.140909,2.426823,0,0,-1.0,0.0
5792,4G,Smartphone,MALE,Colombo,50-60,198,1069.44,13193,52.539788,1733.79877,...,13.333333,0.0,13.333333,8,3.879167,2.785113,1,0,600.0,2.0


In [28]:
X_train['age_group'].unique()

array(['60-70', '20-30', '40-50', '50-60', '30-40', '>70', 'Unknown',
       '<20'], dtype=object)

In [29]:
X_train.shape

(9975, 32)

In [30]:
quan_pipeline = Pipeline([
    ('std_scaler', RobustScaler())
])

quan_transformed = quan_pipeline.fit_transform(X_train[data_quan_cols])

In [31]:
data_pipeline = ColumnTransformer([
    ('numerical', quan_pipeline, data_quan_cols),
    ('categorical', OrdinalEncoder(), data_cat_cols),
    
])

train_data_processed = data_pipeline.fit_transform(X_train)

In [32]:
train_data_processed[1]

array([ 9.44881890e-02,  5.36438249e-01,  2.59987477e-01, -2.46300869e-03,
        5.14737330e-02,  9.18799625e-02,  2.02782339e-01,  0.00000000e+00,
        1.45573065e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -2.00000000e-01, -3.26498423e-01,  0.00000000e+00, -3.26627219e-01,
       -3.75000000e-01, -4.32542308e-01, -3.23931364e-01,  0.00000000e+00,
        0.00000000e+00, -4.45676275e-01, -5.00000000e-01,  2.00000000e+00,
        4.00000000e+00,  1.00000000e+00,  1.00000000e+01,  0.00000000e+00])

In [33]:
X_test.head()

Unnamed: 0,device_type,device_category,gender,district_name,age_group,network_stay,average_monthly_bill_amount,dusage_sum,dusage_min,dusage_max,...,vusage_offnet_max,vusage_offnet_min,vusage_offnet_diff,vusage_offnet_days,vusage_offnet_avg,vusage_offnet_stddev,number_of_fixed_bb_accounts,number_of_iptv_accounts,add_on_tot_rental,add_on_count
231,4G,Smartphone,FEMALE,Colombo,30-40,11,1716.27,12647,0.0,1875.519728,...,10.366667,0.0,10.366667,4,3.395833,1.946967,0,0,250.0,1.0
9966,4G,Smartphone,MALE,Ratnapura,20-30,9,985.32,1089,0.0,267.741177,...,5.85,0.0,5.85,11,2.775758,1.919775,0,0,-1.0,0.0
486,4G,Smartphone,MALE,Gampaha,40-50,108,3518.42,22732,0.0,1351.566897,...,44.533333,0.0,44.533333,4,28.425,10.514607,0,0,760.0,2.0
9686,4G,Smartphone,MALE,Colombo,30-40,34,2559.86,14666,96.47504,1160.861369,...,0.0,0.0,0.0,0,0.0,0.0,0,0,-1.0,0.0
6310,4G,Smartphone,FEMALE,Trincomalee,30-40,24,1802.86,31567,521.592238,1672.417785,...,24.683333,0.0,24.683333,12,4.397222,4.734033,0,0,450.0,2.0


In [34]:
X_train.columns

Index(['device_type', 'device_category', 'gender', 'district_name',
       'age_group', 'network_stay', 'average_monthly_bill_amount',
       'dusage_sum', 'dusage_min', 'dusage_max', 'dusage_diff', 'dusage_avg',
       'dusage_days', 'dusage_stddev', 'vusage_onnet_sum', 'vusage_onnet_max',
       'vusage_onnet_min', 'vusage_onnet_diff', 'vusage_onnet_days',
       'vusage_onnet_avg', 'vusage_onnet_stddev', 'vusage_offnet_sum',
       'vusage_offnet_max', 'vusage_offnet_min', 'vusage_offnet_diff',
       'vusage_offnet_days', 'vusage_offnet_avg', 'vusage_offnet_stddev',
       'number_of_fixed_bb_accounts', 'number_of_iptv_accounts',
       'add_on_tot_rental', 'add_on_count'],
      dtype='object')

In [35]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 525 entries, 231 to 229
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   device_type                  525 non-null    object 
 1   device_category              525 non-null    object 
 2   gender                       525 non-null    object 
 3   district_name                525 non-null    object 
 4   age_group                    525 non-null    object 
 5   network_stay                 525 non-null    int64  
 6   average_monthly_bill_amount  525 non-null    float64
 7   dusage_sum                   525 non-null    int64  
 8   dusage_min                   525 non-null    float64
 9   dusage_max                   525 non-null    float64
 10  dusage_diff                  525 non-null    float64
 11  dusage_avg                   525 non-null    float64
 12  dusage_days                  525 non-null    int64  
 13  dusage_stddev     

In [36]:
test_data_processed = data_pipeline.transform(X_test)

In [37]:
test_data_processed[0]

array([-0.5511811 , -0.04037991,  0.30150282, -0.03786963,  0.60829292,
        0.69580789,  0.68906446, -2.2       ,  1.07049724,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.04705882,  0.16403786,  0.        ,  0.16410256,
       -0.125     ,  0.14406906,  0.06531578,  0.        ,  0.        ,
        0.11086475,  0.        ,  2.        ,  4.        ,  0.        ,
        4.        ,  1.        ])

### Loading Submission data

In [38]:
submission_data = pd.read_csv("../data/test_dataset_v2.csv")

In [39]:
submission_data.head()

Unnamed: 0,primary_identifier,device_type,device_category,gender,district_name,age_group,network_stay,average_monthly_bill_amount,dusage_sum,dusage_min,...,vusage_offnet_max,vusage_offnet_min,vusage_offnet_diff,vusage_offnet_days,vusage_offnet_avg,vusage_offnet_stddev,number_of_fixed_bb_accounts,number_of_iptv_accounts,add_on_tot_rental,add_on_count
0,2003793,4G,Smartphone,MALE,Colombo,40-50,4,689.42,148,0.0,...,0.0,0.0,0.0,0,,0.0,0,1,,
1,1776101,4G,Smartphone,MALE,Colombo,40-50,7,799.93,300,0.180222,...,16.266667,0.0,16.266667,21,5.107143,4.924111,0,0,,
2,6945050,2G,Basic,MALE,Colombo,20-30,20,1538.45,8215,0.0,...,90.0,0.0,90.0,8,13.558333,16.397071,0,0,110.0,2.0
3,2472049,4G,Smartphone,FEMALE,Matara,40-50,10,995.75,11613,0.0,...,0.416667,0.0,0.416667,2,0.333333,0.087355,0,0,,
4,7197266,4G,Smartphone,MALE,Kandy,30-40,154,553.61,26505,28.515345,...,0.0,0.0,0.0,0,,0.0,0,0,350.0,1.0


## Handling missing values in submission data

In [40]:
submission_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 33 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   primary_identifier           4500 non-null   int64  
 1   device_type                  4487 non-null   object 
 2   device_category              4484 non-null   object 
 3   gender                       4445 non-null   object 
 4   district_name                4500 non-null   object 
 5   age_group                    4445 non-null   object 
 6   network_stay                 4500 non-null   int64  
 7   average_monthly_bill_amount  4500 non-null   float64
 8   dusage_sum                   4500 non-null   int64  
 9   dusage_min                   4500 non-null   float64
 10  dusage_max                   4500 non-null   float64
 11  dusage_diff                  4500 non-null   float64
 12  dusage_avg                   4136 non-null   float64
 13  dusage_days       

In [41]:
submission_data['device_category'].unique()

array(['Smartphone', 'Basic', 'Feature phone', 'Unknown', nan, 'Tablet',
       'Pluggable card', 'Modem'], dtype=object)

In [42]:
submission_data['device_type'] = submission_data['device_type'].fillna(value = 'Unknown')
submission_data['device_category'] = submission_data['device_category'].fillna(value = 'Unknown')
submission_data['gender'] = submission_data['gender'].fillna(value = 'Unknown')
submission_data['age_group'] = submission_data['age_group'].fillna(value = 'Unknown')

In [43]:
submission_data['dusage_avg'] = submission_data['dusage_avg'].fillna(value = 0)
submission_data['vusage_offnet_avg'] = submission_data['vusage_offnet_avg'].fillna(value = 0)
submission_data['add_on_tot_rental'] = submission_data['add_on_tot_rental'].fillna(value = -1)
submission_data['add_on_count'] = submission_data['add_on_count'].fillna(value = 0)
submission_data['vusage_onnet_avg'] = submission_data['vusage_onnet_avg'].fillna(value = 0)

In [44]:
submission_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 33 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   primary_identifier           4500 non-null   int64  
 1   device_type                  4500 non-null   object 
 2   device_category              4500 non-null   object 
 3   gender                       4500 non-null   object 
 4   district_name                4500 non-null   object 
 5   age_group                    4500 non-null   object 
 6   network_stay                 4500 non-null   int64  
 7   average_monthly_bill_amount  4500 non-null   float64
 8   dusage_sum                   4500 non-null   int64  
 9   dusage_min                   4500 non-null   float64
 10  dusage_max                   4500 non-null   float64
 11  dusage_diff                  4500 non-null   float64
 12  dusage_avg                   4500 non-null   float64
 13  dusage_days       

In [45]:
prim_id = submission_data['primary_identifier']

In [46]:
submission_data.shape

(4500, 33)

In [47]:
submission_X = submission_data.drop(columns=['primary_identifier'])

In [48]:
submission_X.shape

(4500, 32)

## Pre-processing the submission data

In [49]:
submission_data_processed = data_pipeline.transform(submission_X)

In [50]:
submission_data_processed[0].shape

(32,)

In [51]:
submission_data_processed[0]

array([-0.60629921, -0.65667172, -0.48115216, -0.03786963, -0.56463354,
       -0.57283961, -0.4593038 , -5.2       , -0.57999961,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.2       , -0.32649842,  0.        , -0.32662722,
       -0.375     , -0.43254231, -0.32393136,  0.        ,  1.        ,
       -0.44567627, -0.5       ,  2.        ,  4.        ,  1.        ,
        4.        ,  2.        ])

## Model Code

In [52]:
unique, counts = np.unique(Y_train, return_counts=True)
dict(zip(unique, counts))

{1: 3490, 2: 1286, 3: 1631, 4: 698, 5: 1480, 6: 703, 7: 346, 8: 341}

In [50]:
strategy = {
    1: 3490,
    2: 1250,
    3: 1600,
    4: 698,
    5: 1450,
    6: 700,
    7: 346,
    8: 341
}

In [51]:
undersample = NearMiss(sampling_strategy=strategy,n_neighbors=2)

In [52]:
X_near , y_near = undersample.fit_resample(train_data_processed,Y_train)

In [53]:
unique, counts = np.unique(y_near, return_counts=True)
dict(zip(unique, counts))

{1: 3490, 2: 1250, 3: 1600, 4: 698, 5: 1450, 6: 700, 7: 346, 8: 341}

In [53]:
strategy_os = {
    1: 3500,
    2: 3500,
    3: 3500,
    4: 3500,
    5: 3500,
    6: 3500,
    7: 3500,
    8: 3500
}

In [54]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42, sampling_strategy = strategy_os)
X_res, y_res = sm.fit_resample(train_data_processed, Y_train)

  f"After over-sampling, the number of samples ({n_samples})"
  f"After over-sampling, the number of samples ({n_samples})"
  f"After over-sampling, the number of samples ({n_samples})"
  f"After over-sampling, the number of samples ({n_samples})"
  f"After over-sampling, the number of samples ({n_samples})"
  f"After over-sampling, the number of samples ({n_samples})"
  f"After over-sampling, the number of samples ({n_samples})"
  f"After over-sampling, the number of samples ({n_samples})"


In [55]:
unique, counts = np.unique(y_res, return_counts=True)
dict(zip(unique, counts))

{1: 3500, 2: 3500, 3: 3500, 4: 3500, 5: 3500, 6: 3500, 7: 3500, 8: 3500}

In [56]:
X_res.shape

(28000, 32)

### Decision Trees

In [58]:
model_dec_tree = DecisionTreeClassifier()

In [59]:
model_dec_tree.fit(X_res, y_res)

DecisionTreeClassifier()

In [60]:
y_pred_dec_tree = model_dec_tree.predict(test_data_processed)

In [61]:
print(classification_report(Y_test, y_pred_dec_tree))

              precision    recall  f1-score   support

           1       0.73      0.64      0.68       183
           2       0.26      0.29      0.27        68
           3       0.45      0.47      0.46        86
           4       0.19      0.19      0.19        37
           5       0.58      0.54      0.56        78
           6       0.22      0.22      0.22        37
           7       0.15      0.22      0.18        18
           8       0.32      0.50      0.39        18

    accuracy                           0.47       525
   macro avg       0.36      0.38      0.37       525
weighted avg       0.49      0.47      0.48       525



### Random Forests

In [62]:
model_ran_for = RandomForestClassifier()

In [63]:
model_ran_for.fit(X_res, y_res)

RandomForestClassifier()

In [64]:
y_pred_ran_for = model_ran_for.predict(test_data_processed)

In [65]:
print(classification_report(Y_test, y_pred_ran_for))

              precision    recall  f1-score   support

           1       0.73      0.72      0.72       183
           2       0.34      0.28      0.31        68
           3       0.44      0.53      0.48        86
           4       0.34      0.27      0.30        37
           5       0.65      0.71      0.68        78
           6       0.53      0.43      0.48        37
           7       0.15      0.11      0.13        18
           8       0.38      0.61      0.47        18

    accuracy                           0.55       525
   macro avg       0.45      0.46      0.45       525
weighted avg       0.55      0.55      0.55       525



### Extra Trees Classifier

In [66]:
model_ex_trees = ExtraTreesClassifier()

In [67]:
model_ex_trees.fit(X_res, y_res)

ExtraTreesClassifier()

In [68]:
y_pred_ex_trees = model_ex_trees.predict(test_data_processed)

In [69]:
print(classification_report(Y_test, y_pred_ex_trees))

              precision    recall  f1-score   support

           1       0.67      0.78      0.72       183
           2       0.39      0.22      0.28        68
           3       0.40      0.42      0.41        86
           4       0.28      0.22      0.24        37
           5       0.62      0.65      0.64        78
           6       0.41      0.35      0.38        37
           7       0.00      0.00      0.00        18
           8       0.39      0.61      0.48        18

    accuracy                           0.53       525
   macro avg       0.39      0.41      0.39       525
weighted avg       0.50      0.53      0.51       525



### Gradient Boosting

In [70]:
model_grad_boost = GradientBoostingClassifier()

In [71]:
model_grad_boost.fit(X_res, y_res)

GradientBoostingClassifier()

In [72]:
y_pred_grad_boost = model_grad_boost.predict(test_data_processed)

In [73]:
print(classification_report(Y_test, y_pred_grad_boost))

              precision    recall  f1-score   support

           1       0.77      0.72      0.74       183
           2       0.41      0.26      0.32        68
           3       0.41      0.52      0.46        86
           4       0.23      0.22      0.22        37
           5       0.66      0.63      0.64        78
           6       0.38      0.38      0.38        37
           7       0.26      0.28      0.27        18
           8       0.37      0.72      0.49        18

    accuracy                           0.54       525
   macro avg       0.44      0.47      0.44       525
weighted avg       0.55      0.54      0.54       525



### Hist Gradient Boosting Classifier

In [74]:
model_hist_gb = HistGradientBoostingClassifier()

In [75]:
model_hist_gb.fit(X_res, y_res)

HistGradientBoostingClassifier()

In [76]:
y_pred_his_gb = model_hist_gb.predict(test_data_processed)

In [77]:
print(classification_report(Y_test, y_pred_his_gb))

              precision    recall  f1-score   support

           1       0.70      0.75      0.72       183
           2       0.37      0.21      0.26        68
           3       0.43      0.49      0.46        86
           4       0.25      0.22      0.23        37
           5       0.66      0.67      0.66        78
           6       0.41      0.38      0.39        37
           7       0.21      0.17      0.19        18
           8       0.39      0.72      0.51        18

    accuracy                           0.54       525
   macro avg       0.43      0.45      0.43       525
weighted avg       0.53      0.54      0.53       525



### XG Boost

In [78]:
model = XGBClassifier(label_encoder = False)

In [79]:
model.fit(X_res, y_res)



Parameters: { label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              label_encoder=False, learning_rate=0.300000012, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [80]:
y_pred = model.predict(test_data_processed)

In [81]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           1       0.73      0.76      0.74       183
           2       0.33      0.21      0.25        68
           3       0.44      0.55      0.48        86
           4       0.33      0.27      0.30        37
           5       0.72      0.68      0.70        78
           6       0.37      0.38      0.37        37
           7       0.24      0.22      0.23        18
           8       0.46      0.61      0.52        18

    accuracy                           0.56       525
   macro avg       0.45      0.46      0.45       525
weighted avg       0.55      0.56      0.55       525



## Attempting Stacking Classification

In [None]:
estimators = [('rf', RandomForestClassifier()),
              ('xgb', XGBClassifier(label_encoder = False)),
              ('hist_tree', HistGradientBoostingClassifier())
             ]

In [None]:
clf = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())

In [None]:
clf.fit(X_res, y_res)

In [None]:
y_pred_clf = clf.predict(test_data_processed)

In [None]:
print(classification_report(Y_test, y_pred_clf))

## Submission

In [None]:
y_submission = model_ran_for.predict(submission_data_processed)

In [None]:
y_submission

In [None]:
unique, counts = np.unique(y_submission, return_counts=True)
dict(zip(unique, counts))

In [None]:
y_submissions_series = pd.Series(y_submission)

In [None]:
submission_data = {'primary_identifier' : pd.to_numeric(prim_id),
                 'next_month_plan' : pd.to_numeric(y_submissions_series)}

submission_df = pd.DataFrame(submission_data)

In [None]:
submission_df.shape

In [None]:
#submission_df.to_csv("submission_7.csv")

In [None]:
param_dist = {'n_estimators': [50, 100, 300, 500, 700, 900],
              'criterion': ['gini', 'entropy'],
              'max_depth': [1, 3, 4, 5, 6, 8, 10, 12, None],
              'max_features': ['auto', 'sqrt', 'log2', 15, 20, 25],
              'class_weight': ['balanced', 'balanced_subsample']
             }

In [None]:
param_dist = {'max_depth': [16, 20],
              'n_estimators': [400, 500, 600],
              'max_features': ['log2', 15, 20]
             }

In [None]:
clf_rf = RandomForestClassifier()
clf = GridSearchCV(clf_rf, 
                   param_dist,
                   cv = 5,
                   verbose = 3, 
                   n_jobs = -1)

In [None]:
clf.fit(X_res, y_res)

In [None]:
clf.best_params_

In [None]:
clf.best_score_

In [63]:
model_ran_for_new = RandomForestClassifier(random_state = 0, max_depth = 10, 
                                           n_estimators = 400, max_features=31)

In [64]:
model_ran_for_new.fit(X_res, y_res)

RandomForestClassifier(max_depth=10, max_features=31, n_estimators=400,
                       random_state=0)

In [65]:
y_pred_ran_for_new = model_ran_for_new.predict(test_data_processed)

In [66]:
print(classification_report(Y_test, y_pred_ran_for_new))

              precision    recall  f1-score   support

           1       0.82      0.66      0.73       183
           2       0.39      0.38      0.39        68
           3       0.48      0.49      0.49        86
           4       0.34      0.54      0.42        37
           5       0.72      0.67      0.69        78
           6       0.43      0.41      0.42        37
           7       0.24      0.28      0.26        18
           8       0.37      0.78      0.50        18

    accuracy                           0.56       525
   macro avg       0.47      0.52      0.49       525
weighted avg       0.60      0.56      0.57       525



In [130]:
model_ran_for_new_2 = RandomForestClassifier(random_state = 0, max_depth = 10, 
                                           n_estimators = 450, max_features=31)

In [131]:
model_ran_for_new_2.fit(X_res, y_res)

RandomForestClassifier(max_depth=10, max_features=31, n_estimators=450,
                       random_state=0)

In [132]:
y_pred_ran_for_new_2 = model_ran_for_new_2.predict(test_data_processed)

In [133]:
print(classification_report(Y_test, y_pred_ran_for_new_2))

              precision    recall  f1-score   support

           1       0.82      0.65      0.72       183
           2       0.39      0.38      0.39        68
           3       0.48      0.48      0.48        86
           4       0.33      0.54      0.41        37
           5       0.72      0.67      0.69        78
           6       0.43      0.41      0.42        37
           7       0.23      0.28      0.25        18
           8       0.37      0.78      0.50        18

    accuracy                           0.56       525
   macro avg       0.47      0.52      0.48       525
weighted avg       0.60      0.56      0.57       525



In [135]:
y_submission = model_ran_for_new_2.predict(submission_data_processed)

In [136]:
unique, counts = np.unique(y_submission, return_counts=True)
dict(zip(unique, counts))

{1: 1252, 2: 605, 3: 655, 4: 574, 5: 627, 6: 331, 7: 192, 8: 264}

In [142]:
y_submissions_series = pd.Series(y_submission)

In [143]:
submission_data = {'primary_identifier' : pd.to_numeric(prim_id),
                 'next_month_plan' : pd.to_numeric(y_submissions_series)}

submission_df = pd.DataFrame(submission_data)

In [144]:
submission_df.shape

(4500, 2)

In [145]:
submission_df.to_csv("submission_11.csv")