## Import the needed libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import xgboost as xgb
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import TomekLinks
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
#Import other necessary model libraries, for this example, using Logistic Regression

## Import the training data

In [2]:
train_data = pd.read_csv("../data/updated_train_19.csv")

In [3]:
train_data.tail()

Unnamed: 0,primary_identifier,device_type,device_category,gender,district_name,age_group,network_stay,average_monthly_bill_amount,dusage_sum,dusage_min,...,vusage_max,vusage_min,vusage_diff,vusage_days,vusage_avg,number_of_fixed_bb_accounts,number_of_iptv_accounts,add_on_tot_rental,add_on_count,encoded_class_labels
10495,1438655,4G,Smartphone,MALE,Kalutara,40-50,64,2088.13,479,0.346959,...,43.433333,0,43.433333,19,41.090873,0,0,-1,0,6
10496,7827264,4G,Smartphone,FEMALE,Colombo,<20,6,2053.5,23168,0.219985,...,25.366667,0,25.366667,3,10.861111,0,0,580,3,1
10497,1433957,4G,Smartphone,FEMALE,Colombo,30-40,69,2827.33,4553,4.831711,...,0.0,0,0.0,0,0.0,0,1,-1,0,6
10498,8494507,4G,Smartphone,MALE,Kandy,60-70,233,645.43,2069,4.722031,...,18.65,0,18.65,18,5.887963,0,1,-1,0,1
10499,3569439,3G,Smartphone,MALE,Puttalam,40-50,33,498.2,536,0.003048,...,4.366667,0,4.366667,7,2.228571,0,0,-1,0,1


## Encoding Labels

In [4]:
train_data['next_month_plan'] = train_data['next_month_plan'].astype('category')

KeyError: 'next_month_plan'

In [5]:
train_data['next_month_plan'].unique()

KeyError: 'next_month_plan'

In [None]:
labels_x = ['PKG0','PKG1', 'PKG2', 'PKG3', 'PKG4', 'PKG5', 'PKG6', 'PKG7', 'PKG8']

In [None]:
le = preprocessing.LabelEncoder()

In [None]:
le.fit(labels_x)

In [None]:
le.classes_

In [None]:
train_data['encoded_class_labels'] = le.transform(train_data['next_month_plan'])

In [None]:
train_data.head(20)

## Handling Missing Values

In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   primary_identifier           10500 non-null  int64  
 1   device_type                  10500 non-null  object 
 2   device_category              10500 non-null  object 
 3   gender                       10500 non-null  object 
 4   district_name                10500 non-null  object 
 5   age_group                    10500 non-null  object 
 6   network_stay                 10500 non-null  int64  
 7   average_monthly_bill_amount  10500 non-null  float64
 8   dusage_sum                   10500 non-null  int64  
 9   dusage_min                   10500 non-null  float64
 10  dusage_max                   10500 non-null  float64
 11  dusage_diff                  10500 non-null  float64
 12  dusage_avg                   10500 non-null  float64
 13  dusage_days     

In [None]:
train_data['device_category'].unique()

In [None]:
train_data['device_type'] = train_data['device_type'].fillna(value = 'Unknown')
train_data['device_category'] = train_data['device_category'].fillna(value = 'Unknown')
train_data['gender'] = train_data['gender'].fillna(value = 'Unknown')
train_data['age_group'] = train_data['age_group'].fillna(value = 'Unknown')

In [None]:
train_data = train_data.drop(columns = ['next_month_plan'])

In [None]:
train_data['dusage_avg'] = train_data['dusage_avg'].fillna(value = 0)
train_data['vusage_offnet_avg'] = train_data['vusage_offnet_avg'].fillna(value = 0)
train_data['add_on_tot_rental'] = train_data['add_on_tot_rental'].fillna(value = -1)
train_data['add_on_count'] = train_data['add_on_count'].fillna(value = 0)
train_data['vusage_onnet_avg'] = train_data['vusage_onnet_avg'].fillna(value = 0)

In [None]:
train_data.info()

## Creating pre-processing pipeline

In [7]:
train_data.columns

Index(['primary_identifier', 'device_type', 'device_category', 'gender',
       'district_name', 'age_group', 'network_stay',
       'average_monthly_bill_amount', 'dusage_sum', 'dusage_min', 'dusage_max',
       'dusage_diff', 'dusage_avg', 'dusage_days', 'dusage_stddev',
       'vusage_onnet_sum', 'vusage_onnet_max', 'vusage_onnet_min',
       'vusage_onnet_diff', 'vusage_onnet_days', 'vusage_onnet_avg',
       'vusage_onnet_stddev', 'vusage_offnet_sum', 'vusage_offnet_max',
       'vusage_offnet_min', 'vusage_offnet_diff', 'vusage_offnet_days',
       'vusage_offnet_avg', 'vusage_offnet_stddev', 'vusage_sum', 'vusage_max',
       'vusage_min', 'vusage_diff', 'vusage_days', 'vusage_avg',
       'number_of_fixed_bb_accounts', 'number_of_iptv_accounts',
       'add_on_tot_rental', 'add_on_count', 'encoded_class_labels'],
      dtype='object')

In [8]:
def get_col_types(dataframe):
    data_cat_cols = []
    data_quan_cols = []
    
    for col in np.array(dataframe.columns):
        if dataframe[col].dtype == 'int64' or dataframe[col].dtype == 'float64':
            data_quan_cols.append(col)
        elif dataframe[col].dtype == 'O':
            data_cat_cols.append(col)
            
    return data_cat_cols, data_quan_cols

In [9]:
X = train_data.drop(columns=['encoded_class_labels', 'primary_identifier'])

In [10]:
data_cat_cols, data_quan_cols = get_col_types(X)

In [11]:
data_cat_cols

['device_type', 'device_category', 'gender', 'district_name', 'age_group']

In [12]:
data_quan_cols

['network_stay',
 'average_monthly_bill_amount',
 'dusage_sum',
 'dusage_min',
 'dusage_max',
 'dusage_diff',
 'dusage_avg',
 'dusage_days',
 'dusage_stddev',
 'vusage_onnet_sum',
 'vusage_onnet_max',
 'vusage_onnet_min',
 'vusage_onnet_diff',
 'vusage_onnet_days',
 'vusage_onnet_avg',
 'vusage_onnet_stddev',
 'vusage_offnet_sum',
 'vusage_offnet_max',
 'vusage_offnet_min',
 'vusage_offnet_diff',
 'vusage_offnet_days',
 'vusage_offnet_avg',
 'vusage_offnet_stddev',
 'vusage_sum',
 'vusage_max',
 'vusage_min',
 'vusage_diff',
 'vusage_days',
 'vusage_avg',
 'number_of_fixed_bb_accounts',
 'number_of_iptv_accounts',
 'add_on_tot_rental',
 'add_on_count']

In [13]:
y = train_data['encoded_class_labels']

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=10, test_size = 0.05, stratify = y)

In [15]:
X_train.head(20)

Unnamed: 0,device_type,device_category,gender,district_name,age_group,network_stay,average_monthly_bill_amount,dusage_sum,dusage_min,dusage_max,...,vusage_sum,vusage_max,vusage_min,vusage_diff,vusage_days,vusage_avg,number_of_fixed_bb_accounts,number_of_iptv_accounts,add_on_tot_rental,add_on_count
6155,4G,Smartphone,MALE,Colombo,60-70,39,831.92,34164,425.700622,1789.25268,...,5,3.6,0,3.6,5,1.103333,1,2,360,2
2288,4G,Smartphone,MALE,Kandy,20-30,93,2677.35,11984,2.27036,1055.490074,...,0,0.0,0,0.0,0,0.0,0,0,-1,0
9714,4G,Smartphone,MALE,Colombo,40-50,92,2627.77,19830,114.69436,1129.623393,...,0,0.0,0,0.0,0,0.0,0,0,-1,0
7157,4G,Smartphone,MALE,Colombo,40-50,158,4795.76,8498,13.75556,725.783259,...,114,17.05,0,17.05,22,5.197727,0,0,360,2
738,4G,Smartphone,MALE,Colombo,50-60,51,3978.4,36917,660.364806,2735.215541,...,0,0.0,0,0.0,0,0.0,0,0,1500,1
490,4G,Smartphone,FEMALE,Colombo,40-50,133,2582.98,6820,16.4799,766.500066,...,330,66.483333,0,66.483333,18,18.367593,0,1,650,1
467,4G,Smartphone,MALE,Kalutara,40-50,9,906.76,7652,0.043341,958.006304,...,104,17.7,0,17.7,27,3.873457,0,0,110,2
6161,4G,Smartphone,MALE,Colombo,30-40,32,2148.93,10470,24.44814,1649.124297,...,376,43.866667,0,43.866667,34,18.872667,0,0,-1,0
1343,4G,Smartphone,MALE,Gampaha,40-50,10,2759.05,15901,102.143398,1233.143407,...,111,20.95,0,20.95,19,12.399242,0,0,-1,0
5792,4G,Smartphone,MALE,Colombo,50-60,198,1069.44,13193,52.539788,1733.79877,...,31,13.333333,0,13.333333,8,3.879167,1,0,600,2


In [16]:
X_train['age_group'].unique()

array(['60-70', '20-30', '40-50', '50-60', '30-40', '>70', 'Unknown',
       '<20'], dtype=object)

In [17]:
X_train.shape

(9975, 38)

In [18]:
quan_pipeline = Pipeline([
    ('std_scaler', RobustScaler())
])

quan_transformed = quan_pipeline.fit_transform(X_train[data_quan_cols])

In [19]:
data_pipeline = ColumnTransformer([
    ('numerical', quan_pipeline, data_quan_cols),
    ('categorical', OrdinalEncoder(), data_cat_cols),
    
])

train_data_processed = data_pipeline.fit_transform(X_train)

In [20]:
train_data_processed[1]

array([ 9.44881890e-02,  5.36438249e-01,  2.59987477e-01, -2.46300869e-03,
        5.14737330e-02,  9.18799625e-02,  2.02782339e-01,  0.00000000e+00,
        1.45573065e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -2.00000000e-01, -3.26498423e-01,  0.00000000e+00, -3.26627219e-01,
       -3.75000000e-01, -4.32542308e-01, -3.23931364e-01, -2.15686275e-01,
       -3.37454060e-01,  0.00000000e+00, -3.37454060e-01, -3.52941176e-01,
       -3.93286152e-01,  0.00000000e+00,  0.00000000e+00, -4.45676275e-01,
       -5.00000000e-01,  2.00000000e+00,  4.00000000e+00,  1.00000000e+00,
        1.00000000e+01,  0.00000000e+00])

In [21]:
X_test.head()

Unnamed: 0,device_type,device_category,gender,district_name,age_group,network_stay,average_monthly_bill_amount,dusage_sum,dusage_min,dusage_max,...,vusage_sum,vusage_max,vusage_min,vusage_diff,vusage_days,vusage_avg,number_of_fixed_bb_accounts,number_of_iptv_accounts,add_on_tot_rental,add_on_count
231,4G,Smartphone,FEMALE,Colombo,30-40,11,1716.27,12647,0.0,1875.519728,...,13,10.366667,0,10.366667,4,3.395833,0,0,250,1
9966,4G,Smartphone,MALE,Ratnapura,20-30,9,985.32,1089,0.0,267.741177,...,30,5.85,0,5.85,11,2.775758,0,0,-1,0
486,4G,Smartphone,MALE,Gampaha,40-50,108,3518.42,22732,0.0,1351.566897,...,113,44.533333,0,44.533333,4,28.425,0,0,760,2
9686,4G,Smartphone,MALE,Colombo,30-40,34,2559.86,14666,96.47504,1160.861369,...,0,0.0,0,0.0,0,0.0,0,0,-1,0
6310,4G,Smartphone,FEMALE,Trincomalee,30-40,24,1802.86,31567,521.592238,1672.417785,...,52,24.683333,0,24.683333,12,4.397222,0,0,450,2


In [22]:
X_train.columns

Index(['device_type', 'device_category', 'gender', 'district_name',
       'age_group', 'network_stay', 'average_monthly_bill_amount',
       'dusage_sum', 'dusage_min', 'dusage_max', 'dusage_diff', 'dusage_avg',
       'dusage_days', 'dusage_stddev', 'vusage_onnet_sum', 'vusage_onnet_max',
       'vusage_onnet_min', 'vusage_onnet_diff', 'vusage_onnet_days',
       'vusage_onnet_avg', 'vusage_onnet_stddev', 'vusage_offnet_sum',
       'vusage_offnet_max', 'vusage_offnet_min', 'vusage_offnet_diff',
       'vusage_offnet_days', 'vusage_offnet_avg', 'vusage_offnet_stddev',
       'vusage_sum', 'vusage_max', 'vusage_min', 'vusage_diff', 'vusage_days',
       'vusage_avg', 'number_of_fixed_bb_accounts', 'number_of_iptv_accounts',
       'add_on_tot_rental', 'add_on_count'],
      dtype='object')

In [23]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 525 entries, 231 to 229
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   device_type                  525 non-null    object 
 1   device_category              525 non-null    object 
 2   gender                       525 non-null    object 
 3   district_name                525 non-null    object 
 4   age_group                    525 non-null    object 
 5   network_stay                 525 non-null    int64  
 6   average_monthly_bill_amount  525 non-null    float64
 7   dusage_sum                   525 non-null    int64  
 8   dusage_min                   525 non-null    float64
 9   dusage_max                   525 non-null    float64
 10  dusage_diff                  525 non-null    float64
 11  dusage_avg                   525 non-null    float64
 12  dusage_days                  525 non-null    int64  
 13  dusage_stddev     

In [24]:
test_data_processed = data_pipeline.transform(X_test)

In [25]:
test_data_processed[0]

array([-0.5511811 , -0.04037991,  0.30150282, -0.03786963,  0.60829292,
        0.69580789,  0.68906446, -2.2       ,  1.07049724,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.04705882,  0.16403786,  0.        ,  0.16410256,
       -0.125     ,  0.14406906,  0.06531578, -0.08823529,  0.07818243,
        0.        ,  0.07818243, -0.11764706,  0.0266005 ,  0.        ,
        0.        ,  0.11086475,  0.        ,  2.        ,  4.        ,
        0.        ,  4.        ,  1.        ])

### Loading Submission data

In [26]:
submission_data = pd.read_csv("../data/test_dataset_v2_19.csv")

In [27]:
submission_data.head()

Unnamed: 0,primary_identifier,device_type,device_category,gender,district_name,age_group,network_stay,average_monthly_bill_amount,dusage_sum,dusage_min,...,vusage_sum,vusage_max,vusage_min,vusage_diff,vusage_days,vusage_avg,number_of_fixed_bb_accounts,number_of_iptv_accounts,add_on_tot_rental,add_on_count
0,2003793,4G,Smartphone,MALE,Colombo,40-50,4,689.42,148,0.0,...,0,0.0,0,0.0,0,0.0,0,1,-1,0
1,1776101,4G,Smartphone,MALE,Colombo,40-50,7,799.93,300,0.180222,...,107,16.266667,0,16.266667,21,5.107143,0,0,-1,0
2,6945050,2G,Basic,MALE,Colombo,20-30,20,1538.45,8215,0.0,...,108,90.0,0,90.0,8,13.558333,0,0,110,2
3,2472049,4G,Smartphone,FEMALE,Matara,40-50,10,995.75,11613,0.0,...,0,0.416667,0,0.416667,2,0.333333,0,0,-1,0
4,7197266,4G,Smartphone,MALE,Kandy,30-40,154,553.61,26505,28.515345,...,0,0.0,0,0.0,0,0.0,0,0,350,1


## Handling missing values in submission data

In [None]:
submission_data.info()

In [None]:
submission_data['device_category'].unique()

In [None]:
submission_data['device_type'] = submission_data['device_type'].fillna(value = 'Unknown')
submission_data['device_category'] = submission_data['device_category'].fillna(value = 'Unknown')
submission_data['gender'] = submission_data['gender'].fillna(value = 'Unknown')
submission_data['age_group'] = submission_data['age_group'].fillna(value = 'Unknown')

In [None]:
submission_data['dusage_avg'] = submission_data['dusage_avg'].fillna(value = 0)
submission_data['vusage_offnet_avg'] = submission_data['vusage_offnet_avg'].fillna(value = 0)
submission_data['add_on_tot_rental'] = submission_data['add_on_tot_rental'].fillna(value = -1)
submission_data['add_on_count'] = submission_data['add_on_count'].fillna(value = 0)
submission_data['vusage_onnet_avg'] = submission_data['vusage_onnet_avg'].fillna(value = 0)

In [28]:
submission_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 39 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   primary_identifier           4500 non-null   int64  
 1   device_type                  4500 non-null   object 
 2   device_category              4500 non-null   object 
 3   gender                       4500 non-null   object 
 4   district_name                4500 non-null   object 
 5   age_group                    4500 non-null   object 
 6   network_stay                 4500 non-null   int64  
 7   average_monthly_bill_amount  4500 non-null   float64
 8   dusage_sum                   4500 non-null   int64  
 9   dusage_min                   4500 non-null   float64
 10  dusage_max                   4500 non-null   float64
 11  dusage_diff                  4500 non-null   float64
 12  dusage_avg                   4500 non-null   float64
 13  dusage_days       

In [29]:
prim_id = submission_data['primary_identifier']

In [30]:
submission_data.shape

(4500, 39)

In [31]:
submission_X = submission_data.drop(columns=['primary_identifier'])

In [32]:
submission_X.shape

(4500, 38)

## Pre-processing the submission data

In [33]:
submission_data_processed = data_pipeline.transform(submission_X)

In [34]:
submission_data_processed[0].shape

(38,)

In [35]:
submission_data_processed[0]

array([-0.60629921, -0.65667172, -0.48115216, -0.03786963, -0.56463354,
       -0.57283961, -0.4593038 , -5.2       , -0.57999961,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.2       , -0.32649842,  0.        , -0.32662722,
       -0.375     , -0.43254231, -0.32393136, -0.21568627, -0.33745406,
        0.        , -0.33745406, -0.35294118, -0.39328615,  0.        ,
        1.        , -0.44567627, -0.5       ,  2.        ,  4.        ,
        1.        ,  4.        ,  2.        ])

## Model Code

In [36]:
unique, counts = np.unique(Y_train, return_counts=True)
dict(zip(unique, counts))

{1: 3490, 2: 1286, 3: 1631, 4: 698, 5: 1480, 6: 703, 7: 346, 8: 341}

In [37]:
strategy = {
    1: 3490,
    2: 1250,
    3: 1600,
    4: 698,
    5: 1450,
    6: 700,
    7: 346,
    8: 341
}

In [38]:
undersample = NearMiss(sampling_strategy=strategy,n_neighbors=2)

In [39]:
X_near , y_near = undersample.fit_resample(train_data_processed,Y_train)

In [40]:
unique, counts = np.unique(y_near, return_counts=True)
dict(zip(unique, counts))

{1: 3490, 2: 1250, 3: 1600, 4: 698, 5: 1450, 6: 700, 7: 346, 8: 341}

In [41]:
os_count = 4000

In [42]:
strategy_os = {
    1: 3490,
    2: 1500,
    3: 1900,
    4: 900,
    5: 1800,
    6: 1000,
    7: 600,
    8: 600
}

In [43]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42, sampling_strategy = strategy_os)
X_res, y_res = sm.fit_resample(train_data_processed, Y_train)

In [44]:
unique, counts = np.unique(y_res, return_counts=True)
dict(zip(unique, counts))

{1: 3490, 2: 1500, 3: 1900, 4: 900, 5: 1800, 6: 1000, 7: 600, 8: 600}

In [45]:
X_res.shape

(11790, 38)

### Decision Trees

In [46]:
model_dec_tree = DecisionTreeClassifier(class_weight='balanced')

In [47]:
model_dec_tree.fit(X_res, y_res)

DecisionTreeClassifier(class_weight='balanced')

In [48]:
y_pred_dec_tree = model_dec_tree.predict(test_data_processed)

In [49]:
print(classification_report(Y_test, y_pred_dec_tree))

              precision    recall  f1-score   support

           1       0.67      0.64      0.66       183
           2       0.18      0.19      0.19        68
           3       0.35      0.34      0.35        86
           4       0.19      0.22      0.20        37
           5       0.62      0.54      0.58        78
           6       0.31      0.35      0.33        37
           7       0.18      0.17      0.17        18
           8       0.36      0.50      0.42        18

    accuracy                           0.45       525
   macro avg       0.36      0.37      0.36       525
weighted avg       0.46      0.45      0.45       525



### Random Forests

In [50]:
model_ran_for = RandomForestClassifier(class_weight='balanced')

In [51]:
model_ran_for.fit(X_res, y_res)

RandomForestClassifier(class_weight='balanced')

In [52]:
y_pred_ran_for = model_ran_for.predict(test_data_processed)

In [53]:
print(classification_report(Y_test, y_pred_ran_for))

              precision    recall  f1-score   support

           1       0.66      0.81      0.73       183
           2       0.53      0.15      0.23        68
           3       0.41      0.58      0.48        86
           4       0.40      0.11      0.17        37
           5       0.62      0.83      0.71        78
           6       0.48      0.27      0.34        37
           7       0.33      0.17      0.22        18
           8       0.50      0.44      0.47        18

    accuracy                           0.57       525
   macro avg       0.49      0.42      0.42       525
weighted avg       0.55      0.57      0.53       525



### Extra Trees Classifier

In [None]:
model_ex_trees = ExtraTreesClassifier(class_weight='balanced')

In [None]:
model_ex_trees.fit(X_res, y_res)

In [None]:
y_pred_ex_trees = model_ex_trees.predict(test_data_processed)

In [None]:
print(classification_report(Y_test, y_pred_ex_trees))

### Gradient Boosting

In [None]:
model_grad_boost = GradientBoostingClassifier()

In [None]:
model_grad_boost.fit(X_res, y_res)

In [None]:
y_pred_grad_boost = model_grad_boost.predict(test_data_processed)

In [None]:
print(classification_report(Y_test, y_pred_grad_boost))

### Hist Gradient Boosting Classifier

In [None]:
model_hist_gb = HistGradientBoostingClassifier()

In [None]:
model_hist_gb.fit(X_res, y_res)

In [None]:
y_pred_his_gb = model_hist_gb.predict(test_data_processed)

In [None]:
print(classification_report(Y_test, y_pred_his_gb))

### XG Boost

In [None]:
model = XGBClassifier(label_encoder = False, class_weights = 'balanced')

In [None]:
model.fit(X_res, y_res)

In [None]:
y_pred = model.predict(test_data_processed)

In [None]:
print(classification_report(Y_test, y_pred))

## Attempting Stacking Classification

In [None]:
estimators = [('rf', RandomForestClassifier(class_weight='balanced')),
              ('xgb', XGBClassifier(label_encoder = False, class_weight = 'balanced')),
              ('hist_tree', HistGradientBoostingClassifier())
             ]

In [None]:
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

In [None]:
clf.fit(X_res, y_res)

In [None]:
y_pred_clf = clf.predict(test_data_processed)

In [None]:
print(classification_report(Y_test, y_pred_clf))

## Submission

In [None]:
y_submission = model_ran_for.predict(submission_data_processed)

In [None]:
y_submission

In [None]:
unique, counts = np.unique(y_submission, return_counts=True)
dict(zip(unique, counts))

In [None]:
y_submissions_series = pd.Series(y_submission)

In [None]:
submission_data = {'primary_identifier' : pd.to_numeric(prim_id),
                 'next_month_plan' : pd.to_numeric(y_submissions_series)}

submission_df = pd.DataFrame(submission_data)

In [None]:
submission_df.shape

In [None]:
#submission_df.to_csv("submission_7.csv")

In [None]:
param_dist = {'n_estimators': [50, 100, 300, 500, 700, 900],
              'criterion': ['gini', 'entropy'],
              'max_depth': [1, 3, 4, 5, 6, 8, 10, 12, None],
              'max_features': ['auto', 'sqrt', 'log2', 15, 20, 25],
              'class_weight': ['balanced', 'balanced_subsample']
             }

In [None]:
param_dist = {'max_depth': [16, 20],
              'n_estimators': [400, 500, 600],
              'max_features': ['log2', 15, 20]
             }

In [None]:
clf_rf = RandomForestClassifier()
clf = GridSearchCV(clf_rf, 
                   param_dist,
                   cv = 5,
                   verbose = 3, 
                   n_jobs = -1)

In [None]:
clf.fit(X_res, y_res)

In [None]:
clf.best_params_

In [None]:
clf.best_score_

In [54]:
X_res.shape

(11790, 38)

In [55]:
model_ran_for_new = RandomForestClassifier(random_state = 0, max_depth = 10, 
                                           n_estimators = 400, max_features=38,
                                          class_weight='balanced')

In [56]:
model_ran_for_new.fit(X_res, y_res)

RandomForestClassifier(class_weight='balanced', max_depth=10, max_features=38,
                       n_estimators=400, random_state=0)

In [57]:
y_pred_ran_for_new = model_ran_for_new.predict(test_data_processed)

In [58]:
print(classification_report(Y_test, y_pred_ran_for_new))

              precision    recall  f1-score   support

           1       0.83      0.64      0.72       183
           2       0.41      0.38      0.40        68
           3       0.45      0.53      0.49        86
           4       0.31      0.46      0.37        37
           5       0.68      0.68      0.68        78
           6       0.37      0.38      0.37        37
           7       0.27      0.17      0.21        18
           8       0.43      0.83      0.57        18

    accuracy                           0.56       525
   macro avg       0.47      0.51      0.48       525
weighted avg       0.59      0.56      0.56       525



In [189]:
model_ran_for_new_2 = RandomForestClassifier(random_state = 0, max_depth = 9,
                                             n_estimators = 250, max_features=38,
                                             class_weight='balanced'
                                            )

In [183]:
model_ran_for_new_2 = RandomForestClassifier(random_state = 0, max_depth = 6,
                                             n_estimators = 250, max_features=38,
                                             class_weight='balanced'
                                            )

In [190]:
model_ran_for_new_2.fit(X_res, y_res)

RandomForestClassifier(class_weight='balanced', max_depth=9, max_features=38,
                       n_estimators=250, random_state=0)

In [191]:
y_pred_ran_for_new_2 = model_ran_for_new_2.predict(test_data_processed)

In [192]:
print(classification_report(Y_test, y_pred_ran_for_new_2))

              precision    recall  f1-score   support

           1       0.83      0.62      0.71       183
           2       0.39      0.38      0.39        68
           3       0.45      0.49      0.47        86
           4       0.32      0.57      0.41        37
           5       0.68      0.64      0.66        78
           6       0.38      0.41      0.39        37
           7       0.36      0.22      0.28        18
           8       0.38      0.83      0.53        18

    accuracy                           0.55       525
   macro avg       0.48      0.52      0.48       525
weighted avg       0.59      0.55      0.56       525



In [193]:
y_submission = model_ran_for_new_2.predict(submission_data_processed)

In [194]:
unique, counts = np.unique(y_submission, return_counts=True)
dict(zip(unique, counts))

{1: 1150, 2: 697, 3: 682, 4: 592, 5: 642, 6: 305, 7: 152, 8: 280}

In [195]:
y_submissions_series = pd.Series(y_submission)

In [196]:
submission_data = {'primary_identifier' : pd.to_numeric(prim_id),
                 'next_month_plan' : pd.to_numeric(y_submissions_series)}

submission_df = pd.DataFrame(submission_data)

In [197]:
submission_df.shape

(4500, 2)

In [198]:
submission_df.to_csv("submission_test_bigil.csv")

In [None]:
from sklearn.metrics import f1_score

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

In [None]:
param_grid = {"max_depth": [3, 5, 7, 10, 11, 12, 13],
             "max_features": [15, 20, 22, 25, 28, 30, 31, 32]}

In [None]:
search = HalvingGridSearchCV(RandomForestClassifier(random_state = 0),
                             param_grid, resource='n_estimators',max_resources=500,random_state=0,
                             n_jobs=-1, class_weights = 'balanced')

In [None]:
search.fit(X_res, y_res)

In [None]:
search.best_params_

In [None]:
search.best_score_

In [None]:
X_res.shape

In [None]:
model_xgb = XGBClassifier(random_state = 0, max_depth = 20, max_features = 38, n_estimators = 900,
                                  class_weight='balanced')

In [None]:
model_xgb.fit(X_res, y_res)

In [None]:
y_pred_xgb = model_xgb.predict(test_data_processed)

In [None]:
print(classification_report(Y_test, y_pred_xgb))

In [None]:
y_submission = model_xgb.predict(submission_data_processed)

In [None]:
unique, counts = np.unique(y_submission, return_counts=True)
dict(zip(unique, counts))

In [None]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

In [None]:
ovo = OneVsOneClassifier(RandomForestClassifier(random_state = 0, max_depth = 15, 
                                                n_estimators = 600, max_features=38,
                                               class_weight='balanced'), n_jobs = -1)

In [None]:
ovo.fit(X_res, y_res)

In [None]:
y_pred_ovo = ovo.predict(test_data_processed)

In [None]:
print(classification_report(Y_test, y_pred_ovo))

In [None]:
ovr = OneVsRestClassifier(RandomForestClassifier(random_state = 0, max_depth = 15, 
                                                 n_estimators = 600, max_features=38,
                                       class_weight='balanced'), n_jobs = -1)

In [None]:
ovr.fit(X_res, y_res)

In [None]:
y_pred_ovr = ovr.predict(test_data_processed)

In [None]:
print(classification_report(Y_test, y_pred_ovr))

In [None]:
y_submission = ovr.predict(submission_data_processed)

In [None]:
unique, counts = np.unique(y_submission, return_counts=True)
dict(zip(unique, counts))