In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import xgboost as xgb
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import TomekLinks
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier, Pool
#Import other necessary model libraries, for this example, using Logistic Regression

In [2]:
train_data = pd.read_csv("../data/train_dataset.csv")

In [3]:
train_data['next_month_plan'] = train_data['next_month_plan'].astype('category')

In [4]:
train_data['next_month_plan'].unique()

['PKG2', 'PKG1', 'PKG6', 'PKG4', 'PKG5', 'PKG3', 'PKG8', 'PKG7']
Categories (8, object): ['PKG2', 'PKG1', 'PKG6', 'PKG4', 'PKG5', 'PKG3', 'PKG8', 'PKG7']

In [5]:
labels_x = ['PKG0','PKG1', 'PKG2', 'PKG3', 'PKG4', 'PKG5', 'PKG6', 'PKG7', 'PKG8']

In [6]:
le = preprocessing.LabelEncoder()

In [7]:
le.fit(labels_x)

LabelEncoder()

In [8]:
train_data['encoded_class_labels'] = le.transform(train_data['next_month_plan'])

In [9]:
train_data['device_category'].unique()

array(['Smartphone', 'Basic', 'Feature phone', 'Pluggable card', 'Tablet',
       nan, 'Modem'], dtype=object)

In [10]:
train_data['device_type'] = train_data['device_type'].fillna(value = 'Unknown')
train_data['device_category'] = train_data['device_category'].fillna(value = 'Unknown')
train_data['gender'] = train_data['gender'].fillna(value = 'Unknown')
train_data['age_group'] = train_data['age_group'].fillna(value = 'Unknown')

In [11]:
train_data = train_data.drop(columns = ['next_month_plan'])

In [12]:
train_data['dusage_avg'] = train_data['dusage_avg'].fillna(value = 0)
train_data['vusage_offnet_avg'] = train_data['vusage_offnet_avg'].fillna(value = 0)
train_data['add_on_tot_rental'] = train_data['add_on_tot_rental'].fillna(value = -1)
train_data['add_on_count'] = train_data['add_on_count'].fillna(value = 0)
train_data['vusage_onnet_avg'] = train_data['vusage_onnet_avg'].fillna(value = 0)

In [13]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   primary_identifier           10500 non-null  int64  
 1   device_type                  10500 non-null  object 
 2   device_category              10500 non-null  object 
 3   gender                       10500 non-null  object 
 4   district_name                10500 non-null  object 
 5   age_group                    10500 non-null  object 
 6   network_stay                 10500 non-null  int64  
 7   average_monthly_bill_amount  10500 non-null  float64
 8   dusage_sum                   10500 non-null  float64
 9   dusage_min                   10500 non-null  float64
 10  dusage_max                   10500 non-null  float64
 11  dusage_avg                   10500 non-null  float64
 12  dusage_days                  10500 non-null  float64
 13  dusage_stddev   

In [14]:
def get_col_types(dataframe):
    data_cat_cols = []
    data_quan_cols = []
    
    for col in np.array(dataframe.columns):
        if dataframe[col].dtype == 'int64' or dataframe[col].dtype == 'float64':
            data_quan_cols.append(col)
        elif dataframe[col].dtype == 'O':
            data_cat_cols.append(col)
            
    return data_cat_cols, data_quan_cols

In [15]:
X = train_data.drop(columns=['encoded_class_labels', 'primary_identifier'])

In [16]:
data_cat_cols, data_quan_cols = get_col_types(X)

In [17]:
y = train_data['encoded_class_labels']

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=10, test_size = 0.05, stratify = y)

In [19]:
quan_pipeline = Pipeline([
    ('std_scaler', RobustScaler())
])

quan_transformed = quan_pipeline.fit_transform(X_train[data_quan_cols])

In [20]:
data_pipeline = ColumnTransformer([
    ('numerical', quan_pipeline, data_quan_cols),
    ('categorical', OrdinalEncoder(), data_cat_cols),
    
])

train_data_processed = data_pipeline.fit_transform(X_train)

In [21]:
test_data_processed = data_pipeline.transform(X_test)

## Submission Data

In [60]:
submission_data = pd.read_csv("../data/test_dataset_new.csv")

In [61]:
submission_data['device_type'] = submission_data['device_type'].fillna(value = 'Unknown')
submission_data['device_category'] = submission_data['device_category'].fillna(value = 'Unknown')
submission_data['gender'] = submission_data['gender'].fillna(value = 'Unknown')
submission_data['age_group'] = submission_data['age_group'].fillna(value = 'Unknown')

In [62]:
submission_data['dusage_avg'] = submission_data['dusage_avg'].fillna(value = 0)
submission_data['vusage_offnet_avg'] = submission_data['vusage_offnet_avg'].fillna(value = 0)
submission_data['add_on_tot_rental'] = submission_data['add_on_tot_rental'].fillna(value = -1)
submission_data['add_on_count'] = submission_data['add_on_count'].fillna(value = 0)
submission_data['vusage_onnet_avg'] = submission_data['vusage_onnet_avg'].fillna(value = 0)

In [63]:
prim_id = submission_data['primary_identifier']

In [64]:
submission_X = submission_data.drop(columns=['primary_identifier'])

In [65]:
submission_data_processed = data_pipeline.transform(submission_X)

In [46]:
model = XGBClassifier(label_encoder = False, n_estimators = 900, max_depth = 20)

In [47]:
model.fit(train_data_processed, Y_train)



Parameters: { label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              label_encoder=False, learning_rate=0.300000012, max_delta_step=0,
              max_depth=20, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=900, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [48]:
y_pred = model.predict(test_data_processed)

In [49]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           1       0.71      0.82      0.76       183
           2       0.38      0.21      0.27        68
           3       0.47      0.53      0.50        86
           4       0.27      0.22      0.24        37
           5       0.64      0.68      0.66        78
           6       0.34      0.32      0.33        37
           7       0.14      0.11      0.12        18
           8       0.44      0.44      0.44        18

    accuracy                           0.56       525
   macro avg       0.42      0.42      0.42       525
weighted avg       0.53      0.56      0.54       525



In [152]:
model_rf = RandomForestClassifier(random_state = 0,class_weight='balanced', max_depth = 11, max_features = 29)

In [153]:
model_rf.fit(train_data_processed, Y_train)

RandomForestClassifier(class_weight='balanced', max_depth=11, max_features=29,
                       random_state=0)

In [154]:
y_pred_rf = model_rf.predict(test_data_processed)

In [155]:
print(classification_report(y_pred, y_pred_rf))

              precision    recall  f1-score   support

           1       0.98      0.72      0.83       211
           2       0.43      0.59      0.50        37
           3       0.62      0.68      0.65        97
           4       0.34      0.63      0.44        30
           5       0.74      0.73      0.74        83
           6       0.51      0.51      0.51        35
           7       0.44      0.29      0.35        14
           8       0.52      0.89      0.65        18

    accuracy                           0.68       525
   macro avg       0.57      0.63      0.58       525
weighted avg       0.74      0.68      0.70       525



In [156]:
y_submission = model_rf.predict(submission_data_processed)

In [157]:
unique, counts = np.unique(y_submission, return_counts=True)
dict(zip(unique, counts))

{1: 1305, 2: 544, 3: 769, 4: 520, 5: 715, 6: 316, 7: 123, 8: 208}

In [None]:
f1_score()