In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
from random import randint, randrange
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from mlxtend.classifier import StackingClassifier
from sklearn.feature_selection import RFE, RFECV
from mlxtend.feature_selection import ExhaustiveFeatureSelector
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import mlxtend
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity='all'

In [None]:
# Show the current working directory; the dataset and output paths used later are resolved against it
os.getcwd()

In [None]:
# Input files are read from ./dataset and submissions are written to ./Submissions,
# both resolved against the current working directory.
dataset_path = os.path.join(os.getcwd(),'dataset')
# output_folder = '/content/drive/MyDrive/ML Challenge Datasets/Output Folder'
result_path = os.path.join(os.getcwd(),'Submissions')
# NOTE(review): os.listdir returns entries in arbitrary order and the loading cell
# selects files by position — check the order printed here before relying on it.
files = os.listdir(dataset_path)
print(files)

In [None]:
# NOTE(review): files[1] is taken as the training set and files[0] as the test set.
# This depends on the os.listdir ordering, which is not guaranteed — verify against the printed file list.
train = pd.read_csv(os.path.join(dataset_path,files[1]),low_memory=False)
test_data = pd.read_csv(os.path.join(dataset_path,files[0]),low_memory=False)

In [None]:
# Data Cleaning
# Collapse all `issue*` columns into two features: the number of non-null issue
# entries per row (Issue_Count) and a binary flag for "at least one issue" (Issue_Y_N).
# This description is a comment rather than a bare string literal, because a string
# expression would be echoed as cell output with ast_node_interactivity set to 'all'.
issue_cols = [col for col in train if col.startswith('issue')]
# DataFrame.count(axis=1) counts non-null values per row without a Python-level row loop
# and yields zeros (instead of failing) when no issue columns exist.
train['Issue_Count'] = train[issue_cols].count(axis=1)
train['Issue_Y_N'] = np.where(train['Issue_Count'] == 0, 0, 1)


In [None]:
# Apply the same on Test Data
# `issue_cols` comes from the training frame so both sets get the same two features.
# DataFrame.count(axis=1) counts non-null values per row without a Python-level row loop
# and yields zeros (instead of failing) when no issue columns exist.
test_data['Issue_Count'] = test_data[issue_cols].count(axis=1)
test_data['Issue_Y_N'] = np.where(test_data['Issue_Count'] == 0, 0, 1)


In [None]:
# Missing Value Treatment
# Keep only columns whose non-null count reaches 70% of the training rows
# (dropna's `thresh` is the minimum number of non-NA values required to keep a column).
limit_per = len(train)*0.70
train_clean = train.dropna(thresh=limit_per,axis=1)

In [None]:
# Drop every remaining row that still contains a missing value, then show how many rows are left
train_clean = train_clean.dropna()
len(train_clean)

In [None]:
# Remove Columns with Zero Variance
# A column holding a single distinct value cannot help the model; collect all such
# columns first and drop them in one call.
constant_cols = [name for name in train_clean.columns if len(train_clean[name].unique()) == 1]
train_clean_col = train_clean.drop(constant_cols, axis=1)

In [None]:
# Removing Other Redundant Features
# Columns dropped by name before modelling
feat_remove = [
    'appno',
    'country.alpha2',
    'docname',
    'ecli',
    'itemid',
    'originatingbody',
    'sharepointid',
    'parties.1',
    'judgementdate',
    'kpdate',
    'respondentOrderEng',
    'parties.0',
]
train_cleaned = train_clean_col.drop(columns=feat_remove)

In [None]:
# Columns to remove from Test set
# Every column present in the raw training frame but absent from the cleaned one.
# This has to run before the cleaned frame's column names are rewritten, otherwise
# the renamed columns would show up in the difference as well.
cols_to_remove = list(set(train.columns) - set(train_cleaned.columns))

In [None]:
# Replace (-,=,.) with (_) in column names
# regex=False makes each pattern a literal character. Without it the behaviour depends on
# the pandas version, and '.' as a regular expression matches every character.
train_cleaned.columns = (
    train_cleaned.columns
    .str.replace('=', '_', regex=False)
    .str.replace('-', '_', regex=False)
    .str.replace('.', '_', regex=False)
)

In [None]:
# Find CCL columns replace (-1) with (2) and one-hot encode all columns that need to be encoded
ccl_cols = [col for col in train_cleaned if col.startswith('ccl')]
train_cleaned[ccl_cols] = train_cleaned[ccl_cols].replace([-1],[2])
# Columns to encode: the ccl columns, every object-typed column and `typedescription`.
# A column can qualify more than once (e.g. an object-typed ccl column, or an object-typed
# `typedescription`); dict.fromkeys removes such repeats while keeping the original order,
# so no column is one-hot encoded twice.
feature_encode_cols = list(dict.fromkeys(
    ccl_cols
    + train_cleaned.select_dtypes(include=['object']).columns.tolist()
    + ['typedescription']
))

In [None]:
# OHE
# Dense output is required because the encoded array is wrapped in a DataFrame afterwards.
# The keyword for this was renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and
# the old name was later removed, so try the new name first and fall back for older releases.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
# Categories are learned from the training data only; unseen categories at transform
# time are encoded as all zeros because of handle_unknown='ignore'.
array_hot_encoded = ohe.fit_transform(train_cleaned[feature_encode_cols])

In [None]:
# All columns after encoding
# ohe.categories_ holds one array of categories per encoded feature, in the same order as
# feature_encode_cols; each output column is named "<feature>_<category>".
column_names = []
for feature, categories in zip(feature_encode_cols, ohe.categories_):
    encoded_names = [feature + '_' + str(category) for category in categories]
    print(encoded_names)
    column_names.extend(encoded_names)


In [None]:
# Wrap the encoded array in an integer frame that shares the training index, so it can be
# placed side by side with the columns that were not encoded.
data_hot_encoded = pd.DataFrame(
    array_hot_encoded,
    index=train_cleaned.index,
    columns=column_names,
    dtype=np.int64,
)
data_others = train_cleaned.drop(columns=feature_encode_cols)
train_final = pd.concat([data_hot_encoded, data_others], axis=1)

In [None]:
# Apply same on test
# Drop the columns that were removed from the training frame, then rewrite the names.
test_data_1 = test_data.drop(cols_to_remove, axis=1)
# regex=False makes each pattern a literal character. Without it the behaviour depends on
# the pandas version, and '.' as a regular expression matches every character.
test_data_1.columns = (
    test_data_1.columns
    .str.replace('=', '_', regex=False)
    .str.replace('-', '_', regex=False)
    .str.replace('.', '_', regex=False)
)

In [None]:
# Mirror the training transformation on the test frame: recode -1 as 2 in the ccl columns,
# then encode with the encoder that was already fitted on the training data.
test_data_1[ccl_cols] = test_data_1[ccl_cols].replace([-1], [2])
array_hot_encoded_test = ohe.transform(test_data_1[feature_encode_cols])
data_hot_encoded_test = pd.DataFrame(
    array_hot_encoded_test,
    index=test_data_1.index,
    columns=column_names,
    dtype=np.int64,
)
data_others_test = test_data_1.drop(columns=feature_encode_cols)

In [None]:
# Encoded columns first, untouched columns after — same layout as the training frame
test_final = pd.concat([data_hot_encoded_test, data_others_test], axis='columns')

In [None]:
# Compare the number of columns in the final train and test frames
print(train_final.shape[1])
print(test_final.shape[1])

In [None]:
# Splitting Train and Test

## Define X, y
# `importance` is the target; every other column of train_final is a feature
y = train_final['importance'].values
X = train_final.drop(columns=['importance']).values

# Split Train Test
# Hold out 10% of the rows, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    stratify=y,
    random_state=42,
)

# Test Submission Matrix
test_matrix = test_final.values

In [None]:
# Class Weight for Train Split and Full Training Set
from sklearn.utils import class_weight


def _balanced_weights(labels):
    """Compute 'balanced' class weights and the matching per-sample weights.

    Parameters
    ----------
    labels : array-like, 1-D
        Class label of every sample.

    Returns
    -------
    per_class : list of float
        One weight per distinct label, ordered like ``np.unique(labels)``.
    per_sample : numpy.ndarray of float
        The weight of each sample's class, aligned with ``labels``.
    """
    labels = np.asarray(labels)
    # return_inverse gives, for every sample, the position of its class in `classes`,
    # so the lookup does not assume the labels are exactly 1..4.
    classes, class_index = np.unique(labels, return_inverse=True)
    # Keyword arguments only: passing `classes` both positionally and by keyword (as the
    # previous call did) raises a TypeError.
    per_class = class_weight.compute_class_weight(
        class_weight='balanced', classes=classes, y=labels
    )
    return list(per_class), per_class[class_index].astype('float')


## Class Weight for Training Split
class_weights_train, w_array = _balanced_weights(y_train)

## Class Weight for Full Training Set
class_weights_ft, w_array_ft = _balanced_weights(y)



In [None]:
# Grid XGBoost
# Base multi-class XGBoost classifier; its hyperparameters are tuned by the randomized search that follows.
# NOTE(review): the class-weight cell treats the labels as 1..4; some xgboost releases require
# class labels to start at 0 — confirm against the installed version.
model = xgb.XGBClassifier(random_state=123,objective='multi:softmax')

In [None]:
# Candidate values for each hyperparameter; the search samples combinations from these
param_grid = dict(
    learning_rate=[0.1, 0.01, 0.5, 0.05],
    colsample_bytree=[0.2, 0.4, 0.6, 0.8, 1],
    subsample=np.linspace(0.4, 1, num=5),
    max_depth=[10, 15, 20, 25],
    n_estimators=[100, 200, 300, 400, 500],
    reg_lambda=np.linspace(1, 2, num=5),
    gamma=np.linspace(0, 0.5, num=5),
)

# Shuffled, stratified cross-validation with a fixed seed
num_folds = 10
n_iter = 50
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=123)

# Evaluate n_iter sampled combinations on all available cores; no `scoring` is given,
# so the estimator's default score is used.
grid = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=n_iter,
    cv=kfold,
    random_state=123,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Training w/o sample weights
# The search only sees the training split; the held-out split is used for the accuracy check afterwards
grid.fit(X_train,y_train)

In [None]:
# Model with the best cross-validated score found by the search.
# This is the same object held by `grid`, so fitting it again later replaces its trained state.
best_estimator = grid.best_estimator_

In [None]:
# Compare accuracy on the held-out split with accuracy on the training split
pred_xgb_train = best_estimator.predict(X_train)
pred_xgb = best_estimator.predict(X_test)
print('Accuracy on test: {:.2f}'.format(accuracy_score(y_true=y_test, y_pred=pred_xgb)))
print('Accuracy on train: {:.2f}'.format(accuracy_score(y_true=y_train, y_pred=pred_xgb_train)))

In [None]:
# Refit the tuned model on the full training set with balanced sample weights.
# w_array_ft has one weight per row of X/y; w_array only covers the training split,
# so passing it with the full data is a length mismatch.
best_estimator.fit(X, y, sample_weight=w_array_ft)

In [None]:
# Predict on the submission matrix and store the predictions on the original test frame,
# which still carries the `appno` column needed for the submission file
pred_test_xgb = best_estimator.predict(test_matrix)
test_data['importance']= pred_test_xgb

In [None]:
# Write the submission file with the `appno` and predicted `importance` columns.
# Create the output folder first so to_csv does not fail when it does not exist yet.
os.makedirs(result_path, exist_ok=True)
test_data[['appno','importance']].to_csv(
    os.path.join(result_path,'XGB_RandCV_cclbt02_md_25_est500_lr01_nl50_rlamb125_rand123_weights_Fin_Submission.csv'),
    index=False,
)