# This Template is created to make grading fair and straightforward. Anything not in the place as mentioned in the template would not be graded.

<font color='red'> # NOTE: We would run the notebook through a Plagiarism Checker. If it is found to be copied, your work would not be graded, and the incident would be highlighted to NYU Authorities. </font>

# Import Library and Dataset

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
# So On.......

# Import data
df_train = pd.read_csv('leaderboard_training.csv')
df_test = pd.read_csv('leaderboard_test.csv')

# Store target variable of training data
y_train = df_train.quidditch_league_player

df_train = df_train.drop(['quidditch_league_player'], axis=1)

# PART I: Preprocessing

#### Handling missing values. (If ANY)

In [2]:
# Get the number of row in training and test data
num_train = df_train.iloc[:,0].size
num_test = df_test.iloc[:,0].size
num_col = df_train.size

In [3]:
# Process with missing values
df_train = df_train.replace('?',np.nan)
df_test = df_test.replace('?',np.nan)

# Find the column with missing value and output the column name with its missing rate
nan_train = df_train.isnull().sum()  
nan_ratio_train = (nan_train/num_train).tolist()
nan_col_name_train = df_train.columns[df_train.isnull().any()].tolist()

nan_test = df_test.isnull().sum()
nan_ratio_test = (nan_test/num_test).tolist()
nan_col_name_test = df_test.columns[df_test.isnull().any()].tolist()

j = 0
nan_col_train = []
for i in nan_ratio_train:
    if i!=0:
        nan_list = []
        nan_list.append(nan_col_name_train[j])
        j = j+1
        nan_list.append(i)
        nan_col_train.append(nan_list)
        
k = 0
nan_col_test = []
for i in nan_ratio_test:
    if i!=0:
        nan_list = []
        nan_list.append(nan_col_name_test[k])
        k = k+1
        nan_list.append(i)
        nan_col_test.append(nan_list)
        
print(nan_col_train)
print(nan_col_test)

[['house', 0.022368656094317527], ['weight', 0.9685905960343767], ['player_code', 0.39555008633864597], ['move_specialty', 0.49078062044737314]]
[['house', 0.018], ['weight', 0.97], ['player_code', 0.416], ['move_specialty', 0.492]]


In [4]:
# Put the column with more than half missing value in either training set or test set into drop list
# Put the other column with missing value into refill list
drop_col = []
refill_col = []

nan_num = len(nan_col_train)

for flag in range(nan_num):
    if ((nan_col_train[flag][1]>=0.5) or (nan_col_test[flag][1]>=0.5)):
        drop_col.append(nan_col_train[flag][0])
    else:
        refill_col.append(nan_col_train[flag][0])

print(drop_col)
print(refill_col)

['weight']
['house', 'player_code', 'move_specialty']


In [5]:
# Concatenate training and test sets
data = pd.concat([df_train, df_test])

In [6]:
# Drop the column with more than half missing value
for col in drop_col:
    data = data.drop([col],axis = 1)

In [7]:
#Refill the other missing value with the most frequent value in this column since they are all categorical features
for col in refill_col:
    col_data = data[col]
    freq = col_data.dropna().mode()[0]
    data[col] = data[col].fillna(freq)

#### Combine Some Features or Create New Features

In [8]:
# Combine some features together
data['num_games_not_participate'] = data['num_games_satout']+data['num_games_injured']+data['num_games_notpartof']
#data = data.drop(['num_games_satout','num_games_injured','num_games_notpartof'], axis=1)

In [9]:
# Create new features about the 23 tactics
data['num_tactic_steady'] = 0
data['num_tactic_up'] = 0
data['num_tactic_down'] = 0
data['num_tactic_no'] = 0
for i in range (18,41):
    data['num_tactic_steady'] += data[data.columns[i]].apply(lambda x: 1 if x == 'Steady' else 0)
    data['num_tactic_up'] += data[data.columns[i]].apply(lambda x: 1 if x == 'Up' else 0)
    data['num_tactic_down'] += data[data.columns[i]].apply(lambda x: 1 if x == 'Down' else 0)
    data['num_tactic_no'] += data[data.columns[i]].apply(lambda x: 1 if x == 'No' else 0)
#data.head()

#### Feature Datatype Conversion From Numeric to categoric and Vice-versa. (If ANY)

In [10]:
# Transform target variable into numberical data
y_label_encoder = preprocessing.LabelEncoder()
y_label_encoder.fit(y_train.values)
y_train = y_label_encoder.transform(y_train.values.astype(str))
#print(y_train)

In [11]:
# Reduce number of categories
data['stooging'] = data['stooging'].replace('>7','High')
data['stooging'] = data['stooging'].replace('>8','High')
data['snitchnip'] = data['snitchnip'].replace('>7','High')
data['snitchnip'] = data['snitchnip'].replace('>8','High')
#data.head()

In [12]:
# Find the numerical features and categorical features
cate_col = []
num_col = []
i = 0
for columns in data:
    if data[columns].dtypes=='object':
        cate_col.append(columns)
    elif (data[columns].dtypes=='int64' or data[columns].dtypes=='float64'):
        num_col.append(columns)
    i = i+1
    
print(cate_col)
print(num_col)

['house', 'gender', 'player_code', 'move_specialty', 'player_type', 'snitchnip', 'stooging', 'body_blow', 'checking', 'dopplebeater_defence', 'hawkshead_attacking_formation', 'no_hands_tackle', 'power_play', 'sloth_grip_roll', 'spiral_dive', 'starfish_and_stick', 'twirl', 'wronski_feint', 'zig-zag', 'bludger_backbeat', 'chelmondiston_charge', 'dionysus_dive', 'double_eight_loop', 'finbourgh_flick', 'reverse_pass', 'parkins_pincer', 'plumpton_pass', 'porskoff_ploy', 'transylvanian_tackle', 'woollongong_shimmy', 'change', 'snitch_caught']
['id_num', 'player_id', 'age', 'foul_type_id', 'game_move_id', 'penalty_id', 'game_duration', 'num_game_moves', 'num_game_losses', 'num_practice_sessions', 'num_games_satout', 'num_games_injured', 'num_games_notpartof', 'num_games_won', 'num_games_not_participate', 'num_tactic_steady', 'num_tactic_up', 'num_tactic_down', 'num_tactic_no']


In [13]:
# Transform category data with 2 classes into numerical one using label encoder
# Drop categorical column with only one class which is unnecessary
#new_col = 0
for col in cate_col:
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(data[col].values)
    
    if (len(label_encoder.classes_)==2):
        print("Encode:",col)
        print(label_encoder.classes_)
        data[col] = label_encoder.transform(data[col].values.astype(str))
    elif (len(label_encoder.classes_)==1):
        print("Drop:",col)
        data = data.drop([col],axis = 1)
    else:
        print(col,len(label_encoder.classes_),label_encoder.classes_)
        #new_col += len(label_encoder.classes_)
        
#print(new_col)

house 5 ['Gryffindor' 'Hufflepuff' 'Other' 'Ravenclaw' 'Slytherin']
gender 3 ['Female' 'Male' 'Unknown/Invalid']
player_code 17 ['BC' 'CH' 'CM' 'CP' 'DM' 'FR' 'HM' 'MC' 'MD' 'MP' 'OG' 'OT' 'PO' 'SI'
 'SP' 'UN' 'WC']
move_specialty 72 ['specialty1' 'specialty10' 'specialty11' 'specialty12' 'specialty13'
 'specialty14' 'specialty15' 'specialty16' 'specialty17' 'specialty18'
 'specialty19' 'specialty2' 'specialty20' 'specialty21' 'specialty22'
 'specialty23' 'specialty24' 'specialty25' 'specialty26' 'specialty27'
 'specialty28' 'specialty29' 'specialty3' 'specialty30' 'specialty31'
 'specialty32' 'specialty33' 'specialty34' 'specialty35' 'specialty36'
 'specialty37' 'specialty38' 'specialty39' 'specialty4' 'specialty40'
 'specialty41' 'specialty42' 'specialty43' 'specialty44' 'specialty45'
 'specialty46' 'specialty47' 'specialty48' 'specialty49' 'specialty5'
 'specialty50' 'specialty51' 'specialty52' 'specialty53' 'specialty54'
 'specialty55' 'specialty56' 'specialty57' 'specialty58' 'spe

In [14]:
# Transform category data with more than 2 classes using one hot encoder
data = pd.get_dummies(data)

#for col in cate_col:
#    onehot_encoder = preprocessing.OneHotEncoder(categories = [col[0]])
#    data = onehot_encoder.fit_transform(data).toarray()
data.head()

Unnamed: 0,id_num,player_id,age,foul_type_id,game_move_id,penalty_id,game_duration,num_game_moves,num_game_losses,num_practice_sessions,...,dionysus_dive_Steady,dionysus_dive_Up,reverse_pass_Down,reverse_pass_No,reverse_pass_Steady,reverse_pass_Up,parkins_pincer_Down,parkins_pincer_No,parkins_pincer_Steady,parkins_pincer_Up
0,1,8222157,11.0,6,25,1,1,41,0,1,...,0,0,0,1,0,0,0,1,0,0
1,2,55629189,12.0,1,1,7,3,59,0,18,...,0,0,0,0,0,1,0,1,0,0
2,3,86047875,13.0,1,1,7,2,11,5,13,...,0,0,0,1,0,0,0,1,0,0
3,4,82442376,14.0,1,1,7,2,44,1,16,...,0,0,0,0,0,1,0,1,0,0
4,5,42519267,14.5,1,1,7,1,51,0,8,...,0,0,0,0,1,0,0,1,0,0


#### Feature Reduction or extraction. (If ANY)

In [15]:
# Drop Id columns which have no effect on result
data = data.drop(['id_num','player_id','foul_type_id', 'game_move_id', 'penalty_id'], axis = 1)
#data.head()
num_col.remove('id_num')
num_col.remove('player_id')
num_col.remove('foul_type_id')
num_col.remove('game_move_id')
num_col.remove('penalty_id')
#print(num_col)

In [16]:
# Observing correlation matrix of features and compare the correlation between features
corr = data.corr()
#corr.style.background_gradient(cmap='coolwarm').set_precision(2)

In [17]:
# Drop one of two features that have a correlation higher than 0.75
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.8:
            print(data.columns[i],data.columns[j],corr.iloc[i,j])
            if columns[j]:
                columns[j] = False
                print('Drop:',data.columns[j])

selected_columns = data.columns[columns]

data = data[selected_columns]
#data.head()

num_tactic_up reverse_pass_Up 0.8515325136689368
Drop: reverse_pass_Up
num_tactic_down reverse_pass_Down 0.9077971271509003
Drop: reverse_pass_Down


#### Any other Pre-processing Used. (Give the name along with the code.)

In [18]:
# Log transformation in columns with high skew
log_col = ['num_games_satout', 'num_games_injured', 'num_games_notpartof', 'num_games_not_participate']
for col in log_col:
    data[col] = data[col].apply(lambda x: np.log(x) if x>0 else 0)

In [19]:
# Using Standardization to process with outliers in these columns
for col in log_col:
    data[col + '_zscore'] = (data[col]-data[col].mean())/data[col].std()
    abnormal_col = abs(data[col + '_zscore'])>3
    print(col + ' has ' + str(abnormal_col.sum()) + ' outliers')
    data.loc[abs(data[col + '_zscore'])>3, col] = np.nan

    mean = data[col].dropna().mean()
    print('Replace outliers in',col,'with the mean value',mean)
    data[col] = data[col].fillna(mean)
    data = data.drop([col + '_zscore'], axis=1)
    
#data.head()

num_games_satout has 4574 outliers
Replace outliers in num_games_satout with the mean value 0.025613441403018706
num_games_injured has 3686 outliers
Replace outliers in num_games_injured with the mean value 0.0
num_games_notpartof has 3623 outliers
Replace outliers in num_games_notpartof with the mean value 0.09172725056131344
num_games_not_participate has 2234 outliers
Replace outliers in num_games_not_participate with the mean value 0.25854294808124534


In [20]:
#MinMax Normalization for numerical feature

num_col.remove('num_games_injured')
#print(num_col)

for col in num_col:
    data_min = data[col].min()
    data_range = data[col].max()-data_min
    data[col] = (data[col]-data_min)/data_range

#data.head()

In [21]:
# Dimensionality Reduction using PCA
pca = PCA(n_components = 50)
pca_data = pca.fit_transform(data)
new_data = pd.DataFrame(pca_data)
#new_data.info()

In [22]:
#Spilt training set and testing set
data_train = new_data.iloc[:100766]
data_test = new_data.iloc[100766:]

In [23]:
# Data Balancing using SMOTE
X_resampled, y_resampled = SMOTE().fit_resample(data_train, y_train)
X_resampled = pd.DataFrame(X_resampled)
#X_resampled.info()

num1_before = str(y_train.tolist()).count("1")
num0_before = str(y_train.tolist()).count("0")
print('The original number of 1 and 0 is ',num1_before,num0_before)
num1_after = str(y_resampled.tolist()).count("1")
num0_after = str(y_resampled.tolist()).count("0")
print('The number of 1 and 0 after balancing is ',num1_after,num0_after)

The original number of 1 and 0 is  11244 89522
The number of 1 and 0 after balancing is  89522 89522


# PART II: Classification

### Model 1:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


In [None]:
#Code...
cv_params = {'n_neighbors':[2,3,5,7,11,13]}
model = KNeighborsClassifier()
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='f1', cv=5, verbose=1, n_jobs=8)
optimized_GBM.fit(X_resampled,y_resampled)
evaluate_result = optimized_GBM.cv_results_

for i in range (0,len(evaluate_result['mean_test_score'])):
    print(evaluate_result['params'][i],evaluate_result['mean_test_score'][i])
    
#print('Results for each iteration:{0}'.format(evaluate_result))
print('Optimal value for max_depth and max_features:{0}'.format(optimized_GBM.best_params_))
print('Best score:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
#Code...
'''
cv_params = {'criterion':['entropy','gini'],
             'max_depth':[1,2,3],
             'max_features':[6,7,8],
             'min_samples_split':[0.5,2,3]}

model = DecisionTreeClassifier(criterion = 'entropy',splitter = 'random',
                               min_samples_leaf=1,min_weight_fraction_leaf=0.0,random_state=None,max_leaf_nodes=None,
                               min_impurity_decrease=0.0,min_impurity_split=None,class_weight=None,presort=False)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='f1', cv=5, verbose=1, n_jobs=8)
optimized_GBM.fit(X_resampled,y_resampled)
evaluate_result = optimized_GBM.cv_results_

for i in range (0,len(evaluate_result['mean_test_score'])):
    print(evaluate_result['params'][i],evaluate_result['mean_test_score'][i])
    
#print('Results for each iteration:{0}'.format(evaluate_result))
print('Optimal value for max_depth and max_features:{0}'.format(optimized_GBM.best_params_))
print('Best score:{0}'.format(optimized_GBM.best_score_))
'''

### Model 2:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


### Model 3:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


In [None]:
#Code...

# PART III: Best Hypothesis:
Model Name:------------<br>
Reason:--------------<br>
Hyper-parameter Value:-----------<br>
