# Setting the environment

Let’s start by importing some libraries and our data.

## Setting the path and libraries

In [1]:
# setting the path
import os
os.chdir("E:/Data Science/ZS")

# importing libraries
import pandas as pd # For data manipulation
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np # For numerical analysis
import seaborn as sns # For visulization
import matplotlib.pyplot as plt # For visulization

## Data Preprocessing

Lets start by loading our data set.

In [2]:
data = pd.read_csv("./Input/data.csv")

# getting the shapes of the datasets
print("Shape of Data :", data.shape)

Shape of Data : (30697, 28)


In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,match_event_id,location_x,location_y,remaining_min,power_of_shot,knockout_match,game_season,remaining_sec,distance_of_shot,is_goal,area_of_shot,shot_basics,range_of_shot,team_name,date_of_game,home/away,shot_id_number,lat/lng,type_of_shot,type_of_combined_shot,match_id,team_id,remaining_min.1,power_of_shot.1,knockout_match.1,remaining_sec.1,distance_of_shot.1
0,0,10.0,167.0,72.0,10.0,1.0,0.0,2000-01,27.0,38.0,,Right Side(R),Mid Range,16-24 ft.,Manchester United,2000-10-31,MANU @ POR,1.0,"45.539131, -122.651648",shot - 30,,20000012,1610612747,10.0,1.0,50.61,54.2,38.0
1,1,12.0,-157.0,0.0,10.0,1.0,0.0,2000-01,22.0,35.0,0.0,Left Side(L),Mid Range,8-16 ft.,Manchester United,2000-10-31,MANU @ POR,2.0,"45.539131, -122.651648",shot - 45,,20000012,1610612747,10.0,1.0,28.8,22.0,35.0
2,2,35.0,-101.0,135.0,7.0,1.0,0.0,2000-01,45.0,36.0,1.0,Left Side Center(LC),Mid Range,16-24 ft.,Manchester United,2000-10-31,,3.0,"45.539131, -122.651648",shot - 25,,20000012,1610612747,92.64,1.0,0.0,63.72,54.4
3,3,43.0,138.0,175.0,6.0,1.0,0.0,2000-01,52.0,42.0,0.0,Right Side Center(RC),Mid Range,16-24 ft.,Manchester United,2000-10-31,MANU @ POR,4.0,"45.539131, -122.651648",,shot - 3,20000012,1610612747,,1.0,122.61,52.0,42.0
4,4,155.0,0.0,0.0,,2.0,0.0,2000-01,19.0,20.0,1.0,Center(C),Goal Area,Less Than 8 ft.,,2000-10-31,MANU @ POR,5.0,"45.539131, -122.651648",,shot - 1,20000012,1610612747,42.64,2.0,0.0,19.0,20.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 28 columns):
Unnamed: 0               30697 non-null int64
match_event_id           29134 non-null float64
location_x               29236 non-null float64
location_y               29157 non-null float64
remaining_min            29135 non-null float64
power_of_shot            29211 non-null float64
knockout_match           29180 non-null float64
game_season              24835 non-null object
remaining_sec            29103 non-null float64
distance_of_shot         29130 non-null float64
is_goal                  24429 non-null float64
area_of_shot             29195 non-null object
shot_basics              29122 non-null object
range_of_shot            29133 non-null object
team_name                29162 non-null object
date_of_game             29147 non-null object
home/away                29200 non-null object
shot_id_number           29134 non-null float64
lat/lng                  29132 non-n

In [5]:
percent_missing = data.isnull().sum() * 100 / len(data)
missing_value_data = pd.DataFrame({'column_name': data.columns,
                                 'percent_missing': percent_missing})
missing_value_data.sort_values('percent_missing', inplace=True)
missing_value_data

Unnamed: 0,column_name,percent_missing
Unnamed: 0,Unnamed: 0,0.0
match_id,match_id,0.0
team_id,team_id,0.0
location_x,location_x,4.76
power_of_shot,power_of_shot,4.84
knockout_match.1,knockout_match.1,4.86
home/away,home/away,4.88
area_of_shot,area_of_shot,4.89
knockout_match,knockout_match,4.94
team_name,team_name,5.0


In [6]:
data_v1 = data.drop(['Unnamed: 0','game_season','type_of_shot','type_of_combined_shot','shot_id_number',
                     'match_event_id','location_x','location_y','date_of_game','lat/lng','match_id','team_id'],axis = 1)
data_v1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 16 columns):
remaining_min         29135 non-null float64
power_of_shot         29211 non-null float64
knockout_match        29180 non-null float64
remaining_sec         29103 non-null float64
distance_of_shot      29130 non-null float64
is_goal               24429 non-null float64
area_of_shot          29195 non-null object
shot_basics           29122 non-null object
range_of_shot         29133 non-null object
team_name             29162 non-null object
home/away             29200 non-null object
remaining_min.1       29162 non-null float64
power_of_shot.1       29158 non-null float64
knockout_match.1      29204 non-null float64
remaining_sec.1       29158 non-null float64
distance_of_shot.1    29129 non-null float64
dtypes: float64(11), object(5)
memory usage: 3.7+ MB


In [7]:
data_v1['shot_id_number'] = np.arange(1, 30698)
data_v1['data_type'] = np.where(((data_v1['is_goal']== 0.00) | (data_v1['is_goal']== 1.00)), 'train', 'test')
data_v1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 18 columns):
remaining_min         29135 non-null float64
power_of_shot         29211 non-null float64
knockout_match        29180 non-null float64
remaining_sec         29103 non-null float64
distance_of_shot      29130 non-null float64
is_goal               24429 non-null float64
area_of_shot          29195 non-null object
shot_basics           29122 non-null object
range_of_shot         29133 non-null object
team_name             29162 non-null object
home/away             29200 non-null object
remaining_min.1       29162 non-null float64
power_of_shot.1       29158 non-null float64
knockout_match.1      29204 non-null float64
remaining_sec.1       29158 non-null float64
distance_of_shot.1    29129 non-null float64
shot_id_number        30697 non-null int32
data_type             30697 non-null object
dtypes: float64(11), int32(1), object(6)
memory usage: 4.1+ MB


In [8]:
data_v1.head()

Unnamed: 0,remaining_min,power_of_shot,knockout_match,remaining_sec,distance_of_shot,is_goal,area_of_shot,shot_basics,range_of_shot,team_name,home/away,remaining_min.1,power_of_shot.1,knockout_match.1,remaining_sec.1,distance_of_shot.1,shot_id_number,data_type
0,10.0,1.0,0.0,27.0,38.0,,Right Side(R),Mid Range,16-24 ft.,Manchester United,MANU @ POR,10.0,1.0,50.61,54.2,38.0,1,test
1,10.0,1.0,0.0,22.0,35.0,0.0,Left Side(L),Mid Range,8-16 ft.,Manchester United,MANU @ POR,10.0,1.0,28.8,22.0,35.0,2,train
2,7.0,1.0,0.0,45.0,36.0,1.0,Left Side Center(LC),Mid Range,16-24 ft.,Manchester United,,92.64,1.0,0.0,63.72,54.4,3,train
3,6.0,1.0,0.0,52.0,42.0,0.0,Right Side Center(RC),Mid Range,16-24 ft.,Manchester United,MANU @ POR,,1.0,122.61,52.0,42.0,4,train
4,,2.0,0.0,19.0,20.0,1.0,Center(C),Goal Area,Less Than 8 ft.,,MANU @ POR,42.64,2.0,0.0,19.0,20.0,5,train


In [9]:
train = data_v1[data_v1["data_type"]== 'train']
test = data_v1[data_v1["data_type"]== 'test']

In [10]:
train.head()

Unnamed: 0,remaining_min,power_of_shot,knockout_match,remaining_sec,distance_of_shot,is_goal,area_of_shot,shot_basics,range_of_shot,team_name,home/away,remaining_min.1,power_of_shot.1,knockout_match.1,remaining_sec.1,distance_of_shot.1,shot_id_number,data_type
1,10.0,1.0,0.0,22.0,35.0,0.0,Left Side(L),Mid Range,8-16 ft.,Manchester United,MANU @ POR,10.0,1.0,28.8,22.0,35.0,2,train
2,7.0,1.0,0.0,45.0,36.0,1.0,Left Side Center(LC),Mid Range,16-24 ft.,Manchester United,,92.64,1.0,0.0,63.72,54.4,3,train
3,6.0,1.0,0.0,52.0,42.0,0.0,Right Side Center(RC),Mid Range,16-24 ft.,Manchester United,MANU @ POR,,1.0,122.61,52.0,42.0,4,train
4,,2.0,0.0,19.0,20.0,1.0,Center(C),Goal Area,Less Than 8 ft.,,MANU @ POR,42.64,2.0,0.0,19.0,20.0,5,train
5,9.0,3.0,0.0,32.0,34.0,0.0,Left Side(L),Mid Range,8-16 ft.,Manchester United,MANU @ POR,9.0,3.0,0.0,,34.0,6,train


In [11]:
test.head()

Unnamed: 0,remaining_min,power_of_shot,knockout_match,remaining_sec,distance_of_shot,is_goal,area_of_shot,shot_basics,range_of_shot,team_name,home/away,remaining_min.1,power_of_shot.1,knockout_match.1,remaining_sec.1,distance_of_shot.1,shot_id_number,data_type
0,10.0,1.0,0.0,27.0,38.0,,Right Side(R),Mid Range,16-24 ft.,Manchester United,MANU @ POR,10.0,1.0,50.61,54.2,38.0,1,test
7,8.0,3.0,0.0,5.0,22.0,,Center(C),Goal Area,Less Than 8 ft.,Manchester United,MANU @ POR,68.64,3.0,0.0,5.0,22.0,8,test
16,0.0,1.0,0.0,1.0,20.0,,,Goal Area,Less Than 8 ft.,Manchester United,MANU vs. UTA,0.0,1.0,0.0,1.0,20.0,17,test
19,10.0,3.0,0.0,46.0,20.0,,Center(C),,Less Than 8 ft.,Manchester United,MANU vs. UTA,10.0,70.36,0.0,46.0,20.0,20,test
21,9.0,3.0,0.0,4.0,38.0,,Right Side Center(RC),Mid Range,16-24 ft.,Manchester United,MANU vs. UTA,9.0,43.36,0.0,4.0,38.0,22,test


In [12]:
train = train.drop(['data_type'],axis = 1)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24429 entries, 1 to 30696
Data columns (total 17 columns):
remaining_min         23185 non-null float64
power_of_shot         23229 non-null float64
knockout_match        23217 non-null float64
remaining_sec         23179 non-null float64
distance_of_shot      23172 non-null float64
is_goal               24429 non-null float64
area_of_shot          23247 non-null object
shot_basics           23146 non-null object
range_of_shot         23181 non-null object
team_name             23217 non-null object
home/away             23215 non-null object
remaining_min.1       23220 non-null float64
power_of_shot.1       23221 non-null float64
knockout_match.1      23215 non-null float64
remaining_sec.1       23229 non-null float64
distance_of_shot.1    23185 non-null float64
shot_id_number        24429 non-null int32
dtypes: float64(11), int32(1), object(5)
memory usage: 3.3+ MB


In [13]:
test = test.drop(['data_type','is_goal'],axis = 1)
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6268 entries, 0 to 30693
Data columns (total 16 columns):
remaining_min         5950 non-null float64
power_of_shot         5982 non-null float64
knockout_match        5963 non-null float64
remaining_sec         5924 non-null float64
distance_of_shot      5958 non-null float64
area_of_shot          5948 non-null object
shot_basics           5976 non-null object
range_of_shot         5952 non-null object
team_name             5945 non-null object
home/away             5985 non-null object
remaining_min.1       5942 non-null float64
power_of_shot.1       5937 non-null float64
knockout_match.1      5989 non-null float64
remaining_sec.1       5929 non-null float64
distance_of_shot.1    5944 non-null float64
shot_id_number        6268 non-null int32
dtypes: float64(10), int32(1), object(5)
memory usage: 808.0+ KB


In [14]:
# getting the shapes of the datasets
print("Shape of Train :", train.shape)
print("Shape of Test :", test.shape)

Shape of Train : (24429, 17)
Shape of Test : (6268, 16)


In [15]:
train["is_goal"].value_counts(normalize = True)

0.00   0.55
1.00   0.45
Name: is_goal, dtype: float64

In [16]:
numeric_features = train.select_dtypes(include=['int64', 'float64']).drop(['is_goal'], axis=1).columns
categorical_features = train.select_dtypes(include=['object']).columns

print(numeric_features)
print(categorical_features)

Index(['remaining_min', 'power_of_shot', 'knockout_match', 'remaining_sec',
       'distance_of_shot', 'remaining_min.1', 'power_of_shot.1',
       'knockout_match.1', 'remaining_sec.1', 'distance_of_shot.1'],
      dtype='object')
Index(['area_of_shot', 'shot_basics', 'range_of_shot', 'team_name',
       'home/away'],
      dtype='object')


## Model Building

In [17]:
X = train.drop('is_goal', axis=1)
y = train['is_goal']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

## Fitting the classifier

In [19]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

seed = 786

classifiers = [
    LogisticRegression(),
    SGDClassifier(),
    LinearSVC(),
    KNeighborsClassifier(),
    SVC(kernel="rbf"),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    XGBClassifier()
    ]

for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))



Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
model score: 0.601




Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)
model score: 0.533




Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
model score: 0.601


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...ki',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
model score: 0.549




Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
model score: 0.603


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
model score: 0.527




Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
model score: 0.561


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...m='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
model score: 0.610


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))])

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
model score: 0.609


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
model score: 0.610


In [27]:
# Fitting Logistic Regression model
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import cross_val_score

pipeline_LR = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(penalty='l1'))])
                        
model_LR = pipeline_LR.fit(X_train, y_train)

# Scoring based on F1 score for hold out sample
y_predict_LR = model_LR.predict(X_test)
scores_holdout = f1_score(y_test, y_predict_LR, average="macro")
print(classification_report(y_test, y_predict_LR))
print("F1 Score for hold-out sample data:", round(scores_holdout*100,2))




              precision    recall  f1-score   support

         0.0       0.60      0.83      0.70      2703
         1.0       0.60      0.32      0.42      2183

   micro avg       0.60      0.60      0.60      4886
   macro avg       0.60      0.57      0.56      4886
weighted avg       0.60      0.60      0.57      4886

F1 Score for hold-out sample data: 55.74


In [24]:
# Fitting model
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import cross_val_score

pipeline_XGB = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', XGBClassifier(gamma=0, learning_rate=0.0001,n_estimators=500,random_state=0,seed=seed))])

model = pipeline_XGB.fit(X_train, y_train)

# Scoring based on F1 score for hold out sample
y_predict = model.predict(X_test)
scores_holdout = f1_score(y_test, y_predict, average="macro")
print(classification_report(y_test, y_predict))
print("F1 Score for hold-out sample data:", round(scores_holdout*100,2))

# Scoring based on F1 score for cross validation
#scores_cv = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
#print("F1 Score based on cross validation:" , round(scores_cv.mean()*100,2))


              precision    recall  f1-score   support

         0.0       0.60      0.85      0.71      2703
         1.0       0.62      0.31      0.41      2183

   micro avg       0.61      0.61      0.61      4886
   macro avg       0.61      0.58      0.56      4886
weighted avg       0.61      0.61      0.58      4886

F1 Score for hold-out sample data: 56.01


In [25]:
# Fitting model
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import cross_val_score

pipeline_SVM = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,decision_function_shape='ovr', 
                                         gamma='auto',kernel='rbf', max_iter=-1, probability=False, 
                                         random_state=seed,shrinking=True, tol=0.001, verbose=False))])

model = pipeline_SVM.fit(X_train, y_train)

# Scoring based on F1 score for hold out sample
y_predict = model.predict(X_test)
scores_holdout = f1_score(y_test, y_predict, average="macro")
print(classification_report(y_test, y_predict))
print("F1 Score for hold-out sample data:", round(scores_holdout*100,2))

# Scoring based on F1 score for cross validation
#scores_cv = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
#print("F1 Score based on cross validation:" , round(scores_cv.mean()*100,2))

              precision    recall  f1-score   support

         0.0       0.60      0.80      0.69      2703
         1.0       0.59      0.35      0.44      2183

   micro avg       0.60      0.60      0.60      4886
   macro avg       0.60      0.58      0.56      4886
weighted avg       0.60      0.60      0.58      4886

F1 Score for hold-out sample data: 56.49


In [26]:
y_predict = model.predict(test) 
Submission= pd.DataFrame(columns=['shot_id_number','is_goal'])
Submission['shot_id_number']=test['shot_id_number']
Submission['is_goal']=y_predict
Submission.to_csv('E:/Data Science/ZS/Output/submission_v1.csv',index=False)
