In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from fi_utils import load_dataset
from plt_utils import plot_confusion_matrix

### Visualização dos 5 primeiros exemplos do conjunto




In [None]:
# Load the dataset through the project helper (fi_utils.load_dataset) and
# preview its first rows.
df = load_dataset()
df.head()

In [None]:
# Summary statistics of the dataset columns (pandas defaults).
df.describe()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of players in each position group ('Pos').
plt.figure(figsize=(12, 8))
# Plot first and label afterwards: on seaborn versions where countplot writes
# its own axis labels ('Pos' / 'count'), labels set before the call are
# overwritten and never reach the rendered figure.
ax = sns.countplot(x='Pos', data=df)
ax.set_title("Quantidade de jogadores por posição")
ax.set_xlabel("Área de atuação")
ax.set_ylabel("Quantidade de jogadores");

In [None]:
# 2x2 grid of box plots: distribution of four attributes per position group.
f, axes = plt.subplots(2, 2, figsize=(15, 15), sharex=False)
sns.despine(left=True)
# x/y are passed by keyword: recent seaborn releases accept only `data`
# positionally, and the keyword form works on older releases as well.
sns.boxplot(x='Pos', y='Overall', data=df, ax=axes[0, 0])
sns.boxplot(x='Pos', y='HeadingAccuracy', data=df, ax=axes[0, 1])
sns.boxplot(x='Pos', y='ShortPassing', data=df, ax=axes[1, 1])
sns.boxplot(x='Pos', y='Weight_int', data=df, ax=axes[1, 0]);


In [None]:
# Mean of Value_float for every distinct Age, drawn as one bar per age.
mean_value_per_age = df['Value_float'].groupby(df['Age']).mean()
p = sns.barplot(
    x=mean_value_per_age.index,
    y=mean_value_per_age.values,
)
# Turn the age tick labels vertical so neighbouring labels do not collide.
p = plt.xticks(rotation=90)

In [None]:
# Mean of Wage_float for every distinct Age, drawn as one bar per age.
mean_wage_per_age = df['Wage_float'].groupby(df['Age']).mean()
p = sns.barplot(
    x=mean_wage_per_age.index,
    y=mean_wage_per_age.values,
)
# Turn the age tick labels vertical so neighbouring labels do not collide.
p = plt.xticks(rotation=90)

In [None]:

# Joint distribution of Age and Overall, rendered as a kernel density estimate.
sns.jointplot(x='Age', y="Overall", data=df, kind="kde")


In [None]:
# Joint plot of Value_float against Overall (seaborn's default joint-plot kind).
sns.jointplot(x='Value_float', y="Overall", data=df)

In [None]:
# Line plot of Overall as a function of Value_float.
sns.lineplot(x='Value_float', y="Overall", data=df)

In [None]:
# ShortPassing vs. Finishing, one point per row, coloured by position group.
ax = sns.scatterplot(x="ShortPassing", y="Finishing", hue="Pos",data=df)

In [None]:
# Pairwise scatter plots of a first group of attributes, coloured by 'Pos'.
# 'Pos' is kept in the selection only because it drives the hue.
df_new = df.loc[:, ['Overall', 'BallControl', 'Acceleration',
                    'LongShots', 'Aggression', 'Pos']]

sns.pairplot(data=df_new, hue="Pos", kind="scatter")
plt.show()


In [None]:
# Pairwise scatter plots of a second group of attributes, coloured by 'Pos'.
# 'Pos' is kept in the selection only because it drives the hue.
df_new = df.loc[:, ['Height_int', 'ShortPassing', 'Finishing',
                    'Volleys', 'HeadingAccuracy', 'Pos']]

sns.pairplot(data=df_new, hue="Pos", kind="scatter")
plt.show()

In [None]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# NOTE(review): `lasso` is not used by any cell shown in this notebook; it is
# kept only so that nothing relying on the name breaks.
lasso = linear_model.Lasso()

# Encode the position labels as integers 0..k-1. The fitted encoder is kept
# so the codes can be mapped back to the original labels (le_class.classes_).
le_class = preprocessing.LabelEncoder()
y = pd.Series(le_class.fit_transform(df['Pos']), index=df.index, name='Pos')

# Build the feature matrix as a new frame instead of overwriting/dropping
# columns of `df` in place: the cell can then be re-run, and the plotting
# cells above keep working on the untouched data.
X = df.drop(columns=["Position", "Pos"])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state=50)
print(X_train.shape)
print(X_test.shape)

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that appended a stray "None" to the output).
X_train.info()



In [None]:
# Wrap the train/test splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    # 'silent' was removed from XGBoost; 'verbosity': 0 is the quiet mode.
    'verbosity': 0,
    'objective': 'multi:softprob',  # per-class probabilities for multiclass training
    # Derived from the fitted label encoder so it always matches the number
    # of distinct labels actually present, instead of a hard-coded 3.
    'num_class': len(le_class.classes_),
}
num_round = 50  # the number of training iterations

bst = xgb.train(param, dtrain, num_round)
bst.dump_model('dump.raw.txt')  # text dump of the trained trees
preds = bst.predict(dtest)  # one row of class probabilities per test sample

In [None]:

# Each row of `preds` holds one probability per class; the predicted class is
# the column with the highest probability.
best_preds = np.argmax(preds, axis=1)

# Confusion matrix of true vs. predicted class codes, shown as a heatmap.
cf = confusion_matrix(y_test, best_preds)
df_cm = pd.DataFrame(cf)

plt.figure(figsize=(10, 7))
sns.set(font_scale=1.4)
sns.heatmap(
    df_cm,
    annot=True,
    fmt="d",
    annot_kws={"size": 16},
)

# Last expression of the cell: the test accuracy is displayed as its output.
accuracy_score(y_test, best_preds)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from datetime import datetime

In [None]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since one was started.

    Called without an argument (or with a falsy one) it returns the current
    ``datetime`` to be used as the starting point. Called with that starting
    point it prints the elapsed hours, minutes and seconds and returns None.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        thour, remainder = divmod(elapsed_seconds, 3600)
        tmin, tsec = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        return None
    return datetime.now()

In [None]:
# Search space for the randomized hyper-parameter search over XGBClassifier.
# Every value must be a list (or a distribution): the sampler draws one
# element per parameter, so a bare string would be sampled character by
# character.
param = {
        # 'silent' was removed from XGBoost; verbosity 1 is the equivalent of
        # the former silent=False.
        'verbosity': [1],
        'max_depth': [6, 10, 15, 20],
        'objective': ['multi:softprob'],
        # Was `0,3`, which added the two separate values 0 and 3 instead of 0.3.
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
        'gamma': [0, 0.25, 0.5, 1.0],
        'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
        'n_estimators': [100]}



In [None]:
# Base estimator with default settings; it is handed to the randomized search
# below, which supplies the hyper-parameters to try.
xgb_class = XGBClassifier()

In [None]:
folds = 10       # number of stratified CV folds
param_comb = 5   # number of parameter combinations sampled from the space

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# The search space defined above is named `param` (the previous `params`
# raised NameError). The splitter itself is passed as `cv` rather than a
# one-shot generator from skf.split(), so the search can be fitted again
# without rebuilding it; the folds produced are the same.
random_search = RandomizedSearchCV(
    xgb_class,
    param_distributions=param,
    n_iter=param_comb,
    n_jobs=4,
    cv=skf,
    verbose=3,
    random_state=1001,
)

start_time = timer(None)  # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time)  # timing ends here for "start_time" variable

In [None]:
# Tabulate the search results once and report from the table instead of
# dumping the raw cv_results_ dictionary of arrays.
results = pd.DataFrame(random_search.cv_results_)

print('\n All results:')
print(results.sort_values('rank_test_score')[
    ['rank_test_score', 'mean_test_score', 'std_test_score', 'params']])
print('\n Best estimator:')
print(random_search.best_estimator_)
# No `scoring` was given to the search, so best_score_ is the estimator's
# default score (mean cross-validated accuracy for a classifier). The former
# "normalized gini" label and the `2 * score - 1` transform only make sense
# for an AUC score and misreported this number.
print('\n Best mean cross-validated accuracy for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_)
print('\n Best hyperparameters:')
print(random_search.best_params_)

In [None]:
# Retrain the booster with a fixed set of tuned hyper-parameters.
# NOTE(review): these values are hard-coded rather than read from
# random_search.best_params_ — confirm they are the intended ones.
param = {
    'subsample': 0.8,
    'min_child_weight': 10,
    'max_depth': 7,
    'gamma': 0.5,
    'colsample_bytree': 1.0,
    'objective': 'multi:softprob',  # per-class probabilities for multiclass training
    # Derived from the fitted label encoder so it always matches the number
    # of distinct labels, instead of a hard-coded 3.
    'num_class': len(le_class.classes_),
}

num_round = 50  # the number of training iterations

bst = xgb.train(param, dtrain, num_round)
bst.dump_model('dump.raw.txt')  # overwrites the dump written by the first model
preds = bst.predict(dtest)
preds_train = bst.predict(dtrain)  # train-set predictions, to compare against test


In [None]:

# Predicted class = column with the highest probability in each row.
best_preds = np.argmax(preds, axis=1)
best_preds_train = np.argmax(preds_train, axis=1)

cf_train = confusion_matrix(y_train, best_preds_train)
cf = confusion_matrix(y_test, best_preds)

# Only the last expression of a cell is displayed, so the train accuracy
# computed mid-cell used to be silently discarded. Print both scores so the
# train/test gap is visible.
print('Train accuracy:', accuracy_score(y_train, best_preds_train))
print('Test accuracy: ', accuracy_score(y_test, best_preds))

In [None]:
# Randomized search scored by log-loss, with early stopping on a held-out set.
#
# The original cell referenced x_valid, y_valid, x_train and param_grid, none
# of which were defined anywhere in the notebook, and passed `fit_params` to
# the RandomizedSearchCV constructor, which no longer accepts it.

# Carve the early-stopping validation set out of the training split so the
# test set stays untouched.
x_fit, x_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

# Early-stopping settings are estimator settings in current XGBoost releases.
clf = xgb.XGBClassifier(eval_metric='mlogloss', early_stopping_rounds=10)

# NOTE(review): this search space is a proposal, since no `param_grid` existed
# in the notebook — adjust as needed.
param_grid = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1.0, 5.0, 10.0],
    'gamma': [0, 0.5, 1.0],
    'n_estimators': [100],
}

rs_clf = RandomizedSearchCV(clf, param_grid, n_iter=20,
                            n_jobs=1, verbose=2, cv=2,
                            scoring='neg_log_loss', refit=False, random_state=42)
print("Randomized search..")
search_time_start = time.time()
# Extra keyword arguments to fit() are forwarded to the estimator's fit().
rs_clf.fit(x_fit, y_fit, eval_set=[(x_valid, y_valid)], verbose=False)
print("Randomized search time:", time.time() - search_time_start)

# With a single scoring metric these attributes exist even with refit=False.
best_score = rs_clf.best_score_
best_params = rs_clf.best_params_
print("Best score: {}".format(best_score))
print("Best params: ")
for param_name in sorted(best_params.keys()):
    print('%s: %r' % (param_name, best_params[param_name]))