In [1]:
import h2o

In [21]:
from h2o.automl import H2OAutoML
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,54 mins 04 secs
H2O cluster timezone:,America/Sao_Paulo
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.3
H2O cluster version age:,6 days
H2O cluster name:,H2O_from_python_bruno_2ea3oe
H2O cluster total nodes:,1
H2O cluster free memory:,986 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [22]:
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Path of the file to read
fifa_filepath = "data.csv"
# Read the file into a variable iris_data
data = pd.read_csv(fifa_filepath)
# Print the first 5 rows of the data
data.head()


df2 = data.loc[:, 'Crossing':'Release Clause']
df1 = data[['Age', 'Overall', 'Value', 'Wage', 'Preferred Foot', 'Skill Moves', 'Position', 'Height', 'Weight']]
df = pd.concat([df1, df2], axis=1)

df = df.dropna()

def value_to_int(df_value):
    try:
        value = float(df_value[1:-1])
        suffix = df_value[-1:]

        if suffix == 'M':
            value = value * 1000000
        elif suffix == 'K':
            value = value * 1000
    except ValueError:
        value = 0
    return value
  
df['Value_float'] = df['Value'].apply(value_to_int)
df['Wage_float'] = df['Wage'].apply(value_to_int)
df['Release_Clause_float'] = df['Release Clause'].apply(lambda m: value_to_int(m))

def weight_to_int(df_weight):
    value = df_weight[:-3]
    return value
  
df['Weight_int'] = df['Weight'].apply(weight_to_int)
df['Weight_int'] = df['Weight_int'].apply(lambda x: int(x))

def height_to_int(df_height):
    try:
        feet = int(df_height[0])
        dlm = df_height[-2]

        if dlm == "'":
            height = round((feet * 12 + int(df_height[-1])) * 2.54, 0)
        elif dlm != "'":
            height = round((feet * 12 + int(df_height[-2:])) * 2.54, 0)
    except ValueError:
        height = 0
    return height

df['Height_int'] = df['Height'].apply(height_to_int)


df = df.drop(['Value', 'Wage', 'Release Clause', 'Weight', 'Height'], axis=1)

le_foot = preprocessing.LabelEncoder()
df["Preferred Foot"] = le_foot.fit_transform(df["Preferred Foot"].values)


for i in ['ST', 'CF', 'LF', 'LS', 'LW', 'RF', 'RS', 'RW']:
  df.loc[df.Position == i , 'Pos'] = 'Strikers' 

for i in ['CAM', 'CDM', 'LCM', 'CM', 'LAM', 'LDM', 'LM', 'RAM', 'RCM', 'RDM', 'RM']:
  df.loc[df.Position == i , 'Pos'] = 'Midfielder' 

for i in ['CB', 'LB', 'LCB', 'LWB', 'RB', 'RCB', 'RWB','GK']:
  df.loc[df.Position == i , 'Pos'] = 'Defender' 


from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
lasso = linear_model.Lasso()
from sklearn.model_selection import train_test_split

le_class = preprocessing.LabelEncoder()

df['Pos'] = le_class.fit_transform(df['Pos'])

y = df["Pos"]

df.drop(columns=["Position"],inplace=True)


X_train, X_test, y_train, y_test = train_test_split(df, y, 
                                                    test_size=0.20, 
                                                    random_state=42 )
print(X_train.shape)
print(X_test.shape)

print(X_train.info())

(13314, 44)
(3329, 44)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13314 entries, 12320 to 17334
Data columns (total 44 columns):
Age                     13314 non-null int64
Overall                 13314 non-null int64
Preferred Foot          13314 non-null int64
Skill Moves             13314 non-null float64
Crossing                13314 non-null float64
Finishing               13314 non-null float64
HeadingAccuracy         13314 non-null float64
ShortPassing            13314 non-null float64
Volleys                 13314 non-null float64
Dribbling               13314 non-null float64
Curve                   13314 non-null float64
FKAccuracy              13314 non-null float64
LongPassing             13314 non-null float64
BallControl             13314 non-null float64
Acceleration            13314 non-null float64
SprintSpeed             13314 non-null float64
Agility                 13314 non-null float64
Reactions               13314 non-null float64
Balance                 

In [24]:

# Import a sample binary outcome train/test set into H2O

train = h2o.H2OFrame.from_python(X_train)
test = h2o.H2OFrame.from_python(X_test)


# Identify predictors and response
x = train.columns
y = "Pos"


# Run AutoML for 20 base models (limited to 1 hour max runtime by default)
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_AutoML_20190513_133705,0.0860701,0.293377,0.0860701,0.179592,0.162888
StackedEnsemble_BestOfFamily_AutoML_20190513_133705,0.0867562,0.294544,0.0867562,0.179373,0.163623
XGBoost_2_AutoML_20190513_133705,0.0904843,0.300806,0.0904843,0.18588,0.168444
XGBoost_1_AutoML_20190513_133705,0.0905592,0.300931,0.0905592,0.186396,0.168471
DeepLearning_1_AutoML_20190513_133705,0.0915764,0.302616,0.0915764,0.18553,0.168618
XGBoost_grid_1_AutoML_20190513_133705_model_2,0.0934695,0.305728,0.0934695,0.192263,0.172145
GBM_2_AutoML_20190513_133705,0.0934829,0.30575,0.0934829,0.192013,0.172208
GBM_5_AutoML_20190513_133705,0.0935353,0.305835,0.0935353,0.198522,0.172464
GBM_3_AutoML_20190513_133705,0.0936547,0.30603,0.0936547,0.190319,0.171983
GBM_4_AutoML_20190513_133705,0.093863,0.306371,0.093863,0.189605,0.172206




In [34]:

# The leader model is stored here
aml.leader

# If you need to generate predictions on a test set, you can make
# predictions directly on the `"H2OAutoML"` object, or on the leader
# model object directly

preds = aml.predict(test)

# or:
preds = aml.leader.predict(test)

print(preds)

stackedensemble prediction progress: |████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


predict
0.00152002
0.700698
1.77017
1.16616
-0.0225859
0.925879
0.933315
1.06821
0.976231
1.73504





In [33]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


cf = confusion_matrix(y_test, preds)

accuracy_score(y_test, preds)

predict
0.00152002
0.700698
1.77017
1.16616
-0.0225859
0.925879
0.933315
1.06821
0.976231
1.73504


ValueError: Expected array-like (array or non-string sequence), got 