In [10]:
import pandas as pd
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, StackingRegressor, GradientBoostingClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/processed/processed_data.csv')

# filter certain columns by index and last column
df = df.iloc[:, list(range(1, 10)) + list(range(38, 50))]
X, y = df.drop('position', axis=1), df['position']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)



(114890, 20) (28723, 20) (114890,) (28723,)


In [None]:
# use gradient boosting
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
# gb.fit(X_train, y_train)
# gb.score(X_test, y_test)

In [None]:
# use bagging
from sklearn.ensemble import BaggingClassifier
bag = BaggingClassifier(n_estimators=100, random_state=42)
# bag.fit(X_train, y_train)
# bag.score(X_test, y_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
# ada.fit(X_train, y_train)
# ada.score(X_test, y_test)

In [None]:
import time
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=70, random_state=42)
# rf.fit(X_train, y_train)
# score = rf.score(X_test, y_test)

# s = time.perf_counter()
# rf.predict(X_test.iloc[:10])
# e = time.perf_counter()

# print(f'Random Forest Regressor score: {score}')
# print(f'Random Forest Regressor time: {e - s}')

In [None]:
# use rf regressor
rf = RandomForestRegressor(n_estimators=70, random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# use stacking
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=10, random_state=42)),
    ('bag', BaggingClassifier(n_estimators=10, random_state=42)),
    ('ada', AdaBoostClassifier(n_estimators=10, random_state=42))
]

stack = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(n_estimators=100, random_state=42))

stack.fit(X_train, y_train)

stack.score(X_test, y_test)

In [None]:
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, AdaBoostRegressor
# use stacking regressor
estimators = [
    ('rf', RandomForestRegressor(n_estimators=20, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=20, random_state=42)),
    ('bag', BaggingRegressor(n_estimators=20, random_state=42)),
    ('ada', AdaBoostRegressor(n_estimators=20, random_state=42))
]

stack = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(n_estimators=100, random_state=42))

stack.fit(X_train, y_train)

stack.score(X_test, y_test)


In [None]:
# draw feature importance
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 10))
pd.Series(rf.feature_importances_, index=X.columns).sort_values().plot.barh(ax=ax)


In [5]:
cleaned_df = pd.read_csv('../data/cleaned/cleaned_data.csv')
cleaned_df = cleaned_df.iloc[:, list(range(1, 12)) + [40]]

# convert preferred_foot to binary
cleaned_df['preferred_foot'] = cleaned_df['preferred_foot'].apply(lambda x: 1 if x == 'right' else 0)
# encode work_rate low/low is 0, low/medium is 1, low/high is 2, medium/low is 3, medium/medium is 4, medium/high is 5, high/low is 6, high/medium is 7, high/high is 8 
cleaned_df['work_rate'] = cleaned_df['work_rate'].apply(lambda x: 0 if x == 'low/low' else 1 if x == 'low/medium' else 2 if x == 'low/high' else 3 if x == 'medium/low' else 4 if x == 'medium/medium' else 5 if x == 'medium/high' else 6 if x == 'high/low' else 7 if x == 'high/medium' else 8)

X, y = cleaned_df.drop('position', axis=1), cleaned_df['position']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cleaned_df.head()

Unnamed: 0,weight_kg,preferred_foot,weak_foot,skill_moves,work_rate,pace,shooting,passing,dribbling,defending,physic,position
0,67,0,3,4,8,93.0,89.0,86.0,96.0,27.0,63.0,ST
1,80,0,4,5,8,93.0,93.0,81.0,91.0,32.0,79.0,LW
2,80,0,2,4,8,93.0,86.0,83.0,92.0,32.0,64.0,RW
3,95,0,4,4,8,76.0,91.0,81.0,86.0,34.0,86.0,ST
4,65,0,4,4,8,75.0,72.0,89.0,91.0,59.0,63.0,CM


In [12]:
# use random forest regressor
rf = RandomForestRegressor(n_estimators=70, random_state=42)
rf.fit(X_train, y_train)
score = rf.score(X_test, y_test)
print(f'Random Forest Regressor score: {score}')

Random Forest Regressor score: 0.8801290344677006
