In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler







In [2]:
# Load the extended dataset; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../resources/clean_extended_data.csv')


### Data Cleaning

In [3]:
# Columns kept for modelling: the predictor features followed by the
# target variable 'H' (last entry).
columns_needed = [
    'Age',
    'BB',
    'SO',
    'SB',
    'OPS',
    'ISO',
    'BABIP',
    'sprint_speed',
    'BB%',
    'Contact%',
    'LD%',
    'H',
]

# Take an independent copy of just those columns so later edits to
# df_selected do not touch the original frame.
df_selected = df.loc[:, columns_needed].copy()


In [4]:
# Print a summary of the selected frame (column dtypes and non-null counts)
# to check for missing values before modelling.
df_selected.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2447 entries, 0 to 2446
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           2447 non-null   int64  
 1   BB            2447 non-null   int64  
 2   SO            2447 non-null   int64  
 3   SB            2447 non-null   int64  
 4   OPS           2447 non-null   float64
 5   ISO           2447 non-null   float64
 6   BABIP         2447 non-null   float64
 7   sprint_speed  2447 non-null   float64
 8   BB%           2447 non-null   float64
 9   Contact%      2447 non-null   float64
 10  LD%           2447 non-null   float64
 11  H             2447 non-null   int64  
dtypes: float64(7), int64(5)
memory usage: 229.5 KB


In [5]:
# Fill missing values in 'sprint_speed' with the column mean.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place
# form operates on an intermediate object, raises a FutureWarning in recent
# pandas 2.x releases, and does not update df_selected under copy-on-write.
# NOTE(review): the recorded info() output shows no nulls in this column,
# so on that data this step changes nothing; confirm it is still needed.
sprint_speed_mean = df_selected['sprint_speed'].mean()
df_selected['sprint_speed'] = df_selected['sprint_speed'].fillna(sprint_speed_mean)


In [6]:
# Separate the target 'H' from the predictor columns.
y = df_selected['H']
X = df_selected.drop(columns='H')

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# Standardize the features: fit the scaler on the training split only,
# then apply the same fitted transformation to the test split.
# NOTE(review): the RandomForest cells visible below fit and predict on the
# unscaled X_train / X_test, so these scaled arrays are not consumed there;
# confirm whether they are needed by a later cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)




In [8]:
# Random forest regressor with 100 trees; the fixed seed makes training
# repeatable.
rf = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)

# Train the model on the (unscaled) training split.
rf.fit(X_train, y_train)

In [9]:
# Predict the target for the held-out test rows.
y_pred = rf.predict(X_test)

# Score the predictions: R^2, MSE, and RMSE (the square root of MSE).
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report one metric per line, in the order MSE, RMSE, R^2.
print(f'MSE: {mse}', f'RMSE: {rmse}', f'R^2: {r2}', sep='\n')

MSE: 92.94562673469386
RMSE: 9.640831226335925
R^2: 0.9657437616910353


In [10]:
# Tabulate the fitted forest's feature importances, labelled by the
# training column names and ordered from most to least important.
feature_importances = (
    pd.Series(rf.feature_importances_, index=X_train.columns, name='importance')
    .sort_values(ascending=False)
    .to_frame()
)
print(feature_importances)

              importance
SO              0.502859
BB              0.270563
BB%             0.106511
Contact%        0.043476
OPS             0.034638
BABIP           0.023289
SB              0.004699
LD%             0.004193
ISO             0.003698
sprint_speed    0.003428
Age             0.002646


In [11]:
# Cross-validate with a fresh estimator bound to its own name, so the fitted
# `rf` from the training cell is not overwritten. cross_val_score fits clones
# and leaves the estimator passed in unfitted, so reusing the name `rf` here
# would break re-running the prediction / feature-importance cells.
rf_cv = RandomForestRegressor(n_estimators=100, random_state=42)

# 5-fold cross-validation over the full dataset, scored by negated MSE.
# NOTE(review): with an integer cv and a regressor, scikit-learn uses KFold
# without shuffling, so each fold is a consecutive block of rows. If the rows
# are ordered, fold scores can differ systematically; consider
# cv=KFold(n_splits=5, shuffle=True, random_state=42) — confirm with the data.
scores = cross_val_score(rf_cv, X, y, cv=5, scoring='neg_mean_squared_error')

# The scorer reports MSE negated (higher is better); flip the sign back.
mse_scores = -scores

# Report the per-fold MSE along with its mean and standard deviation.
print("MSE scores:", mse_scores)
print("Mean MSE:", mse_scores.mean())
print("Standard deviation of MSE:", mse_scores.std())


MSE scores: [659.77979469 199.14303082  80.0369002   47.19429693  69.9573047 ]
Mean MSE: 211.222265470139
Standard deviation of MSE: 230.3997953974754


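Rezultati kros-validacije pokazuju značajne varijacije u srednjoj kvadratnoj grešci (MSE) između različitih podskupova (foldova), što se vidi kroz veliku standardnu devijaciju MSE vrednosti (≈230 pri srednjoj vrednosti ≈211). Prvi MSE skor (≈660), koji je znatno viši od ostalih, ukazuje na moguće postojanje podskupa podataka na kojem model radi znatno lošije, što može biti indikacija preprilagođavanja (overfitting-a) ili nehomogenosti podataka. Treba imati u vidu i da `cross_val_score` sa `cv=5` za regresore koristi `KFold` bez mešanja redova, pa prvi fold čini prvih 20% redova dataset-a; ako su redovi u CSV fajlu uređeni (npr. po sezoni ili po igraču), ova razlika može poticati od samog poretka podataka, a ne nužno od modela.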