In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler








In [2]:
# Read the cleaned, extended data set from the resources folder into a DataFrame
data_path = '../resources/cleaned_extended_data3.csv'
df = pd.read_csv(data_path)


In [3]:
# Summarize the frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11525 entries, 0 to 11524
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SO_x              11525 non-null  int64  
 1   CSW%              11525 non-null  float64
 2   wRAA              11525 non-null  float64
 3   Barrels           11525 non-null  int64  
 4   RBI_x             11525 non-null  int64  
 5   IFFB              11525 non-null  int64  
 6   competitive_runs  11525 non-null  float64
 7   Age               11525 non-null  int64  
 8   HR                11525 non-null  int64  
 9   R_y               11525 non-null  int64  
 10  RBI_y             11525 non-null  int64  
 11  SB                11525 non-null  int64  
 12  BB                11525 non-null  int64  
 13  SO_y              11525 non-null  int64  
 14  OBP               11525 non-null  float64
 15  SLG               11525 non-null  float64
 16  OPS               11525 non-null  float64

In [4]:
# Separate the target column from the predictor columns
target_col = 'next_year_hits'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Scaling the data
# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so no test-set statistics are used
# when computing the scaling parameters.
# NOTE(review): the random forest cells visible in this notebook are fitted and
# evaluated on the unscaled X_train / X_test; the scaled arrays below are not
# consumed by them. Confirm whether a later cell needs them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)




In [6]:
# Configure the forest: 100 trees, fixed seed so repeated runs give the same model
forest_params = {'n_estimators': 100, 'random_state': 42}
rf = RandomForestRegressor(**forest_params)

# Train on the training split (the unscaled X_train)
rf.fit(X_train, y_train)

In [7]:
# Generate predictions for the held-out test rows
y_pred = rf.predict(X_test)

# Regression metrics on the test split: MSE, its square root, and R^2
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line
for label, value in (('MSE', mse), ('RMSE', rmse), ('R^2', r2)):
    print(f'{label}: {value}')

MSE: 724.0826046420825
RMSE: 26.90878303903918
R^2: 0.728509887566525


In [8]:
# Pair each training column with the importance score reported by the fitted
# forest, then list the features from most to least important
importance_values = rf.feature_importances_
feature_importances = pd.DataFrame(
    {'importance': importance_values},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)
print(feature_importances)

                  importance
RBI_x               0.231174
R_y                 0.103655
Contact%            0.055675
WAR                 0.051256
Age                 0.045525
BABIP               0.041267
GB%                 0.040054
competitive_runs    0.039971
LD%                 0.035171
RBI_y               0.032663
SLG                 0.032613
SwStr%              0.030457
SO_y                0.029996
OBP                 0.029752
BB                  0.027446
SB                  0.027396
wOBA                0.025535
OPS                 0.023337
CSW%                0.020983
HR                  0.019607
wRAA                0.017407
SO_x                0.014216
Barrels             0.013775
IFFB                0.011069


In [9]:
# Define a fresh, unfitted forest for cross-validation. It gets its own name so
# that the fitted `rf` used by the evaluation and feature-importance cells is
# not replaced by an unfitted estimator (cross_val_score fits clones and leaves
# the estimator passed to it unfitted).
rf_cv = RandomForestRegressor(n_estimators=100, random_state=42)

# Run 5-fold cross-validation on the full data set.
# NOTE: with an integer `cv` and a regressor, scikit-learn uses KFold without
# shuffling, so each fold is a contiguous block of rows. The hold-out split
# earlier in the notebook is shuffled, so the two evaluations are not directly
# comparable if the rows of `df` are ordered in some way.
scores = cross_val_score(rf_cv, X, y, cv=5, scoring='neg_mean_squared_error')

# The scorer returns negated MSE (higher is better); flip the sign to get MSE
mse_scores = -scores

# Report the per-fold MSE together with its mean and standard deviation
print("MSE scores:", mse_scores)
print("Mean MSE:", mse_scores.mean())
print("Standard deviation of MSE:", mse_scores.std())


MSE scores: [2819.99585254 2101.74663996 1676.7722646  1544.58494868 2050.23068711]
Mean MSE: 2038.6660785770066
Standard deviation of MSE: 444.8982824403171


The cross-validation results show substantial variation in mean squared error (MSE) across the five folds, as reflected in the large standard deviation of the MSE values (≈445). The MSE of the first fold (≈2820) is markedly higher than that of the other folds, which suggests that there is a subset of the data on which the model performs considerably worse; this may be an indication of overfitting or of the data not being homogeneous across folds. Note also that the mean cross-validated MSE (≈2039) is far higher than the MSE obtained on the single hold-out test set (≈724), so the hold-out estimate should be interpreted with caution.