# APPLYING RANDOM FOREST WITH 174 FEATURES

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1. LOAD TRAIN DATA
# Feature tables are parsed as float32. The polynomial features are stored as two
# row-wise chunks (part1 followed by part2); the target is flattened to a 1-D array.
poly_train_part1 = pd.read_csv("poly_train_part1.csv", dtype=np.float32)
poly_train_part2 = pd.read_csv("poly_train_part2.csv", dtype=np.float32)
other_train = pd.read_csv("other_train.csv", dtype=np.float32)
y_train = pd.read_csv("y_train.csv").values.ravel()

# Row-wise stacking takes the union of columns and pads the gaps with NaN,
# so make sure both chunks describe exactly the same features.
assert set(poly_train_part1.columns) == set(poly_train_part2.columns), (
    "poly_train_part1.csv and poly_train_part2.csv do not contain the same columns"
)

# Combine polynomial parts (row-wise)
poly_train_full = pd.concat(
    [poly_train_part1, poly_train_part2],
    axis=0
).reset_index(drop=True)

# pd.concat(axis=1) aligns on the index: a row-count mismatch would silently
# introduce NaN rows instead of failing, so check alignment explicitly.
assert len(poly_train_full) == len(other_train) == len(y_train), (
    "Train row counts differ: "
    f"poly={len(poly_train_full)}, other={len(other_train)}, y={len(y_train)}"
)

# Combine features (column-wise)
X_train = pd.concat([poly_train_full, other_train], axis=1)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# 2. LOAD TEST DATA
poly_test = pd.read_csv("poly_test.csv", dtype=np.float32)
other_test = pd.read_csv("other_test.csv", dtype=np.float32)
y_test = pd.read_csv("y_test.csv").values.ravel()

# Same alignment requirement as for the training tables.
assert len(poly_test) == len(other_test) == len(y_test), (
    "Test row counts differ: "
    f"poly={len(poly_test)}, other={len(other_test)}, y={len(y_test)}"
)

X_test = pd.concat([poly_test, other_test], axis=1)

# The model must see the test features under the same names and in the same
# order as during fitting; fail here with a clear message if they diverge.
assert list(X_test.columns) == list(X_train.columns), (
    "X_test columns do not match X_train columns (names and order must agree)"
)

print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# 3. TRAIN FAST RANDOM FOREST
rf_model = RandomForestRegressor(
    n_estimators=50,        # fewer trees → faster
    max_depth=10,          # shallow trees
    min_samples_leaf=20,   # reduces splits
    max_samples=0.6,       # row subsampling (HUGE speed gain)
    n_jobs=-1,             # use all available cores
    random_state=42        # fixed seed for reproducible results
)

rf_model.fit(X_train, y_train)

# 4. PREDICT & EVALUATE
y_pred = rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print("\n----- RESULTS (Random Forest) -----")
print("MSE  :", mse)
print("RMSE :", rmse)
print("R²   :", r2)


X_train shape: (1885128, 174)
y_train shape: (1885128,)
X_test shape: (471282, 174)
y_test shape: (471282,)

----- RESULTS (Random Forest) -----
MSE  : 0.03254100344708413
RMSE : 0.18039125102699446
R²   : 0.9674757006252317


# CHECKING FEATURE IMPORTANCE AND CREATING NEW TRAINING AND TESTING DATA

In [None]:
# 5. EXTRACT FEATURE IMPORTANCE

# Importance scores exposed by the fitted forest, one value per training column
importances = rf_model.feature_importances_

# Attach the column names to the scores, then rank from most to least important
feature_importance_series = pd.Series(data=importances, index=X_train.columns)
feature_importance_sorted = feature_importance_series.sort_values(ascending=False)

# Report the top_n highest-ranked features
top_n = 20
top_ranked = feature_importance_sorted.head(top_n)
print(f"\n----- TOP {top_n} FEATURES -----")
print(top_ranked)

# Persist the complete ranking
feature_importance_sorted.to_csv("feature_importances.csv", header=True)

# Persist only the names of the top_n features (one per line, no header, no index column)
top_features = top_ranked.index
top_features.to_series().to_csv("top_features.csv", index=False, header=False)

# 6. CREATE REDUCED DATASETS BASED ON TOP FEATURES
# Keep just the selected columns, in ranking order, for both splits
X_train_top = X_train.loc[:, top_features]
X_test_top = X_test.loc[:, top_features]

# Write the reduced feature matrices to disk so later cells can reload them
X_train_top.to_csv("X_train_top_features.csv", index=False)
X_test_top.to_csv("X_test_top_features.csv", index=False)

print("\nTop features extracted and datasets saved successfully.")




----- TOP 20 FEATURES -----
days_since_game                              0.810984
days_since_tweet                             0.053720
contract_remaining_days days_since_joined    0.035100
days_since_joined days_since_tweet           0.017632
tb_emotion_unknown                           0.013991
vader_emotion_unknown                        0.011016
age days_since_game                          0.005908
contract_remaining_days days_since_game      0.005319
contract_remaining_days days_since_tweet     0.005216
vader_polarity days_since_tweet              0.003257
contract_remaining_days                      0.002962
tb_polarity days_since_game                  0.002773
age contract_remaining_days                  0.002577
team_name_freq                               0.002571
vader_polarity days_since_game               0.002509
citizenship_freq                             0.002476
tb_polarity days_since_tweet                 0.002231
goals_conceded contract_remaining_days       0.001941

Here we identified the features with the highest importance values; `days_since_game` alone accounts for about 81% of the total importance.

# CHECKING WITH 20 FEATURES

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1. LOAD TRAIN & TEST DATA
# Reduced feature matrices saved by the feature-selection cell, parsed as float32
X_train, X_test = (
    pd.read_csv(csv_path, dtype=np.float32)
    for csv_path in ("X_train_top_features.csv", "X_test_top_features.csv")
)

# Targets are read as float32 (memory efficiency) and flattened to 1-D arrays
y_train, y_test = (
    pd.read_csv(csv_path, dtype=np.float32).to_numpy().ravel()
    for csv_path in ("y_train.csv", "y_test.csv")
)

for label, data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{label} shape:", data.shape)

# 2. TRAIN FAST RANDOM FOREST
forest_settings = dict(
    n_estimators=50,       # fewer trees → faster
    max_depth=10,          # shallow trees
    min_samples_leaf=20,   # reduces splits
    max_samples=0.6,       # row subsampling → huge speed gain
    n_jobs=-1,
    random_state=42,
)
rf_model = RandomForestRegressor(**forest_settings)
rf_model.fit(X_train, y_train)

# 3. PREDICT & EVALUATE ON TRAIN DATA
y_train_pred = rf_model.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred)

print("\n----- TRAIN RESULTS -----")
for label, score in (("MSE  :", mse_train), ("RMSE :", rmse_train), ("R²   :", r2_train)):
    print(label, score)

# 4. PREDICT & EVALUATE ON TEST DATA
y_test_pred = rf_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)

print("\n----- TEST RESULTS -----")
for label, score in (("MSE  :", mse_test), ("RMSE :", rmse_test), ("R²   :", r2_test)):
    print(label, score)


X_train shape: (1885128, 20)
y_train shape: (1885128,)
X_test shape: (471282, 20)
y_test shape: (471282,)

----- TRAIN RESULTS -----
MSE  : 0.03313889833144061
RMSE : 0.1820409248807548
R²   : 0.9668725830811422

----- TEST RESULTS -----
MSE  : 0.033460467538756165
RMSE : 0.18292202584368064
R²   : 0.9665567094988683


# DROPPING THE DOMINATING FEATURE (days_since_game) AS WELL AS THE RELATED FEATURES CREATED DURING POLYNOMIAL FEATURISATION

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# LOAD TRAIN & TEST DATA
# Reduced (top-feature) matrices and targets, all parsed as float32
X_train = pd.read_csv("X_train_top_features.csv", dtype=np.float32)
X_test = pd.read_csv("X_test_top_features.csv", dtype=np.float32)

y_train = pd.read_csv("y_train.csv", dtype=np.float32).to_numpy().ravel()
y_test = pd.read_csv("y_test.csv", dtype=np.float32).to_numpy().ravel()

print("Original X_train shape:", X_train.shape)
print("Original X_test shape :", X_test.shape)

# DROP days_since_game + all derived features
# Any column whose name contains "days_since_game" is removed (the raw feature
# as well as the interaction / power terms built from it).
drop_cols = X_train.columns[
    X_train.columns.str.contains("days_since_game", regex=False)
].tolist()

X_train_no_dsg = X_train.drop(drop_cols, axis="columns")
X_test_no_dsg = X_test.drop(drop_cols, axis="columns")

print("\nDropped columns:", drop_cols)
print("New X_train shape:", X_train_no_dsg.shape)
print("New X_test shape :", X_test_no_dsg.shape)

# TRAIN RANDOM FOREST (same settings)
forest_settings_no_dsg = dict(
    n_estimators=50,
    max_depth=10,
    min_samples_leaf=20,
    max_samples=0.6,
    n_jobs=-1,
    random_state=42,
)
rf_model = RandomForestRegressor(**forest_settings_no_dsg)
rf_model.fit(X_train_no_dsg, y_train)

# TRAIN PERFORMANCE
y_train_pred = rf_model.predict(X_train_no_dsg)
train_sq_err_no_dsg = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(train_sq_err_no_dsg)
r2_train = r2_score(y_train, y_train_pred)

print("\n----- TRAIN RESULTS (NO days_since_game) -----")
for label, score in (("RMSE :", rmse_train), ("R²   :", r2_train)):
    print(label, score)

# TEST PERFORMANCE
y_test_pred = rf_model.predict(X_test_no_dsg)
test_sq_err_no_dsg = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(test_sq_err_no_dsg)
r2_test = r2_score(y_test, y_test_pred)

print("\n----- TEST RESULTS (NO days_since_game) -----")
for label, score in (("RMSE :", rmse_test), ("R²   :", r2_test)):
    print(label, score)


Original X_train shape: (1885128, 20)
Original X_test shape : (471282, 20)

Dropped columns: ['days_since_game', 'age days_since_game', 'contract_remaining_days days_since_game', 'tb_polarity days_since_game', 'vader_polarity days_since_game', 'days_since_game^2']
New X_train shape: (1885128, 14)
New X_test shape : (471282, 14)

----- TRAIN RESULTS (NO days_since_game) -----
RMSE : 0.18068512411571847
R²   : 0.9673641969506669

----- TEST RESULTS (NO days_since_game) -----
RMSE : 0.18144212897218542
R²   : 0.9670956540606555
