# POLYNOMIAL FEATURISATION WITH K=1:

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --------------------------------------------------
# 1) Read the preprocessed dataset
# --------------------------------------------------
df = pd.read_csv("final_preprocessed_file.csv")

# --------------------------------------------------
# 2) Remove ID and name columns
# --------------------------------------------------
cols_to_drop = [
    "player_id",
    "team_id",
    "current_club_id",
    "player_agent_id",
    "player_name"
]

# errors="ignore" skips any listed column that is not present in the file.
df = df.drop(columns=cols_to_drop, errors="ignore")

# --------------------------------------------------
# 3) Target column (transfer value)
# --------------------------------------------------
target = "value_transformed"     # <-- Change if needed

y = df[target]

# --------------------------------------------------
# 4) Features = every int64/float64 column except the target
# --------------------------------------------------
numeric_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop(target)
)

X = df[numeric_cols]

print("Numeric feature columns used:", list(numeric_cols))

# --------------------------------------------------
# 5) 80/20 train/test split with a fixed seed
# --------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --------------------------------------------------
# 6) Pipeline: polynomial expansion -> standardisation -> linear regression
# --------------------------------------------------
degree_k = 1   # You can change to 3 if needed

pipeline_steps = [
    ("poly", PolynomialFeatures(degree=degree_k, include_bias=False)),
    ("scaler", StandardScaler()),
    ("reg", LinearRegression()),
]
model = Pipeline(pipeline_steps)

# --------------------------------------------------
# 7) Fit on the training split
# --------------------------------------------------
model.fit(X_train, y_train)

# --------------------------------------------------
# 8) Score on the held-out split
# --------------------------------------------------
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print("\nðŸ“Š MODEL PERFORMANCE WITH POLYNOMIAL FEATURES (after dropping ID columns)")
print(f"MSE  : {mse}")
print(f"RMSE : {rmse}")
print(f"RÂ²   : {r2}")


Numeric feature columns used: ['nb_in_group', 'nb_on_pitch', 'goals', 'assists', 'own_goals', 'subed_in', 'subed_out', 'yellow_cards', 'second_yellow_cards', 'direct_red_cards', 'penalty_goals', 'minutes_played', 'goals_conceded', 'clean_sheets', 'height', 'is_eu', 'days_missed', 'games_missed', 'vader_polarity', 'tb_polarity', 'foot_both', 'foot_left', 'foot_right', 'foot_unknown', 'vader_emotion_negative', 'vader_emotion_neutral', 'vader_emotion_positive', 'vader_emotion_unknown', 'tb_emotion_negative', 'tb_emotion_neutral', 'tb_emotion_positive', 'tb_emotion_unknown', 'age', 'contract_remaining_days', 'days_since_joined', 'days_since_game', 'days_since_tweet', 'game_year', 'game_month', 'game_week', 'game_weekday', 'tweet_year', 'tweet_month', 'tweet_week', 'tweet_weekday', 'competition_id_freq', 'competition_name_freq', 'team_name_freq', 'citizenship_freq', 'position_Attack', 'position_Attack - Centre-Forward', 'position_Attack - Left Winger', 'position_Attack - Right Winger', 'pos

# POLYNOMIAL FEATURISATION WITH K=2

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# -----------------------------------------------------
# 1-2. Load the dataset and discard ID / text columns
#      (NOT useful for transfer value)
# -----------------------------------------------------
drop_cols = [
    "player_id", "team_id", "current_club_id",
    "player_agent_id", "player_name"
]
df = pd.read_csv("final_preprocessed_file.csv").drop(columns=drop_cols, errors="ignore")

# 3. Target and the numeric features that receive a polynomial expansion
target = "value_transformed"

nonlinear_candidates = [
    "goals", "assists", "minutes_played", "goals_conceded",
    "clean_sheets", "days_missed", "games_missed",
    "vader_polarity", "tb_polarity",
    "age", "contract_remaining_days",
    "days_since_joined", "days_since_game", "days_since_tweet"
]

# Keep only the candidates that are actually present in the file.
nonlinear_numeric_features = [
    col for col in nonlinear_candidates if col in df.columns
]

# Everything else (categorical one-hot, frequencies, etc.), in file column order.
excluded_from_other = set(nonlinear_numeric_features) | {target}
other_features = [col for col in df.columns if col not in excluded_from_other]

X1 = df[nonlinear_numeric_features]   # polynomial block
X2 = df[other_features]               # everything else
y = df[target]

# 4. One shared 80/20 split so both feature blocks and the target stay row-aligned
split_parts = train_test_split(X1, X2, y, test_size=0.2, random_state=42)
X1_train, X1_test, X2_train, X2_test, y_train, y_test = split_parts

# 5. Standardise the polynomial block; statistics come from the training rows only
scaler = StandardScaler().fit(X1_train)
X1_train_scaled = scaler.transform(X1_train)
X1_test_scaled = scaler.transform(X1_test)

# 6. Degree-2 expansion, stored as float32 to reduce memory
poly = PolynomialFeatures(degree=2, include_bias=False)

X1_train_poly = poly.fit_transform(X1_train_scaled).astype("float32")
X1_test_poly = poly.transform(X1_test_scaled).astype("float32")

# 7. Force the remaining columns to numeric (unparseable values become 0),
#    then export them as float32 arrays
X2_train = X2_train.apply(pd.to_numeric, errors="coerce").fillna(0)
X2_test = X2_test.apply(pd.to_numeric, errors="coerce").fillna(0)

X2_train_np = X2_train.to_numpy(dtype="float32")
X2_test_np = X2_test.to_numpy(dtype="float32")


# 8. Place the polynomial block and the other features side by side
X_train_final = np.concatenate([X1_train_poly, X2_train_np], axis=1)
X_test_final = np.concatenate([X1_test_poly, X2_test_np], axis=1)

print("Final train shape:", X_train_final.shape)
print("Final test shape:", X_test_final.shape)


# 9. Lasso regression on the combined design matrix
lasso = Lasso(alpha=0.001, max_iter=15000)
lasso.fit(X_train_final, y_train)


# 10. Predict on the held-out rows
y_pred = lasso.predict(X_test_final)


# 11. Evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\n----- RESULTS -----")
print("MSE :", mse)
print("RMSE:", rmse)
print("RÂ²   :", r2)


Final train shape: (1885128, 174)
Final test shape: (471282, 174)

----- RESULTS -----
MSE : 0.15189579764459202
RMSE: 0.3897381141800119
RÂ²   : 0.8481821740870557


# SAVING THE NEWLY CREATED FEATURES

In [7]:
import pandas as pd
import numpy as np

# -----------------------------------------------------
# 1. SMART MEMORY OPTIMIZER
# -----------------------------------------------------

def optimize_df(df):
    """Shrink the columns of ``df`` to compact numeric dtypes.

    The frame is modified in place and the same object is returned.

    * Integer columns are downcast to the smallest integer dtype that holds
      their values.
    * Float columns become float16 when every finite value fits into the
      float16 range, float32 when float16 would overflow, and are left
      unchanged when even float32 would overflow.
    * Object columns are parsed as numbers (unparseable entries become 0) and
      then handled like float columns.

    Note: float16 keeps only about three significant decimal digits, so values
    are rounded even when they are within range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with its columns replaced by compact versions.
    """
    f16_max = float(np.finfo(np.float16).max)
    f32_max = float(np.finfo(np.float32).max)

    def _to_small_float(values):
        """Cast ``values`` to the narrowest float dtype that cannot overflow."""
        if values.dtype == np.float16:
            return values  # already as small as it gets

        # Largest finite magnitude; NaN and +/-inf survive any float cast,
        # so they must not influence the choice of width.
        peak = values.replace([np.inf, -np.inf], np.nan).abs().max()

        if pd.isna(peak) or peak <= f16_max:
            return values.astype("float16")
        if peak <= f32_max:
            # A blind float16 cast would silently turn these values into inf.
            return values.astype("float32")
        return values

    for col in df.columns:
        col_type = df[col].dtype

        # Downcast integers
        if pd.api.types.is_integer_dtype(col_type):
            df[col] = pd.to_numeric(df[col], downcast="integer")

        # Downcast floats -> float16 where safe, float32 otherwise
        elif pd.api.types.is_float_dtype(col_type):
            df[col] = _to_small_float(df[col])

        # Convert object columns -> numeric (0 where parsing fails), then downcast
        elif col_type == "object":
            parsed = pd.to_numeric(df[col], errors="coerce").fillna(0)
            df[col] = _to_small_float(parsed)

    return df


# -----------------------------------------------------
# 2. BUILD SEPARATE DATAFRAMES
# -----------------------------------------------------

# Column labels for the expanded polynomial block
poly_feature_names = poly.get_feature_names_out(nonlinear_numeric_features)

# Polynomial block: cast to float16, wrap in a labelled frame, then optimize
df_poly_train = optimize_df(
    pd.DataFrame(X1_train_poly.astype("float16"), columns=poly_feature_names)
)
df_poly_test = optimize_df(
    pd.DataFrame(X1_test_poly.astype("float16"), columns=poly_feature_names)
)

# Remaining features: coerce to numeric (unparseable -> 0), then optimize
df_other_train = optimize_df(
    X2_train.apply(pd.to_numeric, errors="coerce").fillna(0)
)
df_other_test = optimize_df(
    X2_test.apply(pd.to_numeric, errors="coerce").fillna(0)
)


# -----------------------------------------------------
# 3. SAVE AS 4 SEPARATE FEATURE FILES + 2 TARGET FILES
# -----------------------------------------------------

feature_files = (
    ("poly_train.csv", df_poly_train),
    ("poly_test.csv", df_poly_test),
    ("other_train.csv", df_other_train),
    ("other_test.csv", df_other_test),
)
for file_name, frame in feature_files:
    frame.to_csv(file_name, index=False)

# Targets go to their own files; the shuffled split index is dropped first
y_train.reset_index(drop=True).to_csv("y_train.csv", index=False)
y_test.reset_index(drop=True).to_csv("y_test.csv", index=False)

print("âœ… Saved files:")
print(" - poly_train.csv")
print(" - poly_test.csv")
print(" - other_train.csv")
print(" - other_test.csv")
print(" - y_train.csv")
print(" - y_test.csv")
print("ðŸŽ‰ All files saved in memory-optimized format!")


âœ… Saved files:
 - poly_train.csv
 - poly_test.csv
 - other_train.csv
 - other_test.csv
 - y_train.csv
 - y_test.csv
ðŸŽ‰ All files saved in memory-optimized format!


# SAVING THE POLY_TRAIN INTO TWO SUB FILES FOR MEMORY EFFICIENCY

In [8]:
import pandas as pd

def split_csv_into_two_rowwise(input_file):
    """Split a CSV file into two row-wise halves saved next to the original.

    The first ``len(df) // 2`` rows go to ``<name>_part1<ext>`` and the
    remaining rows to ``<name>_part2<ext>``, where ``<name>`` and ``<ext>``
    are the input path without and with its final extension. For an odd row
    count the second part receives the extra row. The input file is left
    untouched.

    Parameters
    ----------
    input_file : str or os.PathLike
        Path of the CSV file to split.

    Returns
    -------
    None
        The two output paths are printed, not returned.
    """
    import os  # local import: the surrounding cell only imports pandas

    source_path = os.fspath(input_file)

    # Split off only the final extension. A plain str.replace(".csv", ...)
    # would rewrite every ".csv" occurrence in the path and, for a path
    # without ".csv", would produce the input path itself and overwrite it.
    root, ext = os.path.splitext(source_path)
    part1_path = f"{root}_part1{ext}"
    part2_path = f"{root}_part2{ext}"

    df = pd.read_csv(source_path)

    mid = len(df) // 2  # midpoint

    df.iloc[:mid].to_csv(part1_path, index=False)
    df.iloc[mid:].to_csv(part2_path, index=False)

    print("Saved:")
    print(f"- {part1_path}")
    print(f"- {part2_path}")

# Run: split poly_train.csv into two row-wise halves
# (see the "Saved:" lines printed below for the output file names)
split_csv_into_two_rowwise("poly_train.csv")


Saved:
- poly_train_part1.csv
- poly_train_part2.csv


# VERIFYING WHETHER THE SPLIT OCCURRED CORRECTLY

In [2]:
import pandas as pd

# Baseline for the split check: number of data rows in the unsplit file
original = pd.read_csv("poly_train.csv")
print(original.shape[0])


1885128


In [1]:
import pandas as pd

# Combined row count of both halves; should match len of original poly_train.csv
part1 = pd.read_csv("poly_train_part1.csv")
part2 = pd.read_csv("poly_train_part2.csv")
print(sum(len(part) for part in (part1, part2)))


1885128
