In [1]:
import findspark
findspark.init()
findspark.find()

import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext

properties = {
    'username': 'postgres',
    'password': '20020202',
    'url': "jdbc:postgresql://localhost:5432/postgres",
    'table': 'fifa.player_data',
    'driver': 'org.postgresql.Driver'
}

def write_to_pgadmin(df, mode='overwrite'):
    df.write.format('jdbc').mode(mode)\
        .option("url", properties['url'])\
        .option("dbtable", properties['table'])\
        .option("user", properties['username'])\
        .option("password", properties['password'])\
        .option("Driver", properties['driver'])\
        .save()

def read_from_pgadmin():
    return spark.read.format("jdbc")\
        .option("url", properties['url'])\
        .option("dbtable", properties['table'])\
        .option("user", properties['username'])\
        .option("password", properties['password'])\
        .option("Driver", properties['driver'])\
        .load()

appName = "Big Data Analytics"
master = "local"

# Create Configuration object for Spark.
conf = pyspark.SparkConf()\
    .set('spark.driver.host','127.0.0.1')\
    .set('spark.jars.packages', 'org.postgresql:postgresql:42.7.0')\
    .setAppName(appName)\
    .setMaster(master)

# conf = pyspark.SparkConf().\
#     set('spark.jars.packages', 'org.postgresql:postgresql:42.7.0')\
#     .setAppName(appName).setMaster(master)

# Create Spark Context with the new configurations rather than relying on the default one
sc = SparkContext.getOrCreate(conf=conf)

# You need to create SQL Context to conduct some database operations like what we will see later.
sqlContext = SQLContext(sc)

# If you have SQL context, you create the session from the Spark Context
spark = sqlContext.sparkSession.builder.getOrCreate()



In [4]:
from preprocess import *
df = read_from_pgadmin()
df_clean = clean_data(df)
properties['table'] = 'fifa.clean_data'
write_to_pgadmin(df_clean)

In [19]:
from preprocess import *
properties['table'] = 'fifa.clean_data'
df_new = read_from_pgadmin()
df_new.show(5)
preprocess_pipeline = get_preprocess_pipeline()
preprocess_pipeline_model = preprocess_pipeline.fit(df_new)
df_processed = preprocess_pipeline_model.transform(df_new)

+-------+---+---------+---------+---------+-----------+------------------------+-------------+----------------+---------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+--------------------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----------------+
|overall|age|height_cm

In [20]:
df_processed.show(5)

+-------+--------------------+
|outcome|            features|
+-------+--------------------+
|   94.0|[6.00629302994062...|
|   93.0|[6.43531396065066...|
|   90.0|[6.64982442600569...|
|   90.0|[6.22080349529564...|
|   90.0|[6.00629302994062...|
+-------+--------------------+
only showing top 5 rows



In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [7]:
train_df, test_df = df_processed.randomSplit([0.8, 0.2], seed=42)
train_df, val_df = train_df.randomSplit([0.75, 0.25], seed=42)

In [22]:

X_train = np.array(train_df.select("features").rdd.flatMap(lambda x: x).collect())
y_train = np.array(train_df.select("outcome").rdd.flatMap(lambda x: x).collect())

X_val = np.array(val_df.select("features").rdd.flatMap(lambda x: x).collect())
y_val = np.array(val_df.select("outcome").rdd.flatMap(lambda x: x).collect())

X_test = np.array(test_df.select("features").rdd.flatMap(lambda x: x).collect())
y_test = np.array(test_df.select("outcome").rdd.flatMap(lambda x: x).collect())

class MyDataSet(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = MyDataSet(X_train, y_train)
val_dataset = MyDataSet(X_val, y_val)
test_dataset = MyDataSet(X_test, y_test)

### Single Layer

In [23]:
class SingleLayerNN(nn.Module):
    def __init__(self, input_dim):
        super(SingleLayerNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.fc1(x)
    


### Multiple Layer

In [24]:
class MultiLayerNN(nn.Module):
    def __init__(self, input_dim):
        super(MultiLayerNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [14]:
import itertools
from sklearn.metrics import mean_squared_error

param_grid = {
    'lr': [0.01, 0.001],
    'batch_size': [32, 64]
}

def train_and_evaluate(model, train_loader, val_loader, lr=0.01, epochs=20):
    criterion = nn.MSELoss() 
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_val_rmse = float('inf')
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        epoch_train_loss = 0
        for batch_features, batch_labels in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()
        
        epoch_train_loss /= len(train_loader)
        
        model.eval()
        epoch_val_loss = 0
        with torch.no_grad():
            for val_features, val_labels in val_loader:
                val_outputs = model(val_features)
                val_loss = criterion(val_outputs, val_labels)
                epoch_val_loss += val_loss.item()
        
        epoch_val_loss /= len(val_loader)
        val_rmse = np.sqrt(epoch_val_loss)

        # Check if this is the best model so far
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_model_state = model.state_dict()  # Save the best model’s state

    # Load the best model state for final evaluation
    if best_model_state:
        model.load_state_dict(best_model_state)
    
    return best_val_rmse

def grid_search(model_class, param_grid, X_train, y_train, X_val, y_val, model_name = "model"):
    best_rmse = float('inf')
    best_params = None
    best_model = None

    # Iterate over all parameter combinations
    for lr, batch_size in itertools.product(param_grid['lr'], param_grid['batch_size']):
        print(f"\nTraining with lr={lr}, batch_size={batch_size}")

        model = model_class(X_train.shape[1])
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Train and evaluate on validation data
        val_rmse = train_and_evaluate(model, train_loader, val_loader, lr=lr)

        # Update best model if validation RMSE improves
        if val_rmse < best_rmse:
            best_rmse = val_rmse
            best_params = {'lr': lr, 'batch_size': batch_size}
            best_model = model
            torch.save(best_model.state_dict(), f"{model_name}_best_model.pth")
            
    # Print best parameters
    print("\nBest parameters found:")
    print(best_params)
    print(f"Validation RMSE with best parameters: {best_rmse:.4f}")
    
    return best_model, best_params

input_dim = X_train.shape[1]

print("Grid Search for Single-Layer Model")
best_single_layer_model, best_single_params = grid_search(SingleLayerNN, param_grid, X_train, y_train, X_val, y_val, "single_layer")

print("\nGrid Search for Multi-Layer Model")
best_multi_layer_model, best_multi_params = grid_search(MultiLayerNN, param_grid, X_train, y_train, X_val, y_val, "multi_layer")






Grid Search for Single-Layer Model

Training with lr=0.01, batch_size=32

Training with lr=0.01, batch_size=64

Training with lr=0.001, batch_size=32

Training with lr=0.001, batch_size=64

Best parameters found:
{'lr': 0.01, 'batch_size': 64}
Validation RMSE with best parameters: 2.5196

Grid Search for Multi-Layer Model

Training with lr=0.01, batch_size=32

Training with lr=0.01, batch_size=64

Training with lr=0.001, batch_size=32

Training with lr=0.001, batch_size=64

Best parameters found:
{'lr': 0.01, 'batch_size': 64}
Validation RMSE with best parameters: 0.8684


In [15]:
best_single_layer_model = SingleLayerNN(input_dim)
best_single_layer_model.load_state_dict(torch.load("single_layer_best_model.pth"))

best_multi_layer_model = MultiLayerNN(input_dim)
best_multi_layer_model.load_state_dict(torch.load("multi_layer_best_model.pth"))

  best_single_layer_model.load_state_dict(torch.load("single_layer_best_model.pth"))
  best_multi_layer_model.load_state_dict(torch.load("multi_layer_best_model.pth"))


<All keys matched successfully>

In [25]:
def final_test_evaluation(model, test_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            outputs = model(batch_features)
            predictions.extend(outputs.numpy())
            actuals.extend(batch_labels.numpy())
    
    test_mse = mean_squared_error(actuals, predictions)
    test_rmse = np.sqrt(test_mse)
    print(f"Test RMSE: {test_rmse:.4f}")

test_loader = DataLoader(test_dataset, batch_size=best_single_params['batch_size'], shuffle=False)
print("\nEvaluating Single-Layer Model on Test Data")
final_test_evaluation(best_single_layer_model, test_loader)

test_loader = DataLoader(test_dataset, batch_size=best_multi_params['batch_size'], shuffle=False)
print("\nEvaluating Multi-Layer Model on Test Data")
final_test_evaluation(best_multi_layer_model, test_loader)


Evaluating Single-Layer Model on Test Data
Test RMSE: 2.5601

Evaluating Multi-Layer Model on Test Data
Test RMSE: 0.9134
