In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType  

appName = "FifaProject"
master = "local" 
spark = SparkSession.builder \
    .appName(appName) \
    .master(master) \
    .config("spark.driver.host", "127.0.0.1") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate() 
#.config("spark.jars", "/Users/kozasound/Desktop/14_763_AISTC/postgresql-42.6.2.jar") \
    
print("Spark session started successfully with PostgreSQL driver")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/14 20:55:34 INFO SparkEnv: Registering MapOutputTracker
24/11/14 20:55:34 INFO SparkEnv: Registering BlockManagerMaster
24/11/14 20:55:34 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/11/14 20:55:34 INFO SparkEnv: Registering OutputCommitCoordinator


Spark session started successfully with PostgreSQL driver


Task-I: Build and populate necessary tables (30% of course project
grade)
• Ingest the data from all years (Male: 2015-2022 and Female: 2016-2022) into
one Postgres Database table.
o Conduct any column name changes to ensure data from various years are
properly aligned in the correct columns in your DB table.
• Add a new column for the year. Also, ensure every record can be uniquely
identified in the database table.
• Your tables should be created in schema with the name “fifa”.
• In your ReadMe.md, add a description of the features in the dataset.
• In your ReadMe.md file, comment on the benefit of using PostgreSQL DB table
compared to a NoSQL Database in this case.

In [4]:
from pyspark.sql.functions import lit, when, col
from pyspark.sql.types import BooleanType, DoubleType, IntegerType


bucket_name = 'dataproc-staging-us-central1-291694249410-7gsa4pxg'

male_files = [f'gs://{bucket_name}/notebooks/jupyter/players_{year}.csv' for year in range(15, 23)]
male_years = [2000 + year for year in range(15, 23)]
female_files = [f'gs://{bucket_name}/notebooks/jupyter/female_players_{year}.csv' for year in range(16, 23)]
female_years = [2000 + year for year in range(16, 23)]
files = female_files + male_files
years = female_years + male_years
genders = ['Female'] * len(female_files) + ['Male'] * len(male_files)

df_list = []

for file, year, gender in zip(files, years, genders):
    try:
        temp_df = spark.read.csv(file, header=True, inferSchema=True)
        temp_df = temp_df.withColumn("year", lit(year)).withColumn("sex", lit(gender))
        temp_df = temp_df.select([when(col(c) == "", None).otherwise(col(c)).alias(c) for c in temp_df.columns])
        
        columns_to_cast = {
            'value_eur': DoubleType(),
            'wage_eur': DoubleType(),
            'club_team_id': IntegerType(),
            'league_level': IntegerType(),
            'club_jersey_number': IntegerType(),
        }
        
        for column, col_type in columns_to_cast.items():
            if column in temp_df.columns:
                temp_df = temp_df.withColumn(column, col(column).cast(col_type))
        
        if 'real_face' in temp_df.columns:
            temp_df = temp_df.withColumn("real_face", when(col("real_face") == "Yes", True).when(col("real_face") == "No", False).otherwise(None).cast(BooleanType()))
        
        df_list.append(temp_df)
        
        print(f"Data for year {year}, gender: {gender} loaded into DataFrame.")
    except Exception as e:
        print(f"An error occurred while processing year {year}, gender: {gender}: {e}")

#df = df_list[0]  
#for temp_df in df_list[1:]:
    #df = df.unionByName(temp_df, allowMissingColumns=True)
#df.show()


Data for year 2016, gender: Female loaded into DataFrame.
Data for year 2017, gender: Female loaded into DataFrame.
Data for year 2018, gender: Female loaded into DataFrame.
Data for year 2019, gender: Female loaded into DataFrame.
Data for year 2020, gender: Female loaded into DataFrame.
Data for year 2021, gender: Female loaded into DataFrame.
Data for year 2022, gender: Female loaded into DataFrame.
Data for year 2015, gender: Male loaded into DataFrame.
Data for year 2016, gender: Male loaded into DataFrame.
Data for year 2017, gender: Male loaded into DataFrame.
Data for year 2018, gender: Male loaded into DataFrame.
Data for year 2019, gender: Male loaded into DataFrame.
Data for year 2020, gender: Male loaded into DataFrame.
Data for year 2021, gender: Male loaded into DataFrame.
Data for year 2022, gender: Male loaded into DataFrame.


24/11/14 21:01:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---------+--------------------+-------------+--------------------+----------------+-------+---------+---------+--------+---+----------+---------+---------+------------+---------+-----------+------------+-------------+------------------+----------------+-----------+-------------------------+--------------+----------------+--------------+---------------+--------------------+--------------+---------+-----------+------------------------+-------------+----------------+---------+------------------+--------------------+--------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+

Task-II: Conduct analytics on your dataset (20% of course project
grade)
Develop Python functions that run Spark to answer the following questions (given that x,
y and z) are user-entered parameters. Core analysis should be conducted via Spark
and data should be ingested from Postgres database.
• In Year X, what were the Y clubs that had the highest number of players with
contracts ending in year Z (or after)?
o X is a year between (2015 and 2022, inclusively).
o Y is a positive integer.
o Z is a year that can hold the value of 2023 or a year after it.
• In sports, maturity and energy of teams depend on the average age of team
players (among other factors). Therefore, it’s important to have a function that
can find clubs with such features.
o List the X clubs with the highest (or lowest) average player age for a given
year Y.
▪ X represents a positive integer, but you should handle a scenario if
X is not positive value.
▪ Y represents a year between 2015 and 2022 inclusively.
▪ Provide the user with the ability to choose if they want the highest
average age or the lowest average age.
▪ Make sure to handle this scenario as well: if the user requests 5
clubs with highest averages but there are 3 clubs that share the
same count at rank number 5, please include all of them in your
output
• What is the most popular nationality in the dataset for each year? (i.e. display the
most frequent nation for 2015, 2016, etc.).

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, max as max_, avg, lit, when
from pyspark.sql.types import BooleanType, DoubleType, IntegerType


# Bullet Point 1: 
def top_clubs_by_contracts(df, x_year, y_clubs, z_contract):
    if x_year < 2015 or x_year > 2022:
        print("Invalid year. Please enter a year between 2015 and 2022.")
        return
    if y_clubs <= 0:
        print("Invalid number of clubs. Please enter a positive integer.")
        return
    if z_contract < 2023:
        print("Invalid contract year. Please enter a year 2023 or later.")
        return

    filt_df = df.filter((col("year") == x_year) & (col("club_contract_valid_until") >= z_contract) & (col("sex") == "Male"))
    club_counts = filt_df.groupBy("club_name") \
                         .agg(count("sofifa_id").alias("player_count")) \
                         .orderBy(col("player_count").desc()) \
                         .limit(y_clubs)
    club_counts.show()

top_clubs_by_contracts(df, 2020, 5, 2024)

# Bullet Point 2: 
def clubs_by_avg_age(df, y_year, x_clubs, order='highest'):
    if y_year < 2015 or y_year > 2022:
        print("Invalid year. Please enter a year between 2015 and 2022.")
        return
    if x_clubs <= 0:
        print("Invalid number of clubs. Please enter a positive integer.")
        return
    if order not in ['highest', 'lowest']:
        print("Invalid order. Please choose 'highest' or 'lowest'.")
        return

    filt_df = df.filter((col("year") == y_year) & (col("sex") == "Male"))
    avg_age_df = filt_df.groupBy("club_name").agg(avg("age").alias("avg_age"))
    
    if order == 'highest':
        sorted_df = avg_age_df.orderBy(col("avg_age").desc())
    else:
        sorted_df = avg_age_df.orderBy(col("avg_age").asc())
    
    top_df = sorted_df.limit(x_clubs)
    max_age = top_df.select("avg_age").collect()[-1][0] if top_df.count() == x_clubs else None
    
    res_df = sorted_df.filter(col("avg_age") >= max_age) if max_age else top_df
    res_df.show()

clubs_by_avg_age(df, 2020, 5, order='lowest')

# Bullet Point 3: 
def most_popular_nationality(df):
    nat_counts = df.filter(col("sex") == "Male").groupBy("year", "nationality_name").agg(count("*").alias("count"))
    max_per_year = nat_counts.groupBy("year").agg(max_("count").alias("max_count"))
    most_popular = nat_counts.alias("nc").join(
        max_per_year.alias("mpy"),
        (col("nc.year") == col("mpy.year")) & (col("nc.count") == col("mpy.max_count"))
    ).select(col("nc.year"), col("nc.nationality_name"), col("nc.count")).orderBy("nc.year")
    most_popular.show()

most_popular_nationality(df)

+-------------------+------------+
|          club_name|player_count|
+-------------------+------------+
|   Deportes Iquique|          12|
|Patriotas Boyacá FC|          12|
|          Al Ain FC|          11|
|     Atlético Huila|          11|
|  Alianza Petrolera|          11|
+-------------------+------------+

+--------------------+------------------+
|           club_name|           avg_age|
+--------------------+------------------+
|            Barnsley|21.566666666666666|
|          Godoy Cruz|21.607142857142858|
|     Fortuna Sittard|             21.68|
|       SC Heerenveen|21.695652173913043|
|Futebol Clube de ...|22.291666666666668|
|       AFC Wimbledon|22.392857142857142|
|FC Würzburger Kic...|22.428571428571427|
|          RB Leipzig|22.454545454545453|
|        Silkeborg IF| 22.48148148148148|
|           Brentford|22.533333333333335|
|    Waasland-Beveren| 22.53846153846154|
|            OGC Nice|22.551724137931036|
|            KRC Genk|22.571428571428573|
|           



+----+----------------+-----+
|year|nationality_name|count|
+----+----------------+-----+
|2015|         England| 1627|
|2016|         England| 1519|
|2017|         England| 1627|
|2018|         England| 1633|
|2019|         England| 1625|
|2020|         England| 1670|
|2021|         England| 1685|
|2022|         England| 1719|
+----+----------------+-----+



                                                                                

Task- III Machine Learning Modeling (30% of course project grade)
• Build a machine learning model that can predict the overall value for each player
based on their skillsets.
o Use proper feature engineering principles (including data cleaning and
data engineering)
o Build two versions: one in Spark and the other one in PyTorch or
Tensorflow.
o For each version, choose two different classifiers/regressors. You can use
the same two choices for Spark and PyTorch/Tensorflow, and neural
networks of substantial different structures (deep vs shallow, MLP vs
CNN) count as two different classifiers/regressors. For each
classifier/regressor, identify a few tunable parameters for your model and
tune the parameters (using proper metric(s)). Then, run the best model
(after tuning) on the test data set and record the test accuracy.
o In your ReadMe file, explain why you chose the classifiers/regressors and
provide comments on the impact of the tunable parameters on the
accuracy. Also, compare the selected models.

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, max as max_, avg, lit, when, regexp_extract
from pyspark.sql.types import BooleanType, DoubleType, IntegerType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt

task3_male_data = df.filter((col("sex") == "Male"))
task3_male_data = task3_male_data.drop('dob')

row_count = task3_male_data.count()
print(f"Total number of rows with 'sex' == 'Male': {row_count}")

for col_name in task3_male_data.columns:
    if '.' in col_name:
        new_col_name = col_name.replace('.', '_')
        task3_male_data = task3_male_data.withColumnRenamed(col_name, new_col_name)

threshold = 0.1 * task3_male_data.count()
dropped_columns = []

for col_name in task3_male_data.columns:
    null_count = task3_male_data.filter(col(col_name).isNull()).count()
    if null_count > threshold:
        task3_male_data = task3_male_data.drop(col_name)
        dropped_columns.append(col_name)

task3_male_data = task3_male_data.na.drop()

def extract_base_value(col_name):
    return regexp_extract(col(col_name), r"(\d+)", 0).cast('int')

columns_to_process = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 
                      'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 
                      'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']

for col_name in columns_to_process:
    if col_name in task3_male_data.columns:
        task3_male_data = task3_male_data.withColumn(col_name, extract_base_value(col_name))

columns_to_keep = [col for col in task3_male_data.columns if col not in dropped_columns]
task3_male_data = task3_male_data.select(columns_to_keep)

numerical_cols = []
categorical_cols = []
binary_cols = []
nominal_cols = []

for col_name in task3_male_data.columns:
    if dict(task3_male_data.dtypes)[col_name] in ['int', 'double']:
        numerical_cols.append(col_name)
    else:
        categorical_cols.append(col_name)

for col_name in categorical_cols:
    unique_values = task3_male_data.select(col_name).distinct().count()
    if unique_values == 2:
        binary_cols.append(col_name)
    elif unique_values > 2:
        nominal_cols.append(col_name)

for col_name in binary_cols:
    task3_male_data = task3_male_data.withColumn(col_name, 
        when(col(col_name) == 'Right', 1).when(col(col_name) == 'Yes', 1).otherwise(0))

indexers = []
encoders = []

for col_name in nominal_cols:
    indexer = StringIndexer(inputCol=col_name, outputCol=col_name + "_index")
    encoder = OneHotEncoder(inputCol=col_name + "_index", outputCol=col_name + "_onehot")
    indexers.append(indexer)
    encoders.append(encoder)

pipeline = Pipeline(stages=indexers + encoders)
task3_male_data = pipeline.fit(task3_male_data).transform(task3_male_data)

df_pandas = task3_male_data.select(numerical_cols).toPandas()
correlation_matrix = df_pandas.corr()
threshold = 0.95
to_drop = set()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            colname = correlation_matrix.columns[i]
            to_drop.add(colname)

task3_male_data = task3_male_data.drop(*list(to_drop))

task3_male_data = task3_male_data.drop(*['short_name', 'player_positions', 'work_rate','league_name','nationality_name','club_position', 
                                         'player_url', 'long_name', 'club_name', 'club_joined', 'club_contract_valid_until', 
                                         'body_type', 'player_face_url', 'club_logo_url', 'club_flag_url', 'nation_flag_url', 'sex'])

feature_columns = [col for col in task3_male_data.columns if col != 'overall' and col != 'sofifa_id' and 'onehot' not in col]
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="vectorized_features")
df_with_assembled_features = vector_assembler.transform(task3_male_data)

scaler = StandardScaler(inputCol="vectorized_features", outputCol="scaled_features", withStd=True, withMean=True)
scaler_pipeline = Pipeline(stages=[scaler])
df_maleFIFA_scaled = scaler_pipeline.fit(df_with_assembled_features).transform(df_with_assembled_features)

df_maleFIFA_scaled.select("scaled_features").show(5, truncate=False)

                                                                                

Total number of rows with 'sex' == 'Male': 142079


24/11/14 22:21:15 WARN DAGScheduler: Broadcasting large task binary with size 1077.9 KiB
24/11/14 22:21:36 WARN DAGScheduler: Broadcasting large task binary with size 23.6 MiB
24/11/14 22:21:47 WARN DAGScheduler: Broadcasting large task binary with size 23.3 MiB
24/11/14 22:21:50 WARN DAGScheduler: Broadcasting large task binary with size 23.4 MiB


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[Stage 913:>                                                        (0 + 1) / 1]                                                                                

In [11]:
from pyspark.ml.regression import RandomForestRegressor, LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, abs, when


train_data, test_data = df_maleFIFA_scaled.select("sofifa_id", "scaled_features", "overall").randomSplit([0.8, 0.2], seed=42)


rf = RandomForestRegressor(featuresCol="scaled_features", labelCol="overall")
lr = LinearRegression(featuresCol="scaled_features", labelCol="overall")


rf_model = rf.fit(train_data)
lr_model = lr.fit(train_data)


rf_train_predictions = rf_model.transform(train_data)
rf_test_predictions = rf_model.transform(test_data)
lr_train_predictions = lr_model.transform(train_data)
lr_test_predictions = lr_model.transform(test_data)


evaluator = RegressionEvaluator(labelCol="overall", predictionCol="prediction", metricName="rmse")


rf_rmse_train = evaluator.evaluate(rf_train_predictions)
rf_rmse_test = evaluator.evaluate(rf_test_predictions)
lr_rmse_train = evaluator.evaluate(lr_train_predictions)
lr_rmse_test = evaluator.evaluate(lr_test_predictions)

print(f"Random Forest Train RMSE: {rf_rmse_train}")
print(f"Random Forest Test RMSE: {rf_rmse_test}")
print(f"Linear Regression Train RMSE: {lr_rmse_train}")
print(f"Linear Regression Test RMSE: {lr_rmse_test}")


def calculate_accuracy(predictions, threshold=0.05):
    correct_predictions = predictions.withColumn(
        "accuracy",
        when(abs(col("prediction") - col("overall")) / col("overall") <= threshold, 1).otherwise(0)
    )
    accuracy = correct_predictions.select("accuracy").agg({"accuracy": "avg"}).collect()[0][0]
    return accuracy * 100  


rf_accuracy_train = calculate_accuracy(rf_train_predictions)
rf_accuracy_test = calculate_accuracy(rf_test_predictions)
lr_accuracy_train = calculate_accuracy(lr_train_predictions)
lr_accuracy_test = calculate_accuracy(lr_test_predictions)

print(f"Random Forest Train Accuracy: {rf_accuracy_train}%")
print(f"Random Forest Test Accuracy: {rf_accuracy_test}%")
print(f"Linear Regression Train Accuracy: {lr_accuracy_train}%")
print(f"Linear Regression Test Accuracy: {lr_accuracy_test}%")


24/11/14 22:22:21 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/11/14 22:22:23 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/11/14 22:22:35 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/11/14 22:22:45 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/11/14 22:22:56 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/11/14 22:22:59 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/11/14 22:23:02 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/11/14 22:23:05 WARN DAGScheduler: Broadcasting large task binary with size 23.8 MiB
24/11/14 22:23:13 WARN Instrumentation: [b4f3e319] regParam is zero, which might cause numerical instability and overfitting.
24/11/14 22:23:14 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/11/14 22:23:23 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/1

Random Forest Train RMSE: 1.672286588699054
Random Forest Test RMSE: 1.6942322616753598
Linear Regression Train RMSE: 1.8317847602055326
Linear Regression Test RMSE: 1.8426220175842092


24/11/14 22:24:39 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/11/14 22:24:53 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/11/14 22:25:07 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB
24/11/14 22:25:22 WARN DAGScheduler: Broadcasting large task binary with size 23.7 MiB

Random Forest Train Accuracy: 93.41791852111942%
Random Forest Test Accuracy: 93.40916053921569%
Linear Regression Train Accuracy: 91.71451075661672%
Linear Regression Test Accuracy: 91.5594362745098%


                                                                                

In [None]:
!pip install torch

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

df_pandas = df_maleFIFA_scaled.select("scaled_features", "overall").toPandas()
features = np.array(df_pandas['scaled_features'].to_list())
labels = np.array(df_pandas['overall'])

X = torch.tensor(features, dtype=torch.float32)
y = torch.tensor(labels, dtype=torch.float32)

train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

class MLPModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=128):
        super(MLPModel, self).__init__()
        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.output = nn.Linear(hidden_dim, 1)
        
    def forward(self, x):
        x = self.relu(self.hidden(x))
        x = self.output(x)
        return x

model = MLPModel(input_dim=X_train.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss}")


test_loader = DataLoader(test_data, batch_size=64, shuffle=False)
def evaluate_model(model, data_loader, criterion):
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for inputs, targets in data_loader:
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            total_loss += loss.item()
            all_preds.append(outputs.squeeze())
            all_targets.append(targets)
    all_preds = torch.cat(all_preds)
    all_targets = torch.cat(all_targets)
    rmse = torch.sqrt(torch.tensor(total_loss / len(data_loader)))
    correct = (torch.abs(all_preds - all_targets) / all_targets <= 0.05).float()
    accuracy = correct.mean().item() * 100
    return rmse.item(), accuracy

test_rmse, test_accuracy = evaluate_model(model, test_loader, criterion)
print(f"Test RMSE: {test_rmse}")
print(f"Test Accuracy: {test_accuracy}%")


ModuleNotFoundError: No module named 'torch'

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

class CNNModel(nn.Module):
    def __init__(self, input_dim, conv1_out=32, conv2_out=64):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=conv1_out, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=conv1_out, out_channels=conv2_out, kernel_size=3)
        self.fc1 = nn.Linear(conv2_out * (input_dim - 4), 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 1)
        
    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

df_pandas = df_maleFIFA_scaled.select("scaled_features", "overall").toPandas()
features = np.array(df_pandas['scaled_features'].to_list())
labels = np.array(df_pandas['overall'])

X = torch.tensor(features, dtype=torch.float32)
y = torch.tensor(labels, dtype=torch.float32)

train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

model = CNNModel(input_dim=X_train.shape[1])

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss}")

def evaluate_model(model, data_loader, criterion, threshold=0.1):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for inputs, targets in data_loader:
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            total_loss += loss.item()
            predictions = outputs.squeeze()
            correct_predictions = (torch.abs(predictions - targets) / targets <= threshold).float()
            total_correct += correct_predictions.sum().item()
            total_samples += len(targets)

    avg_loss = total_loss / len(data_loader)
    rmse = np.sqrt(avg_loss)
    accuracy = (total_correct / total_samples) * 100

    return rmse, accuracy

test_rmse, test_accuracy = evaluate_model(model, test_loader, criterion)
print(f"Test RMSE: {test_rmse}")
print(f"Test Accuracy: {test_accuracy}%")

train_rmse, train_accuracy = evaluate_model(model, train_loader, criterion)
print(f"Train RMSE: {train_rmse}")
print(f"Train Accuracy: {train_accuracy}%")
