In [None]:
#Task-I

In [1]:
# Uncomment the following lines if you are using Windows!
import findspark
findspark.init()
findspark.find()

import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext

appName = "Big Data Analytics"
master = "local"

# Create Configuration object for Spark.
conf = pyspark.SparkConf()\
    .set('spark.driver.host','127.0.0.1')\
    .setAppName(appName)\
    .setMaster(master)

# Create Spark Context with the new configurations rather than relying on the default one
sc = SparkContext.getOrCreate(conf=conf)

# You need to create SQL Context to conduct some database operations like what we will see later.
sqlContext = SQLContext(sc)

# If you have SQL context, you create the session from the Spark Context
spark = sqlContext.sparkSession.builder.getOrCreate()



In [2]:
from pyspark.sql.functions import lit, monotonically_increasing_id
years = range(15,23)
players = []

for year in years:
    df = spark.read.csv(f"data_folder/players_{year}.csv", header = True, inferSchema = True)
    df = df.withColumn('year',lit(year))
    df = df.withColumn('unique_id',monotonically_increasing_id())
    players.append(df)
    
merged_df = players[0]
for df in players[1:]:
    merged_df = merged_df.union(df)

# merged_df.show(vertical=True)


In [3]:
table_name = "FIFA_Players"
spark.sql("CREATE SCHEMA IF NOT EXISTS fifa")
fully_qualified_table_name = "fifa." + table_name

db_properties={}
db_properties['username']="postgres"
db_properties['password']="gilgamesh"
db_properties['url']= "jdbc:postgresql://localhost:5432/postgres"
db_properties['table']="fully_qualified_table_name"
db_properties['driver']="org.postgresql.Driver"


merged_df.write.format("jdbc")\
.mode("overwrite")\
.option("url", db_properties['url'])\
.option("dbtable", db_properties['table'])\
.option("user", db_properties['username'])\
.option("password", db_properties['password'])\
.option("Driver", db_properties['driver'])\
.save()


In [4]:
#Task-II

In [5]:
from pyspark.sql.functions import col
df_2022 = merged_df.filter(merged_df["year"] == 22)
def x_clubs(x):
    contract_counts = df_2022.filter(df_2022["club_contract_valid_until"] == 2023) \
                                  .groupBy("club_name") \
                                  .count() \
                                  .sort(col("count").desc())

    top_clubs = contract_counts.limit(x)
    return top_clubs

In [6]:
# df_2022.show(vertical=True)
top_x_clubs = x_clubs(10)
top_x_clubs.show()

+--------------------+-----+
|           club_name|count|
+--------------------+-----+
|En Avant de Guingamp|   19|
| Club Atlético Lanús|   17|
|       Lechia Gdańsk|   17|
|            Barnsley|   16|
|        Kasimpaşa SK|   16|
|        Bengaluru FC|   16|
|        FC Barcelona|   15|
|  SV Wehen Wiesbaden|   15|
|          CA Osasuna|   15|
|      Zagłębie Lubin|   15|
+--------------------+-----+



In [7]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

year_counts = merged_df.filter(merged_df["age"] > 27) \
                        .groupby("club_name", "year") \
                        .agg(F.count("*").alias("count"))

def y_clubs(y):

    avg_counts = year_counts.groupby("club_name") \
                                      .agg(F.avg("count").alias("avg_count")) \
                                      .orderBy(F.desc("avg_count"))
    
    avg_counts = avg_counts.withColumn("rank", F.dense_rank().over(Window.orderBy(F.desc("avg_count"))))
    top_y = avg_counts.filter(F.col("rank") <= y).drop("rank")
    
    years_present = year_counts.groupby("club_name").agg(F.countDistinct("year").alias("years_present"))
    top_y = top_y.join(years_present, on="club_name", how="left")

    return top_y

In [8]:
top_y = y_clubs(6)
filtered_df = year_counts.filter(year_counts['club_name'] == 'Shanghai Shenhua FC')
filtered_df.show()
# Show the results
top_y.show()

+-------------------+----+-----+
|          club_name|year|count|
+-------------------+----+-----+
|Shanghai Shenhua FC|  19|   15|
|Shanghai Shenhua FC|  20|   17|
|Shanghai Shenhua FC|  21|   20|
|Shanghai Shenhua FC|  22|   22|
+-------------------+----+-----+

+--------------------+---------+-------------+
|           club_name|avg_count|years_present|
+--------------------+---------+-------------+
|                null|   109.25|         null|
|  Dorados de Sinaloa|     19.0|            1|
| Matsumoto Yamaga FC|     19.0|            1|
| Shanghai Shenhua FC|     18.5|            4|
|          Qingdao FC|     18.0|            2|
|Club Deportivo Jo...|     17.5|            2|
|            Altay SK|     17.0|            1|
|         Guaireña FC|     17.0|            1|
+--------------------+---------+-------------+



In [9]:
from pyspark.sql import functions as F

def most_frequent_position(df):
    filtered_df = df.filter(df['nation_position'].isNotNull())
    position_counts = filtered_df.groupby("year", "nation_position") \
                             .agg(F.count("*").alias("count")) \
                             .orderBy("year", F.desc("count"))

    window_spec = Window.partitionBy("year").orderBy(F.desc("count"))
    ranked_positions = position_counts.withColumn("rank", F.rank().over(window_spec))

    most_frequent_positions = ranked_positions.filter(F.col("rank") == 1).select("year", "nation_position", "count")

    return most_frequent_positions

In [10]:
most = most_frequent_position(merged_df)
most.show()

+----+---------------+-----+
|year|nation_position|count|
+----+---------------+-----+
|  15|            SUB|  564|
|  16|            SUB|  511|
|  17|            SUB|  564|
|  18|            SUB|  600|
|  19|            SUB|  576|
|  20|            SUB|  588|
|  21|            SUB|  588|
|  22|            SUB|  396|
+----+---------------+-----+



In [11]:
#Task-III

In [12]:
#Data Preparation and Feature Engineering

merged_df.printSchema()


root
 |-- sofifa_id: integer (nullable = true)
 |-- player_url: string (nullable = true)
 |-- short_name: string (nullable = true)
 |-- long_name: string (nullable = true)
 |-- player_positions: string (nullable = true)
 |-- overall: integer (nullable = true)
 |-- potential: integer (nullable = true)
 |-- value_eur: double (nullable = true)
 |-- wage_eur: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- dob: date (nullable = true)
 |-- height_cm: integer (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- club_team_id: double (nullable = true)
 |-- club_name: string (nullable = true)
 |-- league_name: string (nullable = true)
 |-- league_level: integer (nullable = true)
 |-- club_position: string (nullable = true)
 |-- club_jersey_number: integer (nullable = true)
 |-- club_loaned_from: string (nullable = true)
 |-- club_joined: date (nullable = true)
 |-- club_contract_valid_until: integer (nullable = true)
 |-- nationality_id: integer (nullable = true)
 

In [13]:
from pyspark.sql.functions import col

casted_df = merged_df \
    .withColumn("mentality_composure", col("mentality_composure").cast("int")) \
    .withColumn("ls", col("ls").cast("int")) \
    .withColumn("st", col("st").cast("int")) \
    .withColumn("rs", col("rs").cast("int")) \
    .withColumn("lw", col("lw").cast("int")) \
    .withColumn("lf", col("lf").cast("int")) \
    .withColumn("cf", col("cf").cast("int")) \
    .withColumn("rw", col("rw").cast("int")) \
    .withColumn("lam", col("lam").cast("int")) \
    .withColumn("cam", col("cam").cast("int")) \
    .withColumn("ram", col("ram").cast("int")) \
    .withColumn("lm", col("lm").cast("int")) \
    .withColumn("lcm", col("lcm").cast("int")) \
    .withColumn("cm", col("cm").cast("int")) \
    .withColumn("rcm", col("rcm").cast("int")) \
    .withColumn("rm", col("rm").cast("int")) \
    .withColumn("lwb", col("lwb").cast("int")) \
    .withColumn("ldm", col("ldm").cast("int")) \
    .withColumn("cdm", col("cdm").cast("int")) \
    .withColumn("rdm", col("rdm").cast("int")) \
    .withColumn("rwb", col("rwb").cast("int")) \
    .withColumn("lb", col("lb").cast("int")) \
    .withColumn("cb", col("cb").cast("int")) \
    .withColumn("rcb", col("rcb").cast("int")) \
    .withColumn("rb", col("rb").cast("int")) \
    .withColumn("gk", col("gk").cast("int"))


In [14]:
droped_df = casted_df.drop("sofifa_id")\
                    .drop("player_url")\
                    .drop("short_name")\
                    .drop("long_name")\
                    .drop("player_positions")\
                    .drop("potential")\
                    .drop("dob")\
                    .drop("club_team_id")\
                    .drop("club_name")\
                    .drop("club_position")\
                    .drop("club_jersey_number")\
                    .drop("club_loaned_from")\
                    .drop("club_joined")\
                    .drop("club_contract_valid_until")\
                    .drop("nationality_name")\
                    .drop("nation_position")\
                    .drop("preferred_foot")\
                    .drop("release_clause_eur")\
                    .drop("player_tags")\
                    .drop("player_traits")\
                    .drop("mentality_composure")\
                    .drop("player_face_url")\
                    .drop("club_logo_url")\
                    .drop("club_flag_url")\
                    .drop("nation_logo_url")\
                    .drop("nation_flag_url")\
                    .drop("year")\
                    .drop("unique_id")

In [15]:
from pyspark.sql.functions import *

null_counts_plays_df = droped_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) \
                        for c in droped_df.columns])

null_counts_plays_df.show(truncate=False, vertical=True)

-RECORD 0-----------------------------
 overall                     | 0      
 value_eur                   | 1897   
 wage_eur                    | 1622   
 age                         | 0      
 height_cm                   | 0      
 weight_kg                   | 0      
 league_name                 | 1630   
 league_level                | 2015   
 nationality_id              | 0      
 nation_team_id              | 133635 
 nation_jersey_number        | 133635 
 weak_foot                   | 0      
 skill_moves                 | 0      
 international_reputation    | 0      
 work_rate                   | 0      
 body_type                   | 0      
 real_face                   | 0      
 pace                        | 15791  
 shooting                    | 15791  
 passing                     | 15791  
 dribbling                   | 15791  
 defending                   | 15791  
 physic                      | 15791  
 attacking_crossing          | 0      
 attacking_finishing     

In [16]:
droped_NA_df = droped_df.drop("nation_team_id")\
                    .drop("nation_jersey_number")\
                    .drop("goalkeeping_speed")\
                    .drop("ls")\
                    .drop("st")\
                    .drop("rs")\
                    .drop("lam")\
                    .drop("cam")\
                    .drop("ram")\
                    .drop("lm")\
                    .drop("lcm")\
                    .drop("cm")\
                    .drop("rcm")\
                    .drop("rm")\
                    .drop("lwb")\
                    .drop("ldm")\
                    .drop("cdm")\
                    .drop("rdm")\
                    .drop("rwb")\
                    .drop("lb")\
                    .drop("cb")\
                    .drop("rcb")\
                    .drop("cb")\
                    .drop("gk")\
                    .drop("league_name")


In [17]:
casted_types_df_with_na_dropped_rows = droped_NA_df.na.drop()

In [18]:
casted_types_df_with_na_dropped_rows.show(vertical= True)

-RECORD 0---------------------------------------
 overall                     | 84               
 value_eur                   | 2.05E7           
 wage_eur                    | 170000.0         
 age                         | 27               
 height_cm                   | 188              
 weight_kg                   | 88               
 league_level                | 1                
 nationality_id              | 129              
 weak_foot                   | 3                
 skill_moves                 | 2                
 international_reputation    | 2                
 work_rate                   | High/High        
 body_type                   | Normal (185+)    
 real_face                   | No               
 pace                        | 75               
 shooting                    | 45               
 passing                     | 54               
 dribbling                   | 64               
 defending                   | 88               
 physic             

In [19]:
distinct_df = (casted_types_df_with_na_dropped_rows.distinct())

In [20]:
numeric_features = [feature[0] for feature in distinct_df.dtypes if feature[1] in ('int','double')]
#numeric_features

In [54]:
# import matplotlib.pyplot as plt
# converted_data = distinct_df[numeric_features].toPandas()

# figure = plt.boxplot(converted_data)

In [None]:
spotted_data = distinct_df[numeric_features[36:38]].toPandas()
figure_subset = plt.boxplot(spotted_data)

In [21]:
from functools import reduce

def column_add(a,b):
     return  a.__add__(b)
    
def find_outliers(df):
    # Identifying the numerical columns in a spark dataframe
    numeric_columns = [column[0] for column in df.dtypes if column[1]=='int']

    # Using the `for` loop to create new columns by identifying the outliers for each feature
    for column in numeric_columns:

        less_Q1 = 'less_Q1_{}'.format(column)
        more_Q3 = 'more_Q3_{}'.format(column)
        Q1 = 'Q1_{}'.format(column)
        Q3 = 'Q3_{}'.format(column)

        # Q1 : First Quartile ., Q3 : Third Quartile
        Q1 = df.approxQuantile(column,[0.25],relativeError=0)
        Q3 = df.approxQuantile(column,[0.75],relativeError=0)
        
        # IQR : Inter Quantile Range
        # We need to define the index [0], as Q1 & Q3 are a set of lists., to perform a mathematical operation
        # Q1 & Q3 are defined seperately so as to have a clear indication on First Quantile & 3rd Quantile
        IQR = Q3[0] - Q1[0]
        
        #selecting the data, with -1.5*IQR to + 1.5*IQR., where param = 1.5 default value
        less_Q1 =  Q1[0] - 1.5*IQR
        more_Q3 =  Q3[0] + 1.5*IQR
        
        isOutlierCol = 'is_outlier_{}'.format(column)
        
        df = df.withColumn(isOutlierCol,when((df[column] > more_Q3) | (df[column] < less_Q1), 1).otherwise(0))
    

    # Selecting the specific columns which we have added above, to check if there are any outliers
    selected_columns = [column for column in df.columns if column.startswith("is_outlier")]
    # Adding all the outlier columns into a new colum "total_outliers", to see the total number of outliers
    df = df.withColumn('total_outliers',reduce(column_add, ( df[col] for col in  selected_columns)))

    # Dropping the extra columns created above, just to create nice dataframe., without extra columns
    df = df.drop(*[column for column in df.columns if column.startswith("is_outlier")])

    return df

In [22]:
numeric_columns = [column[0] for column in distinct_df.dtypes if column[1] in ('int','double')]
distinct_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in numeric_columns]).show(1, vertical=True)


-RECORD 0--------------------------
 overall                     | 0   
 value_eur                   | 0   
 wage_eur                    | 0   
 age                         | 0   
 height_cm                   | 0   
 weight_kg                   | 0   
 league_level                | 0   
 nationality_id              | 0   
 weak_foot                   | 0   
 skill_moves                 | 0   
 international_reputation    | 0   
 pace                        | 0   
 shooting                    | 0   
 passing                     | 0   
 dribbling                   | 0   
 defending                   | 0   
 physic                      | 0   
 attacking_crossing          | 0   
 attacking_finishing         | 0   
 attacking_heading_accuracy  | 0   
 attacking_short_passing     | 0   
 attacking_volleys           | 0   
 skill_dribbling             | 0   
 skill_curve                 | 0   
 skill_fk_accuracy           | 0   
 skill_long_passing          | 0   
 skill_ball_control         

In [58]:
# df_with_outlier_handling = find_outliers(distinct_df)
# df_with_outlier_handling.show(1, vertical=True)

In [None]:
# df_with_outlier_handling.groupby("total_outliers").count().show()

In [None]:
# df_with_substituted_na_and_outliers = df_with_outlier_handling.\
#         filter(df_with_outlier_handling['total_Outliers']<=4)
# print(df_with_substituted_na_and_outliers.count())

In [23]:
correlation_matrix = distinct_df.toPandas().corr()
print(correlation_matrix)

                              overall  value_eur  wage_eur       age  \
overall                      1.000000   0.659782  0.746309  0.445287   
value_eur                    0.659782   1.000000  0.876754  0.113020   
wage_eur                     0.746309   0.876754  1.000000  0.225713   
age                          0.445287   0.113020  0.225713  1.000000   
height_cm                    0.078584   0.026333  0.045127  0.087859   
weight_kg                    0.148160   0.050939  0.083823  0.211989   
league_level                -0.243805  -0.170542 -0.220649 -0.044850   
nationality_id              -0.023993  -0.034303 -0.019413  0.059548   
weak_foot                    0.212258   0.147824  0.155376  0.094469   
skill_moves                  0.276299   0.257358  0.241259  0.020486   
international_reputation     0.413802   0.471604  0.547154  0.283899   
pace                         0.218850   0.212947  0.193639 -0.205547   
shooting                     0.452281   0.344557  0.361704  0.17

In [24]:
df_with_handled_correlations = distinct_df\
                .drop("lf","cf","rw")

In [25]:
from pyspark.sql.functions import when

df_with_handled_binary = distinct_df.withColumn("real_face_encoded", when(df_with_handled_correlations["real_face"] == "Yes", 1).otherwise(0)
)

df_with_handled_binary.select("real_face", "real_face_encoded").distinct().show()

+---------+-----------------+
|real_face|real_face_encoded|
+---------+-----------------+
|      Yes|                1|
|       No|                0|
+---------+-----------------+



In [26]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

stage_1 = StringIndexer(inputCol= 'work_rate', outputCol= 'work_rate_index')
stage_2 = StringIndexer(inputCol= 'body_type', outputCol= 'body_type_index')


stage_3= OneHotEncoder(inputCols=["work_rate_index","body_type_index",], 
                        outputCols=['work_rate_encoded','body_type_encoded',])


pipeline = Pipeline(stages=[stage_1, stage_2, stage_3])


pipeline_model = pipeline.fit(df_with_handled_binary)
df_encoded = pipeline_model.transform(df_with_handled_binary)

In [27]:

print(df_encoded.select("work_rate").distinct().count())

df_encoded.select("work_rate","work_rate_index","work_rate_encoded")\
                .distinct().toPandas()

9


Unnamed: 0,work_rate,work_rate_index,work_rate_encoded
0,Medium/High,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
1,Low/High,7.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0)"
2,High/Medium,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,Medium/Medium,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,Low/Low,8.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
5,Low/Medium,6.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)"
6,High/High,5.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0)"
7,Medium/Low,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)"
8,High/Low,4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)"


In [28]:
feature_list = df_encoded.drop("overall","real_face","work_rate","work_rate_index",
                                        "body_type","body_type_index","rf","lcb").columns
print(len(feature_list))

['value_eur', 'wage_eur', 'age', 'height_cm', 'weight_kg', 'league_level', 'nationality_id', 'weak_foot', 'skill_moves', 'international_reputation', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'defending_marking_awareness', 'defending_standing_tackle', 'defending_sliding_tackle', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'lw', 'lf', 'cf', 'rw', 'rb', '

In [29]:
from pyspark.ml.feature import VectorAssembler

vector_assembler = VectorAssembler(
    inputCols=feature_list, 
    outputCol="vectorized_features")

df_with_assembled_features = vector_assembler.transform(df_encoded)
# df_encoded.show(vertical = True)

In [45]:
import torch
pandas_df =  df_with_assembled_features.toPandas()
features = torch.tensor(pandas_df['vectorized_features'].tolist())
labels = torch.tensor(pandas_df['overall'].tolist())
print(type(features))
print(type(labels))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [35]:
print(len(feature_list))

57


In [None]:
#PySpark
#RandomForestRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

rf = RandomForestRegressor(featuresCol='vectorized_features', labelCol='overall')

paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 30]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .addGrid(rf.maxBins, [32, 64, 128]) \
    .build()

evaluator = RegressionEvaluator(labelCol="overall", predictionCol="prediction", metricName="rmse")

cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

cvModel = cv.fit(df_with_assembled_features)

bestModel = cvModel.bestModel

predictions = bestModel.transform(df_with_assembled_features)

rmse = evaluator.evaluate(predictions)
print(f"RMSE = {rmse}")
accuracy_test = (predictions.filter(predictions.outcome == predictions.prediction)
    .count() / float(predictions.count()))
print(f"Train Accuracy : {np.round(accuracy_train*100,2)}%")

In [33]:
#PySpark
#LinearRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
lr = LinearRegression(featuresCol='vectorized_features', labelCol='overall')
paramGrid = ParamGridBuilder() \
    .addGrid(lr.maxIter, [10, 50, 100]) \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()
evaluator = RegressionEvaluator(labelCol="overall", predictionCol="prediction", metricName="rmse")

crossval = CrossValidator(estimator=lr, 
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator, 
                          numFolds=5)
cvModel = crossval.fit(df_with_assembled_features)
bestModel = cvModel.bestModel
predictions = bestModel.transform(df_with_assembled_features)
rmse = evaluator.evaluate(predictions)
print("(RMSE) = %g" % rmse)
accuracy_test = (predictions.filter(predictions.outcome == predictions.prediction)
    .count() / float(predictions.count()))
print(f"Train Accuracy : {np.round(accuracy_train*100,2)}%")

KeyboardInterrupt: 

In [49]:
#PyTorch
#MLP
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features.float()
        self.labels = labels.float()

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]


dataset = CustomDataset(features, labels)

from torch.utils.data import random_split
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        return self.layers(x)

model = MLP(72, 10, 1)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 500
# Training loop
for epoch in range(num_epochs):
    for inputs, targets in train_loader:

        outputs = model(inputs)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

test_loss = 0.0
with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        test_loss += loss.item()

rmse = math.sqrt(test_loss / len(test_loader))
print(f"Test RMSE: {rmse}")    
accuracy_test = (predictions.filter(predictions.outcome == predictions.prediction)
    .count() / float(predictions.count()))
print(f"Train Accuracy : {np.round(accuracy_train*100,2)}%")

Test RMSE: 7.83687500463911


NameError: name 'predictions' is not defined

In [54]:
#PyTorch
#CNN
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np  # If needed for np.round

class CNNRegressor(nn.Module):
    def __init__(self, input_size):
        super(CNNRegressor, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(32, 64, 3, 1, 1)
        
        # Calculate the size of the pooled feature maps
        self.feature_size = self._get_conv_output(input_size)

        self.fc1 = nn.Linear(self.feature_size, 512)
        self.fc2 = nn.Linear(512, 1)

    def _get_conv_output(self, shape):
        with torch.no_grad():
            input = torch.autograd.Variable(torch.rand(1, *shape))
            output = self.pool(F.relu(self.conv1(input)))
            output = self.pool(F.relu(self.conv2(output)))
            return int(np.prod(output.size()))

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, self.feature_size)  # Flatten the tensor
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Example input size (channels, height, width)
model = CNNRegressor((1, 64, 64))  # Adjust as per your input

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Assume num_epochs, train_loader, and test_loader are defined

# Training loop
for epoch in range(num_epochs):
    for data, targets in train_loader:
        # Reshape data to have a single channel
        data = data.view(data.size(0), 1, 72, 1)  # Adjust the dimensions as necessary

        outputs = model(data)
        loss = criterion(outputs, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Test the model
model.eval()  # Evaluation mode
test_loss = 0
with torch.no_grad():
    for data, targets in test_loader:
        outputs = model(data)
        test_loss += criterion(outputs, targets).item()

print(f'Average Test Loss: {test_loss / len(test_loader)}')


RuntimeError: Given input size: (32x72x1). Calculated output size: (32x36x0). Output size is too small