In [1]:
import pandas as pd
import statsmodels.formula.api as sm
Steam = pd.read_csv('https://raw.githubusercontent.com/TomdeFluiter/StudentsPerformance/refs/heads/main/Steam.csv')
Steam.head()

Unnamed: 0,name,releaseDate,copiesSold,price,revenue,avgPlaytime,reviewScore,publisherClass,publishers,developers,steamId
0,WWE 2K24,07-03-2024,165301,99.99,8055097.0,42.36514,71,AAA,2K,Visual Concepts,2315690
1,EARTH DEFENSE FORCE 6,25-07-2024,159806,59.99,7882151.0,29.651061,57,Indie,D3PUBLISHER,SANDLOT,2291060
2,Sins of a Solar Empire II,15-08-2024,214192,49.99,7815247.0,12.452593,88,Indie,Stardock Entertainment,"Ironclad Games Corporation,Stardock Entertainment",1575940
3,Legend of Mortal,14-06-2024,440998,19.99,7756399.0,24.797817,76,Indie,"Paras Games,Obb Studio Inc.",Obb Studio Inc.,1859910
4,Shin Megami Tensei V: Vengeance,13-06-2024,141306,59.99,7629252.0,34.258496,96,AA,SEGA,ATLUS,1875830


In [2]:
Steam2 = Steam.drop(['name', 'publishers', 'developers', 'steamId'], axis=1)

Steam2['releaseDate'] = pd.to_datetime(Steam2['releaseDate'], format='%d-%m-%Y')
reference_date = pd.to_datetime('2024-01-01')
Steam2['releaseDate'] = (Steam2['releaseDate']- reference_date).dt.days

Steam2.head()

Unnamed: 0,releaseDate,copiesSold,price,revenue,avgPlaytime,reviewScore,publisherClass
0,66,165301,99.99,8055097.0,42.36514,71,AAA
1,206,159806,59.99,7882151.0,29.651061,57,Indie
2,227,214192,49.99,7815247.0,12.452593,88,Indie
3,165,440998,19.99,7756399.0,24.797817,76,Indie
4,164,141306,59.99,7629252.0,34.258496,96,AA


In [3]:
Steam_dummies = pd.get_dummies(Steam2['publisherClass'], dtype = int)
Steam2 = pd.concat([Steam2,Steam_dummies], axis=1)
Steam2 = Steam2.drop(['publisherClass'], axis=1)
print(Steam2.head())

model1 = sm.ols('copiesSold~price+revenue+avgPlaytime', data=Steam2).fit()
print(model1.summary())

   releaseDate  copiesSold  price    revenue  avgPlaytime  reviewScore  AA  \
0           66      165301  99.99  8055097.0    42.365140           71   0   
1          206      159806  59.99  7882151.0    29.651061           57   0   
2          227      214192  49.99  7815247.0    12.452593           88   0   
3          165      440998  19.99  7756399.0    24.797817           76   0   
4          164      141306  59.99  7629252.0    34.258496           96   1   

   AAA  Hobbyist  Indie  
0    1         0      0  
1    0         0      1  
2    0         0      1  
3    0         0      1  
4    0         0      0  
                            OLS Regression Results                            
Dep. Variable:             copiesSold   R-squared:                       0.398
Model:                            OLS   Adj. R-squared:                  0.397
Method:                 Least Squares   F-statistic:                     330.4
Date:                Fri, 11 Oct 2024   Prob (F-statistic):

In [4]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# 2. Definieer de features (X) en de target (y)
X = Steam2.drop('copiesSold', axis=1)
y = Steam2['copiesSold']

# 3. Optioneel: Schalen van de numerieke features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 5. Lineaire regressie model trainen
model = LinearRegression()
model.fit(X_train, y_train)

# 6. Voorspellen en evalueren
y_pred = model.predict(X_test)

# Bereken evaluatiemetrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Resultaten tonen
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Absolute Error (MAE): 125681.23219687393
Mean Squared Error (MSE): 279920263823.53357
Root Mean Squared Error (RMSE): 529074.9132434211


In [5]:
from sklearn.preprocessing import MinMaxScaler

columns = X.columns
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=columns)

In [6]:
import numpy as np
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV, RepeatedKFold
alpha_range = np.arange(start=2600,stop=2700,step=1)

cv = RepeatedKFold(n_splits=5)
param = {'alpha':alpha_range}
LassoModel = Lasso()

LassoM = GridSearchCV(LassoModel,param_grid=param,scoring='neg_mean_absolute_error', cv=cv)
LassoM.fit(X,y)
print("Best Alpha: ",LassoM.best_params_['alpha'])

RidgeModel = Ridge()

RidgeM = GridSearchCV(RidgeModel, param_grid=param, scoring='neg_mean_absolute_error', cv=cv)
RidgeM.fit(X, y)
print("Best Alpha: ", RidgeM.best_params_['alpha'])


Best Alpha:  2600
Best Alpha:  2600


In [7]:
from numpy import absolute, mean
from sklearn.model_selection import cross_val_score


RidgeModel = Ridge(alpha=1)
scoresridge = cross_val_score(RidgeModel,X,y,scoring='neg_mean_absolute_error', cv=cv)
ridgeMAE = mean(absolute(scoresridge))
print('the average prediction error with full data is %.0f' % ridgeMAE)

the average prediction error with full data is 161515


In [8]:
LassoModel = Lasso(alpha=2659)
scoreslasso = cross_val_score(LassoModel,X,y,scoring='neg_mean_absolute_error', cv=cv)
lassoMAE = mean(absolute(scoreslasso))
print('the average prediction error with full data is %.0f' % lassoMAE)

the average prediction error with full data is 135854


In [9]:
%pip install tensorflow
%pip install scikeras[tensorflow]

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [10]:
import numpy as np
import tensorflow as tf
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import RandomizedSearchCV

def build_model(n_layers=3, n_units=64, activation='softplus'):
    nnmodel = tf.keras.models.Sequential()
    nnmodel.add(tf.keras.layers.Input(shape=(X.shape[1],)))

    for _ in range(n_layers):
        nnmodel.add(tf.keras.layers.Dense(n_units, activation=activation))
    
    nnmodel.add(tf.keras.layers.Dense(1))
    nnmodel.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mae'])
    return nnmodel

model = KerasRegressor(model=build_model,
                       n_layers=[1,2,3,4,5],
                       n_units=[32,64,128,256,512],
                       activation=['relu','tanh','softplus','elu'],
                       epochs=[50,100,150,200],
                       batch_size=[16,32,64],
                       verbose = 0)

param_dist = {
    'n_layers': [1,2,3,4,5],
    'n_units': [32,64,128,256,512],
    'activation': ['relu','tanh','softplus','elu'],
    'epochs': [50,100,150,200],
    'batch_size': [16,32,64]
}

random_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=param_dist,
                                   n_iter=10,
                                   cv=5,
                                   random_state=42)

random_search.fit(X,y)
print("best parameters found: ", random_search.best_params_)

2024-10-11 08:02:40.039264: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-11 08:02:40.294083: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-11 08:02:44.885701: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-11 08:02:47.609744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-11 08:02:51.661840: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 



  _data = np.array(data, dtype=dtype, copy=copy,


best parameters found:  {'n_units': 512, 'n_layers': 4, 'epochs': 100, 'batch_size': 32, 'activation': 'elu'}


In [14]:
# Retrieve the best model from the random search
best_model = random_search.best_estimator_

# Make predictions on the training set (or any other validation/test set)
y_pred = best_model.predict(X)

# Calculate Mean Absolute Error (or any other metric)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = mse ** 0.5
print("Mean Absolute Error of the best model: ", mae)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Absolute Error of the best model:  66882.61666471354
Mean Squared Error (MSE): 701924406517.4418
Root Mean Squared Error (RMSE): 837809.2900639392
