In [None]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

Pull data from CSV

In [None]:
# Columns the later cells work with. Kept for reference only: the list is not
# passed to read_csv, so the frame holds whatever columns the file contains.
column_names = [
    "name",
    "total_shipped",
    "genres",
    "themes",
    "game_engines",
    "age_ratings",
]
df = pd.read_csv(
    "videogame_data.csv",
    sep=",",
)

Preprocessing

In [None]:
# Fit one LabelEncoder per categorical column (fit() returns the encoder itself).
# NOTE(review): the values are later used as dict keys, so they are presumably
# plain strings; for non-list values explode() returns them unchanged — confirm.
label_encoders = {
    name: LabelEncoder().fit(df[name].explode())
    for name in ("genres", "themes", "age_ratings", "game_engines")
}

# Per column: raw value -> integer code, where the code is the value's position
# in the encoder's classes_ array.
encoded_values_dict = {
    name: {label: code for code, label in enumerate(encoder.classes_)}
    for name, encoder in label_encoders.items()
}

# Features are everything except the target and the game title.
X = df.drop(columns=["total_shipped", "name"])
y = df["total_shipped"]

# Replace each categorical column with its integer codes; a value missing from
# the lookup raises KeyError.
X_encoded = X.copy()
for name, lookup in encoded_values_dict.items():
    X_encoded[name] = [lookup[raw] for raw in X[name]]

Split the data into training and test sets

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Four candidate regressors, all trained on the same training split.
svm_model = SVR(kernel='rbf', C=1.0, epsilon=2.2)
# Seeded like the MLP below: without random_state the forest is rebuilt
# differently on every run, so the reported MSE (and possibly the model picked
# as "best") would change between Restart & Run All executions.
random_forest_model = RandomForestRegressor(max_features="log2", random_state=42)
linear_regression_model = LinearRegression(n_jobs=-1)
neural_net_model = MLPRegressor(hidden_layer_sizes=(50, 30, 10), activation="relu", random_state=42)

svm_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)
linear_regression_model.fit(X_train, y_train)
neural_net_model.fit(X_train, y_train)

In [None]:
# Model Prediction: score the held-out rows with every fitted model, in the
# order SVM, random forest, linear regression, neural network.
svm_predictions, rf_predictions, lr_predictions, nn_predictions = (
    fitted.predict(X_test)
    for fitted in (
        svm_model,
        random_forest_model,
        linear_regression_model,
        neural_net_model,
    )
)

In [None]:
# Evaluation: mean squared error of each model on the held-out rows.
svm_mse, rf_mse, lr_mse, nn_mse = (
    mean_squared_error(y_test, predicted)
    for predicted in (svm_predictions, rf_predictions, lr_predictions, nn_predictions)
)

model_data = {
    "svm_model": {"model": svm_model, "MSE": svm_mse},
    "random_forest_model": {"model": random_forest_model, "MSE": rf_mse},
    "linear_regression_model": {"model": linear_regression_model, "MSE": lr_mse},
    "neural_net_model": {"model": neural_net_model, "MSE": nn_mse},
}
# Rebuild the dict in ascending-MSE order so its first entry is the winner.
# NOTE(review): the winner is chosen on the same test rows used to report MSE.
model_data = dict(sorted(model_data.items(), key=lambda entry: entry[1]["MSE"]))
best_name, best_entry = next(iter(model_data.items()))
best_model = best_entry["model"]

print("Support Vector Machines MSE:", svm_mse)
print("Random Forest MSE:", rf_mse)
print("Linear Regression MSE:", lr_mse)
print("Neural Network MSE:", nn_mse)

print("Best is: " + str(best_name))


In [None]:
# Collect a description of the user's game. Genres and themes are stored as the
# string form of a list of lower-cased items; the lookups below compare that
# string against the keys of encoded_values_dict.
game_name = input("Enter your games name: ")
user_input_dict = {
    "genres": str([genre.strip().lower() for genre in input("Enter genres (comma separated): ").split(",")]),
    "themes": str([theme.strip().lower() for theme in input("Enter themes (comma separated): ").split(",")]),
    "game_engines": input("Enter the game engine you will be using: ").strip().lower(),
    "age_ratings": input("Enter the age rating: ").strip().lower(),
}

df_user_input = pd.DataFrame([user_input_dict], columns=["genres", "themes", "game_engines", "age_ratings"])

# List-valued columns: use the code of the exact combination when it is a known
# key; otherwise fall back to the first item on its own, and to -1 when even
# that is unknown.
for column in ["genres", "themes"]:
    lookup = encoded_values_dict[column]
    encoded_values = []
    for val in df_user_input[column]:
        if val not in lookup:
            first_item = ast.literal_eval(val)[0]
            val = str([first_item])
        encoded_values.append(lookup.get(val, -1))
    df_user_input[column] = encoded_values

# Single-valued columns: known values map to their code, unknown ones to -1.
for column in ["game_engines", "age_ratings"]:
    lookup = encoded_values_dict[column]
    df_user_input[column] = [lookup.get(val, -1) for val in df_user_input[column]]

# Hand the model its features in exactly the column order it was fitted on.
# The previous hard-coded order here differed from the order used to build
# df_user_input, and the reordered frame was never passed to predict().
df_user_input_encoded = df_user_input[list(X_train.columns)]
best_predictions = best_model.predict(df_user_input_encoded)
print(game_name+" will sell around "+str(round(best_predictions[0],2))+" Million copies based on previous data")