In [139]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# base code is from scikit-learn 0.23.2
# Author: Pedro Morales <part.morales@gmail.com>

In [106]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
# Seed NumPy's global RNG so that scikit-learn calls left at
# random_state=None are repeatable on a fresh Restart & Run All.
np.random.seed(0)


# Read the CSV and Perform Basic Data Cleaning

In [164]:
# Load the merged games data and drop the redundant columns in one chain.
# Rows containing nulls are kept (no dropna); the modelling pipelines
# further down impute missing values instead.
df = (
    pd.read_csv("./data/merged_games_df.csv")
    .drop(columns=[
        'Unnamed: 0', 'Name', 'Platform',
        'NA_Sales', 'JP_Sales', 'EU_Sales', 'Other_Sales',
    ])
)
df.head(3)

Unnamed: 0,Year,Genre,Publisher,Global_Sales,Console,Game Title,Price,Mean,Median
0,1981,Action,Mattel Interactive,670000,2600,ADVENTURES OF TRON,16.1,False,False
1,1981,Action,Men-A-Vision,770000,2600,AIR RAID,0.0,False,False
2,1981,Action,Data Age,390000,2600,AIRLOCK,17.0,False,False


In [165]:
# Show the dtype of every column that survived the cleaning step.
df.dtypes

Year              int64
Genre            object
Publisher        object
Global_Sales      int64
Console          object
Game Title       object
Price           float64
Mean               bool
Median             bool
dtype: object

# Select your features (columns)

In [170]:
# Features: every column of df except the target.
# Target: the boolean 'Median' column, as a plain NumPy array.
# NOTE(review): the pipelines below feed 'Price' to the models; if 'Median'
# was derived from 'Price', that leaks the target into the features — confirm.
X = df.drop(columns=['Median'])
y = df['Median'].to_numpy(copy=True)
y

array([False, False, False, ...,  True,  True, False])

In [171]:
# Logistic-regression baseline on the mixed numeric/categorical features.

# Numeric columns: fill gaps with the column median, then standardise.
numeric_features = ['Year', 'Global_Sales', 'Price']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode. Categories not seen during fit are ignored at transform time
# rather than raising.
categorical_features = ['Console', 'Game Title', 'Genre', 'Publisher']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Only the columns listed above reach the model; any other column of X
# (e.g. 'Mean') is dropped by ColumnTransformer's default remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

# Pin the split with an explicit random_state. Without it the split depends
# on how much of the global NumPy RNG has already been consumed, so the
# score changes whenever cells are re-run or run out of order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.950


In [172]:
# Switch scikit-learn's estimator display to the diagram view, then show
# the pipeline that was just fitted.
from sklearn import set_config
set_config(display='diagram')
clf

In [173]:
# Decision-tree variant: same preprocessing as the logistic-regression
# cell, with only the final classifier swapped. Note that this rebinds
# `clf` (and the train/test variables) to the new model.

# Numeric columns: fill gaps with the column median, then standardise.
numeric_features = ['Year', 'Global_Sales', 'Price']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode. Categories not seen during fit are ignored at transform time
# rather than raising.
categorical_features = ['Console', 'Game Title', 'Genre', 'Publisher']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Only the columns listed above reach the model; any other column of X
# (e.g. 'Mean') is dropped by ColumnTransformer's default remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# random_state pins the tree's internal tie-breaking so refits are identical.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier(random_state=0))])

# Pin the split with an explicit random_state. Without it the split depends
# on how much of the global NumPy RNG has already been consumed, so the
# score changes whenever cells are re-run or run out of order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 1.000


In [174]:
# Switch scikit-learn's estimator display to the diagram view, then show
# the decision-tree pipeline that was just fitted.
from sklearn import set_config
set_config(display='diagram')
clf

# Save the Model

In [None]:
# Save the trained model to disk with joblib.
# Update "your_name" in the filename below with your name,
# and be sure to turn this in to BCS.
# `clf` is the most recently fitted pipeline (the decision-tree one from the
# cell above); the original placeholder `your_model` was never defined and
# raised NameError. Swap in another variable to save a different model.
# If joblib fails to import, install it from terminal/git-bash
# (e.g. `pip install joblib`).
import joblib
filename = 'your_name.sav'
joblib.dump(clf, filename)