# Model 1: SVM - Video game resale price predictor



In [1]:
#import dependancies
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# import CSV
videogame_df = pd.read_csv("../data/merged_games_df.csv")
videogame_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Console,Game Title,Price,Mean,Median
0,0,ADVENTURES OF TRON,2600,1981,Action,Mattel Interactive,630000.0,30000.0,,10000.0,670000,2600,ADVENTURES OF TRON,16.1,False,False
1,1,AIR RAID,2600,1981,Action,Men-A-Vision,720000.0,40000.0,,10000.0,770000,2600,AIR RAID,0.0,False,False
2,2,AIRLOCK,2600,1981,Action,Data Age,360000.0,20000.0,,,390000,2600,AIRLOCK,17.0,False,False
3,3,ALIEN,2600,1981,Action,20th Century Fox Video Games,740000.0,40000.0,,10000.0,790000,2600,ALIEN,61.24,False,True
4,4,ARMOR AMBUSH,2600,1981,Action,Mattel Interactive,150000.0,10000.0,,,170000,2600,ARMOR AMBUSH,9.92,False,False


In [3]:
# convert NA to 0 and check data types
clean_df = videogame_df.fillna(0)
clean_df.dtypes

Unnamed: 0        int64
Name             object
Platform         object
Year              int64
Genre            object
Publisher        object
NA_Sales        float64
EU_Sales        float64
JP_Sales        float64
Other_Sales     float64
Global_Sales      int64
Console          object
Game Title       object
Price           float64
Mean               bool
Median             bool
dtype: object

In [4]:
# assign number values to platforms
platform_value = {
    "2600":0,
    "GEN":1,
    "N64":2,
    "NES":3,
    "PS":4,
    "SAT":5,
    "SCD":6,
    "SNES":7
}
clean_df['Platform_num'] = clean_df['Platform'].map(platform_value)
clean_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Console,Game Title,Price,Mean,Median,Platform_num
0,0,ADVENTURES OF TRON,2600,1981,Action,Mattel Interactive,630000.0,30000.0,0.0,10000.0,670000,2600,ADVENTURES OF TRON,16.1,False,False,0
1,1,AIR RAID,2600,1981,Action,Men-A-Vision,720000.0,40000.0,0.0,10000.0,770000,2600,AIR RAID,0.0,False,False,0
2,2,AIRLOCK,2600,1981,Action,Data Age,360000.0,20000.0,0.0,0.0,390000,2600,AIRLOCK,17.0,False,False,0
3,3,ALIEN,2600,1981,Action,20th Century Fox Video Games,740000.0,40000.0,0.0,10000.0,790000,2600,ALIEN,61.24,False,True,0
4,4,ARMOR AMBUSH,2600,1981,Action,Mattel Interactive,150000.0,10000.0,0.0,0.0,170000,2600,ARMOR AMBUSH,9.92,False,False,0


In [5]:
# Assign number values to Genres
genre_value = {
    "Action":0,
    "Adventure":1,
    "Fighting":2,
    "Misc":3,
    "Platform":4,
    "Puzzle":5,
    "Racing":6,
    "Role-Playing":7,
    "Shooter":8,
    "Simulation":9,
    "Sports":10,
    "Strategy":11
}
clean_df['Genre_num'] = clean_df['Genre'].map(genre_value)
clean_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Console,Game Title,Price,Mean,Median,Platform_num,Genre_num
0,0,ADVENTURES OF TRON,2600,1981,Action,Mattel Interactive,630000.0,30000.0,0.0,10000.0,670000,2600,ADVENTURES OF TRON,16.1,False,False,0,0
1,1,AIR RAID,2600,1981,Action,Men-A-Vision,720000.0,40000.0,0.0,10000.0,770000,2600,AIR RAID,0.0,False,False,0,0
2,2,AIRLOCK,2600,1981,Action,Data Age,360000.0,20000.0,0.0,0.0,390000,2600,AIRLOCK,17.0,False,False,0,0
3,3,ALIEN,2600,1981,Action,20th Century Fox Video Games,740000.0,40000.0,0.0,10000.0,790000,2600,ALIEN,61.24,False,True,0,0
4,4,ARMOR AMBUSH,2600,1981,Action,Mattel Interactive,150000.0,10000.0,0.0,0.0,170000,2600,ARMOR AMBUSH,9.92,False,False,0,0


## insert chart(s) here for quick viz

In [None]:
# scatter plot and/or Seaborn pair plot TBD

In [6]:
# define outcome/y
target = clean_df["Mean"]
target_names = ["True", "False"]

In [7]:
# drop outcome from to make new DF for features/X
data = clean_df.drop(["Mean","Name","Platform","Publisher","Console","Game Title","Median","Genre"],axis = 1, inplace=False)
feature_names = data.columns
data.head()

Unnamed: 0.1,Unnamed: 0,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Price,Platform_num,Genre_num
0,0,1981,630000.0,30000.0,0.0,10000.0,670000,16.1,0,0
1,1,1981,720000.0,40000.0,0.0,10000.0,770000,0.0,0,0
2,2,1981,360000.0,20000.0,0.0,0.0,390000,17.0,0,0
3,3,1981,740000.0,40000.0,0.0,10000.0,790000,61.24,0,0
4,4,1981,150000.0,10000.0,0.0,0.0,170000,9.92,0,0


In [8]:
# set up train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
# set SVM classifier and fit training data
model = SVC(kernel='linear')
# kernal type depends on how data is laid out(see scatter plot)liner if it looks pretty straight forward or rbf
model.fit(X_train,y_train)

In [None]:
# Print the model score using the test data
model.score(X_test, y_test)

In [None]:
# Calculate the classification report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions, target_names=target_names))

## Analysis notes
