In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Import data
path = "resources/vgsales.csv"
vg_sales_df = pd.read_csv(path)
vg_sales_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [3]:
# Drop columns
vg_sales_df.drop(["Name", "Year", "Publisher", "Global_Sales"], axis=1, inplace=True)
vg_sales_df.set_index("Rank", inplace=True)
vg_sales_df.head()

Unnamed: 0_level_0,Platform,Genre,NA_Sales,EU_Sales,JP_Sales,Other_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Wii,Sports,41.49,29.02,3.77,8.46
2,NES,Platform,29.08,3.58,6.81,0.77
3,Wii,Racing,15.85,12.88,3.79,3.31
4,Wii,Sports,15.75,11.01,3.28,2.96
5,GB,Role-Playing,11.27,8.89,10.22,1.0


### Categorize NA_Sales Column

In [4]:
# Copy vg_sales_df
categorize_na_sales = vg_sales_df.copy()

In [5]:
# Set threshold sales number
na_sales_threshold = 0.1

# Reassign NA_sales value
for index,row in categorize_na_sales.iterrows():
    
    # NA Sales
    if row.NA_Sales > na_sales_threshold:
        categorize_na_sales.loc[index, "NA_Sales"] = 1
    else:
        categorize_na_sales.loc[index, "NA_Sales"] = 0
        
categorize_na_sales.head()

Unnamed: 0_level_0,Platform,Genre,NA_Sales,EU_Sales,JP_Sales,Other_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Wii,Sports,1.0,29.02,3.77,8.46
2,NES,Platform,1.0,3.58,6.81,0.77
3,Wii,Racing,1.0,12.88,3.79,3.31
4,Wii,Sports,1.0,11.01,3.28,2.96
5,GB,Role-Playing,1.0,8.89,10.22,1.0


In [6]:
# Get unique Platform values
categorize_na_sales["Platform"].unique()

array(['Wii', 'NES', 'GB', 'DS', 'X360', 'PS3', 'PS2', 'SNES', 'GBA',
       '3DS', 'PS4', 'N64', 'PS', 'XB', 'PC', '2600', 'PSP', 'XOne', 'GC',
       'WiiU', 'GEN', 'DC', 'PSV', 'SAT', 'SCD', 'WS', 'NG', 'TG16',
       '3DO', 'GG', 'PCFX'], dtype=object)

In [7]:
# Make sure there are no null values
categorize_na_sales["Platform"].isnull().sum()

0

In [8]:
# Get unique Genre values
categorize_na_sales["Genre"].unique()

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

In [9]:
# Make sure there are no null values
categorize_na_sales["Genre"].isnull().sum()

0

In [10]:
# Make sure sales columns do not have null values
print(f"NA_Sales has {categorize_na_sales['NA_Sales'].isnull().sum()} null values.")
print(f"EU_Sales has {categorize_na_sales['EU_Sales'].isnull().sum()} null values.")
print(f"JP_Sales has {categorize_na_sales['JP_Sales'].isnull().sum()} null values.")
print(f"Other_Sales has {categorize_na_sales['Other_Sales'].isnull().sum()} null values.")

NA_Sales has 0 null values.
EU_Sales has 0 null values.
JP_Sales has 0 null values.
Other_Sales has 0 null values.


### Encode Categorical Data - Platform and Genre

In [11]:
# Encode Platform and Genre columns
sales_df_encoded = pd.get_dummies(categorize_na_sales, columns=["Platform", "Genre"])
sales_df_encoded.head()

Unnamed: 0_level_0,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,...,Genre_Fighting,Genre_Misc,Genre_Platform,Genre_Puzzle,Genre_Racing,Genre_Role-Playing,Genre_Shooter,Genre_Simulation,Genre_Sports,Genre_Strategy
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,29.02,3.77,8.46,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1.0,3.58,6.81,0.77,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1.0,12.88,3.79,3.31,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.0,11.01,3.28,2.96,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,1.0,8.89,10.22,1.0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [12]:
# Create smaller df of 500 rows selected randomly from larger dataset
sales_reduced = sales_df_encoded.sample(n=500)
sales_reduced.head()

Unnamed: 0_level_0,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,...,Genre_Fighting,Genre_Misc,Genre_Platform,Genre_Puzzle,Genre_Racing,Genre_Role-Playing,Genre_Shooter,Genre_Simulation,Genre_Sports,Genre_Strategy
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6425,1.0,0.0,0.0,0.02,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1043,1.0,0.16,0.0,0.12,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
16041,0.0,0.01,0.0,0.0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
10477,0.0,0.01,0.0,0.01,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
303,1.0,1.35,0.0,0.54,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [13]:
# Count True/False NA_Sales numbers
sales_reduced["NA_Sales"].value_counts()

0.0    283
1.0    217
Name: NA_Sales, dtype: int64

In [14]:
# Set X and y data
y = sales_reduced["NA_Sales"]
X = sales_reduced.drop(columns="NA_Sales")

In [15]:
# Split data into training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(375, 46)

In [16]:
# Create a logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [17]:
# Train the model
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [18]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0.0,0.0
1,0.0,0.0
2,1.0,1.0
3,0.0,0.0
4,0.0,1.0
5,0.0,0.0
6,0.0,0.0
7,1.0,1.0
8,1.0,1.0
9,0.0,0.0


In [19]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.728
