In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

## Read in data and remove unnecessary columns

In [2]:
# Import data
path = "resources/vgsales.csv"
vg_sales_df = pd.read_csv(path)
vg_sales_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [3]:
# Drop columns
vg_sales_df.drop(["Name", "Year", "Publisher", "Global_Sales"], axis=1, inplace=True)
vg_sales_df.set_index("Rank", inplace=True)
vg_sales_df.head()

Unnamed: 0_level_0,Platform,Genre,NA_Sales,EU_Sales,JP_Sales,Other_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Wii,Sports,41.49,29.02,3.77,8.46
2,NES,Platform,29.08,3.58,6.81,0.77
3,Wii,Racing,15.85,12.88,3.79,3.31
4,Wii,Sports,15.75,11.01,3.28,2.96
5,GB,Role-Playing,11.27,8.89,10.22,1.0


In [4]:
# Get unique Platform values
vg_sales_df["Platform"].unique()

array(['Wii', 'NES', 'GB', 'DS', 'X360', 'PS3', 'PS2', 'SNES', 'GBA',
       '3DS', 'PS4', 'N64', 'PS', 'XB', 'PC', '2600', 'PSP', 'XOne', 'GC',
       'WiiU', 'GEN', 'DC', 'PSV', 'SAT', 'SCD', 'WS', 'NG', 'TG16',
       '3DO', 'GG', 'PCFX'], dtype=object)

In [5]:
# Make sure there are no null values
vg_sales_df["Platform"].isnull().sum()

0

In [6]:
# Get unique Genre values
vg_sales_df["Genre"].unique()

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

In [7]:
# Make sure there are no null values
vg_sales_df["Genre"].isnull().sum()

0

In [8]:
# Make sure sales columns do not have null values
print(f"NA_Sales has {vg_sales_df['NA_Sales'].isnull().sum()} null values.")
print(f"EU_Sales has {vg_sales_df['EU_Sales'].isnull().sum()} null values.")
print(f"JP_Sales has {vg_sales_df['JP_Sales'].isnull().sum()} null values.")
print(f"Other_Sales has {vg_sales_df['Other_Sales'].isnull().sum()} null values.")

NA_Sales has 0 null values.
EU_Sales has 0 null values.
JP_Sales has 0 null values.
Other_Sales has 0 null values.


In [9]:
# Write reduced df to csv
vg_sales_df.to_csv('resources/vgsales_reduced.csv')

## Categorize NA_Sales Column

In [10]:
# Copy vg_sales_df
categorize_na_sales = vg_sales_df.filter(["NA_Sales"], axis=1)
categorize_na_sales

Unnamed: 0_level_0,NA_Sales
Rank,Unnamed: 1_level_1
1,41.49
2,29.08
3,15.85
4,15.75
5,11.27
...,...
16596,0.01
16597,0.01
16598,0.00
16599,0.00


In [11]:
# Set threshold sales number
na_sales_threshold = 0.1

# Reassign NA_sales value
for index,row in categorize_na_sales.iterrows():
    
    # NA Sales
    if row.NA_Sales > na_sales_threshold:
        categorize_na_sales.loc[index] = 1
    else:
        categorize_na_sales.loc[index] = 0

na_sales_col_name = "NA_Sales_gt_" + str(na_sales_threshold)
categorize_na_sales.\
    rename(columns={"NA_Sales": na_sales_col_name}, inplace=True)

categorize_na_sales.head()

Unnamed: 0_level_0,NA_Sales_gt_0.1
Rank,Unnamed: 1_level_1
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0


In [12]:
# Write categorized df to csv file
output_file_str = "NA_Sales_thresh_" + \
    str(na_sales_threshold).split(".")[0] + "p" + str(na_sales_threshold).split(".")[1] + ".csv"
categorize_na_sales.to_csv('resources/' + output_file_str)

## Separate Sales Data for EU, JP, and Other

In [13]:
# Create dataframe with only sales data
sales_data = vg_sales_df.filter(["EU_Sales","JP_Sales","Other_Sales"], axis=1)
sales_data.head()

Unnamed: 0_level_0,EU_Sales,JP_Sales,Other_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,29.02,3.77,8.46
2,3.58,6.81,0.77
3,12.88,3.79,3.31
4,11.01,3.28,2.96
5,8.89,10.22,1.0


In [14]:
# Create platform df
platform_df = vg_sales_df.filter(["Platform"], axis=1)
platform_df.head()

Unnamed: 0_level_0,Platform
Rank,Unnamed: 1_level_1
1,Wii
2,NES
3,Wii
4,Wii
5,GB


## Encode Platform data

In [15]:
# Encode platform df
platform_encoded_df = pd.get_dummies(platform_df)
platform_encoded_df.head()

Unnamed: 0_level_0,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,Platform_GBA,Platform_GC,Platform_GEN,Platform_GG,...,Platform_SAT,Platform_SCD,Platform_SNES,Platform_TG16,Platform_WS,Platform_Wii,Platform_WiiU,Platform_X360,Platform_XB,Platform_XOne
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Get shape of platform_encoded_df
platform_encoded_df.shape

(16598, 31)

In [17]:
# Write encoded platform data to CSV
platform_encoded_df.to_csv('resources/encoded_platform.csv')

## Encode Genre data

In [18]:
# Create genre df
genre_df = vg_sales_df.filter(["Genre"], axis=1)
genre_df.head()

Unnamed: 0_level_0,Genre
Rank,Unnamed: 1_level_1
1,Sports
2,Platform
3,Racing
4,Sports
5,Role-Playing


In [19]:
# Encode genre df
genre_encoded_df = pd.get_dummies(genre_df)
genre_encoded_df.head()

Unnamed: 0_level_0,Genre_Action,Genre_Adventure,Genre_Fighting,Genre_Misc,Genre_Platform,Genre_Puzzle,Genre_Racing,Genre_Role-Playing,Genre_Shooter,Genre_Simulation,Genre_Sports,Genre_Strategy
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,0,1,0,0,0,0


In [20]:
# Get shape of genre_encoded_df
genre_encoded_df.shape

(16598, 12)

In [21]:
# Write encoded genre data to CSV
genre_encoded_df.to_csv('resources/encoded_genre.csv')

## Combine dataframes for analysis

In [22]:
# Combine dataframes to create sales_df_encoded
sales_df_encoded = categorize_na_sales.join(sales_data, how='outer').\
    join(platform_encoded_df, how='outer').join(genre_encoded_df, how='outer')
sales_df_encoded.head()

Unnamed: 0_level_0,NA_Sales_gt_0.1,EU_Sales,JP_Sales,Other_Sales,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,...,Genre_Fighting,Genre_Misc,Genre_Platform,Genre_Puzzle,Genre_Racing,Genre_Role-Playing,Genre_Shooter,Genre_Simulation,Genre_Sports,Genre_Strategy
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,29.02,3.77,8.46,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1.0,3.58,6.81,0.77,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1.0,12.88,3.79,3.31,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.0,11.01,3.28,2.96,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,1.0,8.89,10.22,1.0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [23]:
# Create smaller df of 100 rows selected randomly from larger dataset
sales_reduced = sales_df_encoded.sample(n=500)
sales_reduced.head()

Unnamed: 0_level_0,NA_Sales_gt_0.1,EU_Sales,JP_Sales,Other_Sales,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,...,Genre_Fighting,Genre_Misc,Genre_Platform,Genre_Puzzle,Genre_Racing,Genre_Role-Playing,Genre_Shooter,Genre_Simulation,Genre_Sports,Genre_Strategy
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15820,0.0,0.0,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
11844,0.0,0.03,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
15105,0.0,0.0,0.02,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10240,0.0,0.02,0.0,0.01,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4324,0.0,0.0,0.45,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


## Perform ML with Logistic Regression

In [24]:
# Count True/False NA_Sales numbers
sales_reduced[na_sales_col_name].value_counts()

0.0    281
1.0    219
Name: NA_Sales_gt_0.1, dtype: int64

In [25]:
# Set X and y data
y = sales_reduced[na_sales_col_name]
X = sales_reduced.drop(columns=na_sales_col_name)

In [26]:
# Split data into training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(375, 46)

In [27]:
# Create a logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [28]:
# Train the model
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [29]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1.0,0.0
1,0.0,0.0
2,1.0,1.0
3,1.0,0.0
4,1.0,1.0
5,0.0,0.0
6,0.0,0.0
7,1.0,1.0
8,0.0,1.0
9,0.0,0.0


In [30]:
# Assess accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.728


In [31]:
# # Print confusion matrix
# from sklearn.metrics import confusion_matrix, classification_report
# matrix = confusion_matrix(y_test, y_pred)
# print(matrix)

# Display the confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])

cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,56,14
Actual low_risk,20,35


In [32]:
# Print classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.74      0.80      0.77        70
         1.0       0.71      0.64      0.67        55

    accuracy                           0.73       125
   macro avg       0.73      0.72      0.72       125
weighted avg       0.73      0.73      0.73       125

