In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [65]:
vgdata = pd.read_csv("data1.csv", encoding='windows-1252')
vgdata.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [66]:
# What are the NA_Sales for a game before it is released in the United States?

In [67]:
#Dropping null values from columns where not a lot are missing
vgdata.dropna(subset=['Name'],inplace=True)
vgdata.dropna(subset=['Year_of_Release'],inplace=True)
vgdata.dropna(subset=['Publisher'],inplace=True)
vgdata.dropna(subset=['Genre'],inplace=True)

#Filling Null Values for larger missing data
vgdata['Developer'].fillna('None',inplace=True)
vgdata['Rating'].fillna('None',inplace=True)

vgdata.sort_values('NA_Sales',ascending=False).head(5)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
9,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31,,,,,,
5,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E


In [68]:
#Drop Columns we will not use
vgdata.drop(["Critic_Score","Critic_Count","User_Score","User_Count","Global_Sales",'JP_Sales',"EU_Sales","Other_Sales"],
            axis=1,inplace=True)
vgdata.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,,


In [69]:
#Checking cardinality

columns=['Platform','Year_of_Release','Genre','Publisher','Developer','Rating']
for i in columns:
    print(i)
    print("_"*30)
    print(vgdata[i].nunique())
    print('\n')

Platform
______________________________
31


Year_of_Release
______________________________
39


Genre
______________________________
12


Publisher
______________________________
580


Developer
______________________________
1681


Rating
______________________________
9




In [70]:
#OHE GENRE AND RATING
vgdata=pd.concat([vgdata,pd.get_dummies(vgdata.Rating)],axis=1)
vgdata=pd.concat([vgdata,pd.get_dummies(vgdata.Genre)],axis=1)
vgdata.drop(['Genre','Rating'],axis=1,inplace=True)

vgdata.head()

Unnamed: 0,Name,Platform,Year_of_Release,Publisher,NA_Sales,Developer,AO,E,E10+,EC,...,Fighting,Misc,Platform.1,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,Wii Sports,Wii,2006.0,Nintendo,41.36,Nintendo,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,Super Mario Bros.,NES,1985.0,Nintendo,29.08,,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Mario Kart Wii,Wii,2008.0,Nintendo,15.68,Nintendo,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,Wii Sports Resort,Wii,2009.0,Nintendo,15.61,Nintendo,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Nintendo,11.27,,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [71]:
#LABEL ENCODE DEVELOPER PUBLISHER YEAR_OF RELEASE AND PLATFORM

Dev_LE=LabelEncoder()
vgdata.Developer=Dev_LE.fit_transform(vgdata.Developer)

Pub_LE=LabelEncoder()
vgdata.Publisher=Pub_LE.fit_transform(vgdata.Publisher)

YOR_LE=LabelEncoder()
vgdata.Year_of_Release=YOR_LE.fit_transform(vgdata.Year_of_Release)

Plat_LE=LabelEncoder()
vgdata.iloc[:,1]=Plat_LE.fit_transform(vgdata.iloc[:,1])

vgdata.rename(columns={vgdata.columns[1]:'System'}, inplace=True)

vgdata.head()

Unnamed: 0,Name,System,Year_of_Release,Publisher,NA_Sales,Developer,AO,E,E10+,EC,...,Fighting,Misc,System.1,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,Wii Sports,26,26,361,41.36,1008,0,1,0,0,...,0,0,26,0,0,0,0,0,1,0
1,Super Mario Bros.,11,5,361,29.08,1027,0,0,0,0,...,0,0,11,0,0,0,0,0,0,0
2,Mario Kart Wii,26,28,361,15.68,1008,0,1,0,0,...,0,0,26,0,1,0,0,0,0,0
3,Wii Sports Resort,26,29,361,15.61,1008,0,1,0,0,...,0,0,26,0,0,0,0,0,1,0
4,Pokemon Red/Pokemon Blue,5,16,361,11.27,1027,0,0,0,0,...,0,0,5,0,0,1,0,0,0,0


In [72]:
#Create Variables for ML
Y=vgdata['NA_Sales']
X=vgdata
X.drop(['NA_Sales'],axis=1,inplace=True)

In [73]:
X.drop('Name',axis=1,inplace=True)
X.head()

Unnamed: 0,System,Year_of_Release,Publisher,Developer,AO,E,E10+,EC,K-A,M,...,Fighting,Misc,System.1,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,26,26,361,1008,0,1,0,0,0,0,...,0,0,26,0,0,0,0,0,1,0
1,11,5,361,1027,0,0,0,0,0,0,...,0,0,11,0,0,0,0,0,0,0
2,26,28,361,1008,0,1,0,0,0,0,...,0,0,26,0,1,0,0,0,0,0
3,26,29,361,1008,0,1,0,0,0,0,...,0,0,26,0,0,0,0,0,1,0
4,5,16,361,1027,0,0,0,0,0,0,...,0,0,5,0,0,1,0,0,0,0


In [74]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [75]:
LR=LinearRegression()
#fit
LR.fit(X_train,y_train)
#predict
pred = LR.predict(X_test)
#scoring
print(mean_squared_error(y_test,pred))

1.034198954254693


In [None]:
# Inference explained in pdf document 