In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack, csr_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from joblib import dump, load

In [2]:
df = pd.read_csv('vgsales.csv')
df = df.dropna()
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [3]:
swords = stopwords.words('english')

In [4]:
def clean_text(sent):
    tokens = word_tokenize(sent)
    wnl = WordNetLemmatizer()
    clean = ' '.join([wnl.lemmatize(word)
                     for word in tokens if word.isalpha()])
    return clean

In [5]:
tfidf = TfidfVectorizer(analyzer=clean_text)

In [6]:
text_columns = ['Name', 'Platform', 'Genre', 'Publisher']

In [7]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
X_text = tfidf.fit_transform(df[text_columns].apply(''.join, axis=1))

In [9]:
X_numerical = df[['Year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']]

In [10]:
X_combined = hstack([X_text, csr_matrix(X_numerical)])

In [11]:
df['Global_Sales'] = np.log1p(df['Global_Sales'])

In [12]:
y = df['Global_Sales']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.25, random_state=42)

In [14]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

In [15]:
rf_regressor.fit(X_train, y_train)

In [16]:
y_pred = rf_regressor.predict(X_test)

In [17]:
mse = mean_squared_error(y_test, y_pred)

In [18]:
rmse = np.sqrt(mse)

In [19]:
r2 = r2_score(y_test,y_pred)

In [20]:
y_pred

array([0.4543816 , 0.04515781, 0.17395331, ..., 1.14892863, 0.06539287,
       0.1220375 ])

In [21]:
mse

0.0009324878659488037

In [22]:
rmse

0.030536664289814035

In [23]:
r2

0.9942637625927162

In [24]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([y_test.min(), y_test.max()], [
         y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual Values (y_test)')
plt.ylabel('Predicted Values (y_pred)')
plt.title('Actual vs. Predicted Values')
plt.show()

NameError: name 'plt' is not defined

In [24]:
import pickle
pickle.dump(rf_regressor, open('./vgmodel.sav', 'wb'))