In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from sklearn.metrics import classification_report
import numpy as np
from sklearn.tree import export_text

# Classification trees to find the key factors that drive a game's profitability
#By Ryan Franco

In [None]:
# Load the merged dataset from the interim data folder.
# NOTE(review): unpickling can execute arbitrary code, so this file should
# only ever be one we produced ourselves.
ds = pd.read_pickle("../../data/interim/final_merged_data.pkl")

# Keep paid games only (free games can muddy the results), then drop the flag
# since it is constant afterwards. Comment this statement out to keep free games.
ds = ds.loc[ds['f2p_flag'] == False].drop(columns=['f2p_flag'])

# title and app_id are not attributes that contribute to a game's success.
ds = ds.drop(columns=['title', 'app_id'])

# Replace the release date with a numeric month column
# (per the original notes, a year column already exists in the data).
release_dates = pd.to_datetime(ds['release'])
ds = ds.assign(release_month=release_dates.dt.month).drop(columns=['release'])

ds



In [None]:
# One-hot encode Categories: every distinct category becomes its own 0/1
# column (e.g. 1 or 0 for Action). This adds a lot of columns by design.
category_lists = (
    ds['Categories']
    .fillna('')        # missing values become an empty string
    .str.split(',')    # comma-separated string -> list of raw names
    .apply(lambda names: [name.strip() for name in names if name.strip() != ''])  # trim, drop blanks
)

# Turn the per-game lists into one indicator column per category.
mlb_categories = MultiLabelBinarizer()
category_matrix = mlb_categories.fit_transform(category_lists)
category_columns = [f"category_{label}" for label in mlb_categories.classes_]
categories_encoded = pd.DataFrame(
    category_matrix,
    columns=category_columns,
    index=ds.index,
)

# Swap the raw Categories column for the encoded columns.
ds = pd.concat([ds.drop(columns=['Categories']), categories_encoded], axis=1)


In [None]:
# One-hot encode Genres, following the same recipe used for Categories:
# split the comma-separated string, trim each entry, drop blanks, binarize.
genre_strings = ds['Genres'].fillna('')
genre_lists = genre_strings.str.split(',').apply(
    lambda pieces: [piece.strip() for piece in pieces if piece.strip() != '']
)

mlb_genres = MultiLabelBinarizer()
genre_matrix = mlb_genres.fit_transform(genre_lists)
genres_encoded = pd.DataFrame(
    genre_matrix,
    index=ds.index,
    columns=[f"genre_{label}" for label in mlb_genres.classes_],
)

# Replace the raw Genres column with the indicator columns.
ds = pd.concat([ds.drop(columns=['Genres']), genres_encoded], axis=1)

In [None]:
# One-hot encode Tags, but only for the most common ones: there are hundreds
# of distinct tags and the frame already has plenty of columns.
N_TOP_TAGS = 50

tag_lists = (
    ds['Tags']
    .fillna('')        # missing values become an empty string
    .str.split(',')    # comma-separated string -> list of raw tags
    .apply(lambda parts: [t.strip() for t in parts if t.strip() != ''])  # trim, drop blanks
)

# Count how many times each tag appears across all games.
tag_counter = Counter(tag for tags_list in tag_lists for tag in tags_list)

top_50_tags = [tag for tag, _count in tag_counter.most_common(N_TOP_TAGS)]

# Build every indicator column first and attach them with a single concat.
# Inserting the columns one by one fragments the DataFrame; sets also make
# each membership test cheap.
tag_sets = [set(tags_list) for tags_list in tag_lists]
tags_encoded = pd.DataFrame(
    {f"tag_{tag}": [int(tag in tags) for tags in tag_sets] for tag in top_50_tags},
    index=ds.index,
    dtype='int64',
)

# Swap the raw Tags column for the encoded columns.
ds = pd.concat([ds.drop(columns=['Tags']), tags_encoded], axis=1)



In [None]:
# The tree needs a binary target: 1 = profitable, 0 = not profitable.
# A game is flagged profitable when its estimated revenue is strictly above
# the dataset median (note: the median, not the mean).
revenue_median = ds['estimated_revenue'].median()
ds['profit_flag'] = ds['estimated_revenue'].gt(revenue_median).astype(int)

# Alternative: flag against a fixed revenue cut-off instead of the median.
#ds['profit_flag'] = (ds['estimated_revenue'] > 1000000).astype(int)





In [None]:
# Classification based on all attributes.
# These columns are removed from the features because they leak the result:
# the target itself plus the revenue / sales / popularity measures.
colsToDrop = [
    'profit_flag',
    'estimated_revenue',
    'log_estimated_revenue',
    'copies_sold_reviews_proxy',
    'peak_players',
    'user_reviews'
]

game_features = ds.drop(columns=colsToDrop)

# Column to predict.
profit_labels = ds['profit_flag']

# Fraction of the rows held out for testing (0.2 = 20%); the rest is used for training.
testDataPercentage = 0.2

(game_features_train, game_features_test,
 profit_labels_train, profit_labels_test) = train_test_split(
    game_features,
    profit_labels,
    test_size=testDataPercentage,
    random_state=13,
)

# Fit the decision tree on the training rows.
clf = DecisionTreeClassifier(max_depth=12, random_state=14)
clf.fit(game_features_train, profit_labels_train)

# Optional: print the tree as text rules (capped at depth 5 to stay readable).
# rules = export_text(clf, feature_names=list(game_features_train.columns), max_depth=5)
# print(rules)

# Score the model on the held-out rows.
prediction = clf.predict(game_features_test)
print(classification_report(profit_labels_test, prediction))

# Show the importance of each attribute, most important first.
feature_importance = pd.DataFrame({
    'feature': game_features_train.columns,
    'importance': clf.feature_importances_,
})
feature_importance.sort_values(by='importance', ascending=False)




In [None]:
# Classification based ONLY on tags: which tags predict profitability?
#----------------------------------

# Same leak columns as the all-attributes model; none of them is a tag column.
colsToDrop = [
    'profit_flag',
    'estimated_revenue',
    'log_estimated_revenue',
    'copies_sold_reviews_proxy',
    'user_reviews',
    'peak_players'
]

game_features = ds.drop(columns=colsToDrop)

# Keep only the tag indicator columns. The "Free to Play" tag is excluded
# because it is hard to relate to revenue.
tag_cols = [
    col for col in game_features.columns
    if col.startswith("tag_") and col != "tag_Free to Play"
]

X = game_features[tag_cols]
# Take the labels straight from ds instead of reusing a variable left over
# from the previous cell, so X and y always come from the same rows.
y = ds['profit_flag']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)

clf = DecisionTreeClassifier(max_depth=12, random_state=13)
clf.fit(X_train, y_train)

# Score the model on the held-out rows.
prediction = clf.predict(X_test)
print(classification_report(y_test, prediction))

importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': clf.feature_importances_
})

# Every feature in this model is already a tag column, so no extra filtering
# is needed before ranking.
tag_importance = importance_df

tag_importance.sort_values(by='importance', ascending=False)





In [None]:
from sklearn.tree import plot_tree

# Draw the top levels of the most recently fitted tree (`clf`).
# The node labels must come from the columns that tree was trained on.
# Passing a different, longer column list (such as every attribute in
# game_features) would label the splits with the wrong feature names.
# feature_names_in_ is recorded by scikit-learn when the model is fitted on a
# DataFrame with string column names; a plain list is accepted by every version.
tree_feature_names = list(clf.feature_names_in_)

# plt is already imported in the first cell of the notebook.
plt.figure(figsize=(160, 100))
plot_tree(clf, feature_names=tree_feature_names, filled=True, max_depth=2)
plt.show()
