## SteamGames
#### Sheida Majidi

# causal inference project

In [1]:
import sys
import pandas as pd
import numpy as np
from packaging import version
import sklearn
import ast
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
assert sys.version_info >= (3, 7)
assert version.parse(sklearn.__version__) >= version.parse("1.0.1")

### Load Data

In [2]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_games_data(csv_path=None):
    """Load the Steam games dataset from a local CSV file.

    Parameters
    ----------
    csv_path : str or pathlib.Path, optional
        Location of ``games.csv``. When omitted, the notebook author's
        original local path is used.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.

    Raises
    ------
    FileNotFoundError
        If the CSV file does not exist. The Kaggle dataset page cannot be
        fetched with a plain HTTP download, so the file must be placed
        manually.
    """
    default_path = Path("/Users/sheidamajidi/Desktop/Winter2024/COURSES/INSY695/Individual Assignments/ML1/games.csv")
    csv_path = default_path if csv_path is None else Path(csv_path)

    if not csv_path.is_file():
        # The previous fallback saved the Kaggle *web page* to this path and
        # then tried to open it as a tarball; that always failed and left an
        # HTML file behind that the next run would read as if it were the CSV.
        raise FileNotFoundError(
            f"Steam games CSV not found at '{csv_path}'. Download it from "
            "https://www.kaggle.com/datasets/mexwell/steamgames/data and "
            "place games.csv at that location."
        )
    return pd.read_csv(csv_path)

# Load the dataset once; the cells below read and modify this `games` DataFrame.
games = load_games_data()

### Preprocess Data

In [3]:
# Preview the first rows to sanity-check the load.
games.head()

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


#### Data Cleaning 

##### Handling missing values

In [None]:
# Drop the candidate columns whose share of missing values exceeds 85%.
columns_to_drop = ['Reviews', 'Score rank', 'Metacritic url', 'Notes']

# Skip candidates that are no longer present so the cell can be re-run
# without raising KeyError on columns it already removed.
mostly_missing = [
    column for column in columns_to_drop
    if column in games.columns and games[column].isnull().mean() > 0.85
]
games.drop(columns=mostly_missing, inplace=True)

# Saving the modified dataset 
#games.to_csv("/Users/sheidamajidi/Desktop/Winter2024/COURSES/INSY695/Individual Assignments/ML1/SteamGames/games_cleaned.csv", index=False) 

In [None]:
# Impute missing values in the numeric columns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Select only numeric columns for imputation
numeric_cols = games.select_dtypes(include=[np.number])

# Leave out columns with no observed value at all: the imputer discards such
# columns from its output, which would make the column mapping below fail.
numeric_cols = numeric_cols.loc[:, numeric_cols.notna().any()]

iterative_imputer = IterativeImputer()
imputed_data = iterative_imputer.fit_transform(numeric_cols)

# Rebuild a DataFrame that carries the original row labels, so the assignment
# below lines up row-for-row even if `games` does not have a default index.
imputed_df = pd.DataFrame(
    imputed_data,
    columns=numeric_cols.columns,
    index=numeric_cols.index,
)

# Merging imputed numeric data back with non-numeric data
games[numeric_cols.columns] = imputed_df



In [None]:
# Names of every numeric column currently in the dataset
numeric_columns = games.select_dtypes(include=[np.number]).columns

# Show the heading and the column list on separate lines
print("Numeric columns in the dataset:", numeric_columns.tolist(), sep="\n")

##### Handling categorical columns

In [None]:
# Process the 'Supported languages' column

# Prepare numeric features (X) and target (y)
# Include only numeric columns and 'Supported languages' as a categorical column
numeric_cols = ['Required age', 'Price', 'DLC count', 
                 'Metacritic score', 'User score', 'Positive', 'Negative', 'Achievements', 
                 'Recommendations', 'Average playtime forever', 'Average playtime two weeks', 
                 'Median playtime forever', 'Median playtime two weeks']

# Features are the numeric columns plus the raw 'Supported languages' values;
# the target is 'Peak CCU'. Both are taken from the full `games` frame.
X = games[numeric_cols + ['Supported languages']]
y = games['Peak CCU']

from catboost import CatBoostRegressor

# 'Supported languages' is passed through cat_features so CatBoost handles it
# as a categorical column; random_state is fixed at 42.
model = CatBoostRegressor(cat_features=['Supported languages'], random_state=42)

# Fit the model
model.fit(X, y)


In [None]:
# Process the 'Full audio languages' column

# Function to convert a list of languages into a single string
def concat_languages(lang_list):
    """Return the languages in *lang_list* joined as ``'A, B, C'``.

    Accepts a real list or a stringified list literal such as
    ``"['English', 'French']"``. Any other value (missing values, strings
    that are not list literals) is returned unchanged.
    """
    if isinstance(lang_list, str):
        # Values loaded from the CSV are presumably text, not list objects,
        # so try to parse the list literal before joining.
        try:
            parsed = ast.literal_eval(lang_list)
        except (ValueError, SyntaxError, TypeError):
            return lang_list
        if not isinstance(parsed, list):
            return lang_list
        lang_list = parsed

    if isinstance(lang_list, list):
        return ', '.join(str(lang) for lang in lang_list)
    return lang_list

# Apply the function to the 'Full audio languages' column
games['Full audio languages'] = games['Full audio languages'].apply(concat_languages)

# Prepare your features (X) and target (y)
numeric_cols = ['Required age', 'Price', 'DLC count', 
                'Metacritic score', 'User score', 'Positive', 'Negative', 
                'Achievements', 'Recommendations', 'Average playtime forever', 
                'Average playtime two weeks', 'Median playtime forever', 
                'Median playtime two weeks']

# Numeric columns plus 'Full audio languages' as features; 'Peak CCU' as target.
X = games[numeric_cols + ['Full audio languages']]
y = games['Peak CCU']

# Instantiate a CatBoost model
# 'Full audio languages' is declared categorical via cat_features.
model = CatBoostRegressor(cat_features=['Full audio languages'], random_state=42)

# Fit the model
model.fit(X, y)


In [None]:
# Process the 'Genres', 'Categories', and 'Tags' columns

# Replace every non-string entry of a column with an empty string
def transform_column(col):
    """Return *col* with string values kept and every other value set to ''."""
    def _text_or_blank(value):
        if not isinstance(value, str):
            return ''
        return value

    return col.apply(_text_or_blank)

# Apply the function to the 'Genres', 'Categories', and 'Tags' columns
# (non-string entries become '' so each column holds only strings)
games['Genres'] = transform_column(games['Genres'])
games['Categories'] = transform_column(games['Categories'])
games['Tags'] = transform_column(games['Tags'])

# Prepare features (X) and target (y)
numeric_cols = ['Required age', 'Price', 'DLC count', 
                'Metacritic score', 'User score', 'Positive', 'Negative', 
                'Achievements', 'Recommendations', 'Average playtime forever', 
                'Average playtime two weeks', 'Median playtime forever', 
                'Median playtime two weeks']

# Columns handed to CatBoost as categorical features.
categorical_cols = ['Genres', 'Categories', 'Tags', 'Full audio languages']

X = games[numeric_cols + categorical_cols]
y = games['Peak CCU']

# Instantiate a CatBoost model
model = CatBoostRegressor(cat_features=categorical_cols, random_state=42)

# Fit the model
model.fit(X, y)


In [None]:
# Lift the display limit on the number of columns pandas shows.
pd.set_option('display.max_columns', None)

# List every column name currently in the dataset.
print(games.columns.tolist())

# Reset pandas display options to default
#pd.reset_option('display.max_columns')

##### Identify and remove outliers 

In [None]:
# Identify and remove outliers using Isolation Forest

from sklearn.ensemble import IsolationForest

# No earlier cell in this notebook creates `games_train`, so on a fresh kernel
# the lines below would fail with NameError. Build an 80/20 split with a fixed
# seed when the name is not defined yet; an existing split is left untouched.
if 'games_train' not in globals():
    games_train, games_test = train_test_split(games, test_size=0.2, random_state=42)

# Selecting relevant numeric features
feature_columns = ['Price', 'Required age', 'DLC count', 'Metacritic score',
                   'User score', 'Positive', 'Negative', 'Achievements',
                   'Recommendations', 'Average playtime forever',
                   'Average playtime two weeks', 'Median playtime forever',
                   'Median playtime two weeks']
X = games_train[feature_columns]

# Initialize and fit the Isolation Forest model
isolation_forest = IsolationForest(random_state=42)
outlier_pred = isolation_forest.fit_predict(X)

# Filter out the outliers (outlier_pred == -1 indicates an outlier)
games_train_filtered = games_train[outlier_pred != -1]

# Save the filtered training rows to a new CSV file
games_train_filtered.to_csv("games_without_outliers.csv", index=False)



In [None]:
# Show the raw Isolation Forest labels (-1 marks an outlier).
outlier_pred

In [None]:
# Outlier detection results

# Tally the two labels produced by the Isolation Forest (-1 outlier, 1 normal)
is_outlier = outlier_pred == -1
num_outliers = is_outlier.sum()
num_normal = (outlier_pred == 1).sum()

print(f"Number of Outliers: {num_outliers}")
print(f"Number of Normal Observations: {num_normal}")

# Show the feature values of the rows flagged as outliers
outliers = games_train[is_outlier]
print(outliers[feature_columns])



In [None]:
# Keep only the rows that were not labelled -1 (outlier)
inlier_mask = outlier_pred != -1
games_train_filtered = games_train[inlier_mask]

# Save the filtered dataset to a new CSV file
#games_filtered.to_csv("games_without_outliers.csv", index=False)

# Report how many rows survive the filter
print(f"Original Dataset Size: {len(games_train)}")
print(f"Filtered Dataset Size: {len(games_train_filtered)}")



In [None]:
## Drop unnecessary columns (not significant for predicting the target variable of Peak CCU)

unused_columns = [
    'AppID', 'Name', 'Estimated owners', 'Support url', 'Movies',
    'Header image', 'Website', 'Support email', 'Screenshots',
    'Metacritic score', 'User score', 'Achievements', 'Recommendations',
]
games_train_filtered = games_train_filtered.drop(columns=unused_columns)

# Persist the reduced training frame to a new CSV file
games_train_filtered.to_csv("games_train_filtered.csv", index=False)


##### Handling text: "About the game"

In [None]:
# Display the filtered training frame.
games_train_filtered

In [None]:
# Display the raw game descriptions that are cleaned in the next cell.
games_train_filtered['About the game']


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#import nltk
import re
#from nltk.tokenize import RegexpTokenizer
def clean_text(text):
    """Lowercase *text* and remove every character except ASCII letters, digits and spaces.

    Non-string values (for example NaN for a missing description) are
    returned unchanged.
    """
    if not isinstance(text, str):
        return text
    stripped = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    return stripped.lower()

#regexp = RegexpTokenizer('\w+')
# NOTE(review): the cleaned text is computed from `games_train_filtered` but
# stored on `games_train`. pandas aligns the assignment on the index, so rows
# missing from the filtered frame get NaN here and become 'no text' below.
# Confirm the column was not meant to live on `games_train_filtered`.
games_train['cleaned_text'] = games_train_filtered['About the game'].apply(lambda x:clean_text(x))
# Missing descriptions (clean_text leaves non-strings untouched) get a placeholder.
games_train['cleaned_text'] = games_train['cleaned_text'].fillna('no text')

In [None]:
# Bag-of-words counts over the cleaned descriptions, with English stop words
# removed and at most 1000 features kept.
vect = CountVectorizer(stop_words='english', max_features=1000)
# NOTE(review): the dense array is only displayed, not assigned to a variable,
# so the counts are not available to later cells.
vect.fit_transform(games_train['cleaned_text']).toarray()

##### Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
scaler = StandardScaler()

# Columns requested for scaling
requested_columns = ['DLC count', 'Positive', 'Negative','Score rank', 'Required age', 'Price']

# 'Score rank' is removed during data cleaning when it is more than 85%
# missing, so keep only the requested columns that still exist; selecting a
# missing column would raise KeyError.
columns_to_scale = [
    column for column in requested_columns
    if column in games_train_filtered.columns
]

# Standardise the selected columns in place (zero mean, unit variance)
games_train_filtered[columns_to_scale] = scaler.fit_transform(games_train_filtered[columns_to_scale])



##### PCA

In [None]:
from sklearn.decomposition import PCA


requested_pca_columns = ['DLC count', 'Positive', 'Negative','Score rank', 'Required age', 'Price']

# 'Score rank' is removed during data cleaning when it is more than 85%
# missing; restrict the list to columns that are still present so the
# selection below does not raise KeyError.
pca_columns = [
    column for column in requested_pca_columns
    if column in games_train_filtered.columns
]

# Prepare the data for PCA
data_for_pca = games_train_filtered[pca_columns]

In [None]:
n_features = data_for_pca.shape[1]  # Number of features
components = np.arange(1, n_features + 1)

# Cumulative explained variance for 1..n components. A component's variance
# ratio does not depend on how many components are kept, so one full
# decomposition plus a running sum replaces refitting PCA once per count.
full_pca = PCA(svd_solver='full').fit(data_for_pca)
variances = list(np.cumsum(full_pca.explained_variance_ratio_))

# Apply PCA with 5 components, or fewer when fewer input columns are
# available (PCA rejects n_components larger than the feature count).
pca = PCA(n_components=min(5, n_features))
train_pca = pd.DataFrame(pca.fit_transform(data_for_pca), index=games_train_filtered.index)

# Display the first few rows of the transformed dataset
train_pca.head()

### Exploratory Data Analysis

In [None]:
# Correlation heatmap of the numeric columns
import seaborn as sns

# Coerce the target to numeric; unparseable entries become NaN
games['Peak CCU'] = pd.to_numeric(games['Peak CCU'], errors='coerce')

# Target plus the numeric features to correlate
selected_columns = [
    'Peak CCU', 'Price', 'Required age', 'DLC count', 'Metacritic score',
    'User score', 'Positive', 'Negative', 'Achievements', 'Recommendations',
    'Average playtime forever', 'Average playtime two weeks',
    'Median playtime forever', 'Median playtime two weeks',
]

correlations = games.loc[:, selected_columns].corr()

# Draw the matrix as a heatmap without cell annotations
plt.figure(figsize=(15, 10))
sns.heatmap(correlations, cmap='coolwarm', annot=False)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Pairwise correlations across the numeric columns only.
corr_matrix = games.corr(numeric_only=True)

In [None]:
# Rank the numeric columns by their correlation with 'Price', strongest positive first.
# NOTE(review): the models above predict 'Peak CCU'; confirm 'Price' is the intended column here.
corr_matrix['Price'].sort_values(ascending=False)

In [None]:
# Scatter of peak concurrent users against average lifetime playtime;
# a low alpha keeps dense regions readable
games.plot.scatter(
    x='Peak CCU',
    y='Average playtime forever',
    alpha=0.1,
    grid=True,
)
plt.show()

### Causal Inference