## PRE-PROCESSING
### Imports

In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder 



- Renaming columns so that both files use the same column names
- Merging the files together into a new CSV file
- Dropping rows that have NaN values in the columns we need

In [18]:
# Pre-processing pipeline: load the two Steam exports, align their column
# names, drop incomplete rows, merge on `appid`, keep only rows above a
# concurrent-user threshold, label-encode the name columns, drop the columns
# that are not needed and write the result to 'merged_steam_data.csv'.

# Rows must have strictly more than this many concurrent users (`ccu`) to be kept.
MIN_CCU = 1000

# Columns of the merged frame that are discarded before saving.
COLUMNS_TO_DROP = [
    'legal_notice', 'demos', 'drm_notice', 'ext_user_account_notice',
    'fullgame', 'controller_support', 'detailed_description',
    'about_the_game', 'short_description', 'package_groups', 'metacritic',
    'reviews', 'screenshots', 'movies', 'recommendations', 'achievements',
    'support_info', 'background', 'content_descriptors', 'type', 'is_free',
    'release_date', 'platforms', 'required_age', 'categories', 'genres',
    'dlc', 'mac_requirements', 'linux_requirements', 'pc_requirements',
    'developers', 'publishers', 'header_image', 'website',
    'supported_languages', 'price_y', 'packages',
]

SteamSpy_x = pd.read_csv('steamspy_data.csv')
# low_memory=False makes pandas infer every column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the warning echoed for this call is presumably the mixed-type
# DtypeWarning, which this option removes -- confirm on the next run.
SteamApp_y = pd.read_csv('steam_app_data.csv', low_memory=False)

# SteamSpy side: it already uses the 'appid' / 'name' / 'price' column names,
# so no rename is needed. Drop rows missing any required field, then downcast
# the numeric columns to reduce memory usage.
SteamSpy_x = (
    SteamSpy_x
    .dropna(subset=['appid', 'name', 'price', 'ccu'])
    .astype({'appid': 'int32', 'ccu': 'int32', 'price': 'float32'})
)

# Steam app side: rename its columns to match the SteamSpy wording, drop rows
# missing any required field, then downcast the join key.
SteamApp_y = (
    SteamApp_y
    .rename(columns={'steam_appid': 'appid', 'price_overview': 'price'})
    .dropna(subset=['appid', 'name', 'price'])
    .astype({'appid': 'int32'})
)

# Merging the two dataframes.
# Please note if there are overlapping columns they will be suffixed with
# '_x' (SteamSpy side) and '_y' (Steam app side) respectively.
merged_df = pd.merge(
    SteamSpy_x[['appid', 'name', 'price', 'ccu']],
    SteamApp_y,
    on='appid',
    how='inner',
)

# Filter after the merge so the surviving rows keep their merge-time index
# labels; .copy() gives an independent frame that is safe to assign into below.
merged_df = merged_df[merged_df['ccu'] > MIN_CCU].copy()

label_encoder = LabelEncoder()

# Encoding categorical variables to numeric values.
# The same encoder object is refit for each column, so after this block
# `label_encoder` holds the classes of the last column it was fitted on.
for name_column in ('name_x', 'name_y'):
    if name_column in merged_df.columns:
        merged_df[name_column] = label_encoder.fit_transform(merged_df[name_column])

# Raises KeyError if any listed column is absent from the merged frame.
merged_df = merged_df.drop(columns=COLUMNS_TO_DROP)

merged_df.to_csv('merged_steam_data.csv', index=False)

# Displaying pandas table of the merged dataframe
merged_df


  SteamApp_y = pd.read_csv('steam_app_data.csv')


Unnamed: 0,appid,name_x,price_x,ccu,name_y
0,10,52,999.0,16360,52
9,220,140,999.0,1177,140
10,240,53,999.0,7752,53
19,550,164,999.0,20328,164
20,620,225,199.0,2793,225
...,...,...,...,...,...
41330,1593500,129,4999.0,12509,129
43062,1677740,274,799.0,4423,274
45374,1794680,319,299.0,52451,319
46038,1832640,184,199.0,3505,184
