In [1]:
import pandas as pd
import re

# Locations of the two input CSV files
cleaned_dataset_path, steam_dataset_path = (
    './data/Cleaned_Dataset.csv',
    './data/steam.csv',
)

# Read each file into its own DataFrame
df_cleaned = pd.read_csv(filepath_or_buffer=cleaned_dataset_path)
df_steam = pd.read_csv(filepath_or_buffer=steam_dataset_path)

# Normalize game names for better matching
def normalize_name(name):
    """Return a lowercase, alphanumeric-only form of ``name`` for matching.

    The value is converted with ``str()``, lowercased, and stripped of every
    character other than ``a-z``, ``0-9`` and whitespace (this removes
    punctuation such as ``:`` and symbols such as the trademark sign).
    Whitespace runs are then collapsed to a single space and leading/trailing
    whitespace is removed.

    Parameters
    ----------
    name : object
        Raw game title; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The normalized title.
    """
    name = str(name).lower()
    # Keep only ASCII letters, digits and whitespace; this already covers ':'.
    name = re.sub(r'[^a-z0-9\s]', '', name)
    # Tidy whitespace *after* removing symbols: deleting a character can leave
    # a trailing space ("Portal ™") or a doubled space ("Game - Title").
    return ' '.join(name.split())

# Add the normalized matching key to both frames
df_cleaned['normalized_game_name'] = df_cleaned['game_name'].apply(normalize_name)
df_steam['normalized_name'] = df_steam['name'].apply(normalize_name)

# Different Steam titles can normalize to the same key. Keep one row per key so
# the left merge below cannot emit more than one output row per df_cleaned row.
steam_lookup = df_steam.drop_duplicates(subset='normalized_name', keep='first')

# Merge datasets on normalized names; validate= raises if the lookup key is not unique
df_merged = df_cleaned.merge(
    steam_lookup,
    how='left',
    left_on='normalized_game_name',
    right_on='normalized_name',
    validate='many_to_one',
)

# Filter rows where no match was found (a left merge leaves appid missing for them)
unmatched_games = df_merged[df_merged['appid'].isna()]
unmatched_game_list = unmatched_games['game_name'].unique()

# Save unmatched games to a file
unmatched_games_path = './data/unmatched_games.csv'
pd.DataFrame(unmatched_game_list, columns=['Unmatched Game Name']).to_csv(unmatched_games_path, index=False)

# Select final columns for the merged dataset
# NOTE(review): unmatched rows are not filtered out here, so they stay in
# df_final with missing values in the Steam-side columns — confirm this is intended.
final_columns = ['user_id', 'game_name', 'name', 'appid', 'genres', 'price', 'rating']
df_final = df_merged[final_columns]

# Save the final dataset
final_dataset_path = './data/final_dataset.csv'
df_final.to_csv(final_dataset_path, index=False)

print(f"Unmatched games saved to {unmatched_games_path}")
print(f"Final dataset saved to {final_dataset_path}")

Unmatched games saved to ./data/unmatched_games.csv
Final dataset saved to ./data/final_dataset.csv


In [2]:
# Per-column missing-value counts for df_final, plus the number of fully duplicated rows in df_cleaned
# NOTE(review): duplicates are counted on df_cleaned rather than df_final — confirm this is intended
(df_final.isna().sum(), df_cleaned.duplicated().sum())

(user_id      0
 game_name    0
 name         0
 appid        0
 genres       0
 price        0
 rating       0
 dtype: int64,
 0)

In [3]:
import pandas as pd

# Input files: the merged dataset and the Steam media table
final_dataset_path = './data/final_dataset.csv'
steam_media_data_path = './data/steam_media_data.csv'

df_final = pd.read_csv(final_dataset_path)
df_steam_media = pd.read_csv(steam_media_data_path, low_memory=False)

# Coerce both id columns to numeric (unparseable values become NaN), then give
# the media table's id column the same name as the one in df_final
df_final = df_final.assign(appid=pd.to_numeric(df_final['appid'], errors='coerce'))
df_steam_media = df_steam_media.assign(
    steam_appid=pd.to_numeric(df_steam_media['steam_appid'], errors='coerce')
).rename(columns={'steam_appid': 'appid'})

# Left-join the media columns onto the merged dataset by appid
media_columns = ['header_image', 'background', 'movies']
df_combined = pd.merge(
    df_final,
    df_steam_media[['appid', *media_columns]],
    how='left',
    on='appid',
)

# Report how many values are missing in each added column
missing_values = df_combined[media_columns].isna().sum()
print("Missing values in the added columns:")
print(missing_values)

# Replace any missing media values with per-column placeholder strings
if missing_values.any():
    print("Handling missing values...")
    placeholders = {
        'header_image': 'no_image_available',
        'background': 'no_background_available',
        'movies': 'no_movies_available',
    }
    df_combined = df_combined.fillna(placeholders)
    print("Missing values have been handled.")

# Write the combined dataset to disk
combined_dataset_path = './data/combined_final_dataset.csv'
df_combined.to_csv(combined_dataset_path, index=False)

print(f"Combined dataset saved to {combined_dataset_path}")

Missing values in the added columns:
header_image        0
background          0
movies          31068
dtype: int64
Handling missing values...
Missing values have been handled.
Combined dataset saved to ./data/combined_final_dataset.csv


In [4]:
import pandas as pd

# Load the datasets
final_dataset_path = './data/combined_final_dataset.csv'
steam_description_data_path = './data/steam_description_data.csv'

df_final = pd.read_csv(final_dataset_path)
df_steam_description = pd.read_csv(steam_description_data_path, low_memory=False)

# This cell reads and overwrites the same CSV. On a re-run the file already
# contains 'short_description'; merging again would create suffixed
# 'short_description_x'/'short_description_y' columns and break the lookups
# below, so drop any existing copy first.
df_final = df_final.drop(columns='short_description', errors='ignore')

# Normalize the appid columns to numeric (unparseable values become NaN)
df_final['appid'] = pd.to_numeric(df_final['appid'], errors='coerce')
df_steam_description['steam_appid'] = pd.to_numeric(df_steam_description['steam_appid'], errors='coerce')

# Rename steam_appid to appid for consistency
df_steam_description = df_steam_description.rename(columns={'steam_appid': 'appid'})

# Merge the datasets based on appid
df_combined = df_final.merge(df_steam_description[['appid', 'short_description']], how='left', on='appid')

# Check for missing values in the short_description column
missing_values = df_combined['short_description'].isna().sum()
print(f"Missing values in the 'short_description' column: {missing_values}")

# Handle missing values by filling them with a placeholder string
if missing_values > 0:
    print("Handling missing values...")
    # Assign the result back: an in-place fillna on a selected column is
    # chained assignment, which does not update the frame under Copy-on-Write.
    df_combined['short_description'] = df_combined['short_description'].fillna('No description available')
    print("Missing values in 'short_description' have been handled.")

# Save the combined dataset
combined_dataset_path = './data/combined_final_dataset.csv'
df_combined.to_csv(combined_dataset_path, index=False)

print(f"Combined dataset saved to {combined_dataset_path}")

Missing values in the 'short_description' column: 0
Combined dataset saved to ./data/combined_final_dataset.csv
