<center style="font-size:50px;">Video Game Sales and Engagement Analysis</center>

### **1. Handling Missing Values**

In [3]:
from pathlib import Path

import pandas as pd

# Load dataframes. DATA_DIR is the single place to edit when the CSVs live elsewhere.
DATA_DIR = Path(r'C:\Users\dhara\Documents\Interns\Labmentix\Video game')
df = pd.read_csv(DATA_DIR / 'games.csv')
sales_df = pd.read_csv(DATA_DIR / 'vgsales.csv')

# Handle missing categorical values in sales data.
# Year is kept as a label column: whole-number strings (e.g. '2006') plus 'Unknown'.
# Filling a numeric column directly with 'Unknown' would leave numbers and strings
# mixed in one column, which breaks sorting and, when pandas has read the column as
# float because of gaps, writes values such as '2006.0' to the cleaned CSV.
year_numeric = pd.to_numeric(sales_df['Year'], errors='coerce').round()
sales_df['Year'] = year_numeric.astype('Int64').astype('string').fillna('Unknown')
sales_df['Publisher'] = sales_df['Publisher'].fillna('Unknown')

# Ensure sales columns are numeric; unparseable or missing figures become 0.0
sales_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
for col in sales_cols:
    sales_df[col] = pd.to_numeric(sales_df[col], errors='coerce').fillna(0.0)

### **2. Removing Duplicate Records**

In [4]:
# Align the join key: the sales frame's 'Name' column becomes 'Title'
sales_df = sales_df.rename(columns={'Name': 'Title'})

# Inner join on 'Title' -- only titles present in BOTH frames are kept.
# Note: no duplicate rows are dropped in this cell; robust de-duplication across
# platforms/years would call .drop_duplicates() on the resulting merged_df.
merged_df = df.merge(sales_df, how='inner', on='Title')

### **3. Trimming and Removing Irregular Spaces**

In [5]:
def clean_k_format(value):
    """Convert a count such as '2.5K', ' 17k ' or '250' into an int.

    Parameters
    ----------
    value : Any
        Raw cell value. It is stringified, lower-cased and stripped of
        leading/trailing spaces before parsing.

    Returns
    -------
    int
        The parsed number truncated toward zero; a trailing 'k' multiplies it
        by 1000. Missing values and anything that cannot be parsed return 0.
    """
    if pd.isna(value):
        return 0
    # .strip() removes leading/trailing spaces
    text = str(value).lower().strip()
    multiplier = 1
    # Only a trailing 'k' is a thousands suffix; a 'k' elsewhere (e.g. 'unknown')
    # must fall through to the ordinary parse-or-zero path instead of raising.
    if text.endswith('k'):
        text, multiplier = text[:-1], 1000
    try:
        return int(float(text) * multiplier)
    except (ValueError, OverflowError):
        # ValueError: not a number (or NaN); OverflowError: infinity
        return 0

### **4. Standardizing Data Formats**

In [6]:
# Date Formatting: parse first (unparseable entries are coerced to NaT),
# then render the result back as 'YYYY-MM-DD' text
parsed_release = pd.to_datetime(df['Release Date'], errors='coerce')
df['Release Date'] = parsed_release.dt.strftime('%Y-%m-%d')

# Numeric Formatting (K-format conversion); listed columns missing from df are skipped
cols_to_clean = [
    'Plays',
    'Playing',
    'Backlogs',
    'Wishlist',
    'Number of Reviews',
    'Times Listed',
]
for col in [name for name in cols_to_clean if name in df.columns]:
    df[col] = df[col].apply(clean_k_format)

### **5. Final Data Validation**

In [7]:
# Rebuild the merged frame from the cleaned inputs. The earlier merge ran before the
# 'Release Date' and K-format clean-up of df, so that merged_df still holds the raw
# values; re-merging here keeps the merged export consistent with cleaned_games.csv.
merged_df = pd.merge(df, sales_df, on='Title', how='inner')

# Save cleaned outputs (written to the current working directory)
df.to_csv('cleaned_games.csv', index=False)
sales_df.to_csv('cleaned_vgsales.csv', index=False)
merged_df.to_csv('merged_game_data.csv', index=False)

print("Process Complete!")
print(f"Merged file saved as 'merged_game_data.csv' with {merged_df.shape[0]} rows.")

Process Complete!
Merged file saved as 'merged_game_data.csv' with 1344 rows.
