In [40]:
import pandas as pd
import numpy as np

# Load the raw Reddit export into a single DataFrame.
# low_memory=False has pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
path = "Reddit_Gaming.csv"
df = pd.read_csv(filepath_or_buffer=path, low_memory=False)

# Report the raw dimensions, then preview the first five rows.
print("Shape:", df.shape)
df.head(5)


Shape: (340, 46)


Unnamed: 0,ai_use_case,game,subreddit,query_used,kind,post_id,comment_id,parent_post_id,created_utc,score,...,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45
0,image_generation,Battlefield,gaming,"(""AI art"" OR ""AI-generated"" OR ""generative AI""...",post,1pt4ihv,,,1766422405,19563,...,,,,,,,,,,
1,image_generation,Clair Obscur: Expedition 33,gaming,"(""AI art"" OR ""AI-generated"" OR ""generative AI""...",post,1prottw,,,1766265012,7626,...,,,,,,,,,,
2,image_generation,Unknown/General,gaming,"(""AI art"" OR ""AI-generated"" OR ""generative AI""...",post,1pqmi6c,,,1766154619,4464,...,,,,,,,,,,
3,image_generation,Unknown/General,gaming,"(""AI art"" OR ""AI-generated"" OR ""generative AI""...",post,1pp5zbx,,,1766000551,0,...,,,,,,,,,,
4,image_generation,Arc Raiders,gaming,"(""AI art"" OR ""AI-generated"" OR ""generative AI""...",post,1pp47wn,,,1765996446,7323,...,,,,,,,,,,


Remove “Unnamed” and fully empty columns

In [41]:
# Step 1: collect every column whose label starts with "Unnamed"
# (e.g. Unnamed: 0, Unnamed: 36, ...).
is_unnamed = df.columns.astype(str).str.startswith("Unnamed")
unnamed_cols = df.columns[is_unnamed].tolist()

# Step 2: remove those columns, then any remaining column that holds
# no non-null value at all.
df = (
    df.drop(columns=unnamed_cols, errors="ignore")
      .dropna(axis=1, how="all")
)

print("Dropped Unnamed:", len(unnamed_cols))
print("Shape after column cleanup:", df.shape)


Dropped Unnamed: 26
Shape after column cleanup: (340, 20)


Standardize column names

In [42]:
# Normalise every column label: cast to str, trim surrounding whitespace,
# lower-case it, and turn spaces into underscores.
df.columns = [
    str(label).strip().lower().replace(" ", "_")
    for label in df.columns
]

# Show the resulting labels.
df.columns


Index(['ai_use_case', 'game', 'subreddit', 'query_used', 'kind', 'post_id',
       'comment_id', 'parent_post_id', 'created_utc', 'score', 'num_comments',
       'permalink', 'title', 'text', 'combined_text', 'has_reaction_terms',
       'created_dt', 'text_len', 'score_filled', 'log_score'],
      dtype='object')

Convert key columns to correct data types

In [43]:
# Timestamps: coerce the epoch column to numeric (unparseable values become
# NaN), then derive a timezone-aware UTC datetime from it, reading the
# numbers as seconds.
if "created_utc" in df.columns:
    epoch_seconds = pd.to_numeric(df["created_utc"], errors="coerce")
    df["created_utc"] = epoch_seconds
    df["created_datetime_utc"] = pd.to_datetime(
        epoch_seconds, unit="s", utc=True, errors="coerce"
    )

# Engagement metrics: apply the same numeric coercion to whichever of these
# columns are present.
engagement_cols = [name for name in ("score", "num_comments") if name in df.columns]
for col in engagement_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")


Remove duplicates

In [44]:
# De-duplicate on post_id, keeping the first occurrence of each id.
#
# pandas treats missing values as equal to one another when it looks for
# duplicates, so de-duplicating directly on post_id would collapse every row
# that lacks an id into a single row and inflate the "Duplicates removed"
# count. Only rows that actually carry a post_id are considered duplicates
# here; rows with a missing id pass through unchanged.
if "post_id" in df.columns:
    before = len(df)
    repeated_id = df["post_id"].notna() & df.duplicated(subset=["post_id"], keep="first")
    df = df.loc[~repeated_id]
    print("Duplicates removed:", before - len(df))


Duplicates removed: 40


Missing values + quick quality checks

In [45]:
# Percentage of missing values per column, highest first (top 15 shown).
null_fraction = df.isna().mean()
missing = null_fraction.sort_values(ascending=False).mul(100).round(1)
display(missing.head(15))

# Confirm that each key column is present in the frame.
key_cols = ["game", "ai_use_case", "kind", "subreddit", "score", "num_comments"]
print({name: (name in df.columns) for name in key_cols})

# Ten most frequent (game, ai_use_case, kind) combinations; evaluates to None
# when any of the three columns is absent. Kept as the cell's final bare
# expression so the notebook renders the result.
combo_cols = ["game", "ai_use_case", "kind"]
df[combo_cols].value_counts().head(10) if set(combo_cols).issubset(df.columns) else None


Unnamed: 0,0
comment_id,99.7
parent_post_id,99.7
text,8.0
created_utc,0.3
num_comments,0.3
text_len,0.3
score_filled,0.3
log_score,0.3
created_dt,0.3
created_datetime_utc,0.3


{'game': True, 'ai_use_case': True, 'kind': True, 'subreddit': True, 'score': True, 'num_comments': True}


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
game,ai_use_case,kind,Unnamed: 3_level_1
Unknown/General,image_generation,post,99
Unknown/General,gameplay_ai,post,72
Call of Duty,image_generation,post,46
Call of Duty,gameplay_ai,post,14
Unknown/General,moderation_ai,post,7
Clair Obscur: Expedition 33,image_generation,post,5
Battlefield,image_generation,post,4
Battlefield,gameplay_ai,post,3
Civilization,image_generation,post,3
Assassin's Creed,image_generation,post,3


Drop the columns that are almost entirely empty (`comment_id` and `parent_post_id`, about 99.7% missing)

In [46]:
# The missingness table above reports these two columns as ~99.7% missing.
# errors="ignore" keeps the cell safe to run when they are already gone.
sparse_cols = ["comment_id", "parent_post_id"]
df = df.drop(columns=sparse_cols, errors="ignore")


Keep only posts

In [47]:
# Case-insensitive match on `kind`; rows whose kind is missing do not match
# and are dropped. The copy gives later cells an independent frame.
is_post = df["kind"].str.lower() == "post"
df = df.loc[is_post].copy()

# Show which kinds remain after the filter.
print(df["kind"].value_counts())


kind
post    299
Name: count, dtype: int64


Drop “helper” columns unless you need them

In [48]:
# Helper columns to remove. errors="ignore" skips any name that is not
# present, so only the columns that actually exist are dropped.
drop_helpers = [
    "text_len",
    "score_filled",
    "log_score",
    "created_datetime_utc",
    "has_reaction_terms",
    "created_utc",
]
df = df.drop(columns=drop_helpers, errors="ignore")


Quick sanity checks

In [49]:
print("Shape:", df.shape)

# describe() with default arguments summarises only the numeric columns of a
# mixed-dtype selection, so the table covers score and num_comments.
summary_cols = ["game", "ai_use_case", "score", "num_comments"]
display(df[summary_cols].describe())

# Share of missing values in the two grouping columns.
grouping_cols = ["game", "ai_use_case"]
print(df[grouping_cols].isna().mean().round(3))


Shape: (299, 13)


Unnamed: 0,score,num_comments
count,299.0,299.0
mean,692.595318,155.478261
std,2676.655025,348.534143
min,0.0,0.0
25%,0.0,9.0
50%,8.0,31.0
75%,188.5,111.5
max,31575.0,2563.0


game           0.0
ai_use_case    0.0
dtype: float64


In [50]:
# Write the cleaned frame to disk; index=False leaves the row index out of
# the file.
clean_path = "Cleaned_Reddit_Gaming.csv"
df.to_csv(path_or_buf=clean_path, index=False)

# Echo the output location as the cell result.
clean_path


'Cleaned_Reddit_Gaming.csv'