# **DATA LOADING AND PREPROCESSING**

In [None]:
# Mount Google Drive at /content/drive so the CSV files used below are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd


In [None]:
# Load the reviews CSV from the mounted Drive into `df`.
df = pd.read_csv("/content/drive/MyDrive/second_all_reviews.csv")

In [None]:
# Show column names, dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730945 entries, 0 to 730944
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   appid                    730945 non-null  int64  
 1   review                   730945 non-null  object 
 2   word_count               730945 non-null  int64  
 3   voted_up                 730945 non-null  bool   
 4   votes_up                 730945 non-null  int64  
 5   votes_funny              730945 non-null  int64  
 6   timestamp_created        730945 non-null  int64  
 7   author_playtime_forever  730945 non-null  int64  
 8   name                     730945 non-null  object 
 9   price                    730945 non-null  int64  
 10  release_date             0 non-null       float64
dtypes: bool(1), float64(1), int64(7), object(2)
memory usage: 56.5+ MB


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus and keep the English list as a set, which is
# what clean_text uses for its membership test.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalise one review string for downstream text analysis.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is deleted (not replaced by a space), and the remaining
    whitespace-separated tokens are kept only when they are longer than two
    characters and absent from the module-level ``stop_words`` set.

    Args:
        text: The review as a ``str``.

    Returns:
        The surviving tokens joined by single spaces ("" when none survive).
    """
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    kept_tokens = []
    for token in letters_only.split():
        # Skip stopwords and very short tokens (length 1 or 2).
        if token in stop_words or len(token) <= 2:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Clean every non-missing review. Rows removed by dropna() end up as NaN in the
# new column because the assignment aligns on the index.
reviews_present = df["review"].dropna()
df["clean_review"] = reviews_present.apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Preview the first rows, including the new clean_review column.
df.head()

Unnamed: 0,appid,review,word_count,voted_up,votes_up,votes_funny,timestamp_created,author_playtime_forever,name,price,release_date,clean_review
0,1938090,I really loved the game but itll take you a lo...,26,True,0,0,1757864825,1485,Call of Duty: Modern Warfare II,6999,,really loved game itll take long time learn ta...
1,1938090,i cant join the game for some reason,8,False,0,0,1757864289,346,Call of Duty: Modern Warfare II,6999,,cant join game reason
2,1938090,these new games are just living in the MASSIVE...,133,False,0,0,1757861908,2131,Call of Duty: Modern Warfare II,6999,,new games living massive shadow black ops revi...
3,1938090,Its fun of course you have cheaters but beside...,25,True,0,0,1757854957,344,Call of Duty: Modern Warfare II,6999,,fun course cheaters besides game ment fun even...
4,1938090,I play this game as a casual player on PC. I h...,39,False,0,1,1757846859,1746,Call of Duty: Modern Warfare II,6999,,play game casual player mods hack shadow banne...


In [None]:
# ---------- CONFIG ----------
INPUT_FILE = "/content/drive/MyDrive/second_all_reviews.csv"
OUTPUT_FILE = "/content/drive/MyDrive/cleaned_all_reviews.csv"
drop_columns = ["release_date", "votes_up", "votes_funny", "timestamp_created"]

# ---------- LOAD DATA ----------
# Start again from the file on disk, so re-running this cell is repeatable.
df = pd.read_csv(INPUT_FILE)

# ---------- CLEANING ----------
# Drop only those unwanted columns that are actually present in the file.
columns_present = [name for name in drop_columns if name in df.columns]
df = df.drop(columns=columns_present)

# price is stored as an integer number of cents (e.g. 6999); rescale by 1/100.
if "price" in df:
    df["price"] = df["price"] / 100

# ---------- SAVE CLEAN FILE ----------
df.to_csv(OUTPUT_FILE, index=False)
print(f"✅ Cleaned dataset saved to {OUTPUT_FILE}")


✅ Cleaned dataset saved to /content/drive/MyDrive/cleaned_all_reviews.csv


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Re-create the stopword set and the cleaner so this cell also works on its own
# after `df` was re-read from disk (which discarded the earlier clean_review column).
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Lower-case a review, strip non-letters and drop stopwords / short tokens.

    Args:
        text: The review as a ``str``.

    Returns:
        The remaining tokens (length > 2, not in ``stop_words``) joined by
        single spaces; "" when nothing remains.
    """
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)   # remove non-letters
    words = [w for w in text.split() if w not in stop_words and len(w) > 2]
    return " ".join(words)

# Missing reviews are skipped by dropna() and stay NaN in clean_review.
df["clean_review"] = df["review"].dropna().apply(clean_text)

# Save to the same path as OUTPUT_FILE. The ".csv" extension is required:
# without it pandas writes a separate, extension-less file and the cleaned CSV
# on Drive never receives the clean_review column.
df.to_csv("/content/drive/MyDrive/cleaned_all_reviews.csv", index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Preview the trimmed frame: dropped columns are gone and price is rescaled.
df.head()

Unnamed: 0,appid,review,word_count,voted_up,author_playtime_forever,name,price,clean_review
0,1938090,I really loved the game but itll take you a lo...,26,True,1485,Call of Duty: Modern Warfare II,69.99,really loved game itll take long time learn ta...
1,1938090,i cant join the game for some reason,8,False,346,Call of Duty: Modern Warfare II,69.99,cant join game reason
2,1938090,these new games are just living in the MASSIVE...,133,False,2131,Call of Duty: Modern Warfare II,69.99,new games living massive shadow black ops revi...
3,1938090,Its fun of course you have cheaters but beside...,25,True,344,Call of Duty: Modern Warfare II,69.99,fun course cheaters besides game ment fun even...
4,1938090,I play this game as a casual player on PC. I h...,39,False,1746,Call of Duty: Modern Warfare II,69.99,play game casual player mods hack shadow banne...


# **DATA DESCRIPTION**

In [None]:
# Price tiers as right-closed intervals: LOW = [0, 20], MIDDLE = (20, 40],
# HIGH = everything above 40.
# The top edge is open-ended rather than df["price"].max(): with the max as an
# edge, pd.cut raises as soon as no price exceeds 40 (edges would no longer be
# strictly increasing). For data whose max is above 40 the assignment of rows
# to tiers is unchanged.
bins = [0, 20, 40, float("inf")]
labels = ["LOW", "MIDDLE", "HIGH"]
df["price_section"] = pd.cut(df["price"], bins=bins, labels=labels, include_lowest=True)

In [None]:
# Number of distinct games (appid) in each price tier.
# price_section is categorical; observed=False is passed explicitly so pandas
# does not emit its FutureWarning about the changing default, while keeping the
# current behaviour (every tier is listed, even one with no rows).
games_per_section = df.groupby("price_section", observed=False)["appid"].nunique()
print("\n📊 Number of Games in Each Section:")
print(games_per_section)


📊 Number of Games in Each Section:
price_section
LOW       403
MIDDLE    228
HIGH      112
Name: appid, dtype: int64


  games_per_section = df.groupby("price_section")["appid"].nunique()


In [None]:
# Reviews per price tier; sort_index() orders the rows by tier rather than by count.
tier_counts = df["price_section"].value_counts()
reviews_per_section = tier_counts.sort_index()
print("\n📝 Number of Reviews in Each Section:")
print(reviews_per_section)


📝 Number of Reviews in Each Section:
price_section
LOW       392139
MIDDLE    227964
HIGH      110842
Name: count, dtype: int64


In [None]:
# Mean of the word_count column per price tier.
# observed=False pins the current grouping behaviour for the categorical key
# and avoids the pandas FutureWarning raised when the argument is omitted.
avg_words_review = df.groupby("price_section", observed=False)["word_count"].mean()
print("\n🔤 Avg Word Count in Review Column:")
print(avg_words_review)


🔤 Avg Word Count in Review Column:
price_section
LOW       53.706148
MIDDLE    60.086970
HIGH      63.455883
Name: word_count, dtype: float64


  avg_words_review = df.groupby("price_section")["word_count"].mean()


In [None]:
# Mean token count of clean_review per price tier: split each cleaned review on
# whitespace, count the tokens, then average within the tier.
# observed=False pins the current grouping behaviour for the categorical key
# and avoids the pandas FutureWarning raised when the argument is omitted.
avg_words_clean = (
    df.groupby("price_section", observed=False)["clean_review"]
    .apply(lambda reviews: reviews.str.split().str.len().mean())
)
print("\n🔤 Avg Word Count in Clean_Review Column:")
print(avg_words_clean)

  avg_words_clean = df.groupby("price_section")["clean_review"].apply(lambda x: x.str.split().str.len().mean())



🔤 Avg Word Count in Clean_Review Column:
price_section
LOW       27.718100
MIDDLE    31.150436
HIGH      32.854875
Name: clean_review, dtype: float64


# **SENTIMENT ANALYSIS**

In [None]:
from transformers import pipeline
# Build a sentiment-analysis pipeline from the nlptown multilingual BERT checkpoint.
# NOTE(review): `classifier` is created again from an explicit model/tokenizer
# right before the batch loop further down, so this instance looks redundant —
# confirm and consider removing this cell.
classifier = pipeline('sentiment-analysis', model = "nlptown/bert-base-multilingual-uncased-sentiment")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# Hugging Face model id used for the tokenizer and model loads that follow.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"


In [None]:
# Load the tokenizer and the sequence-classification model identified by model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
from tqdm import tqdm

In [None]:
# Make every entry a string. Missing values become "" first; astype(str) alone
# would turn NaN into the literal text "nan" and feed that word to the model.
# NOTE(review): empty strings are still classified and receive a star label;
# consider excluding them from the analysis.
df['clean_review'] = df['clean_review'].fillna("").astype(str)

classifier = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=0,  # GPU, -1 for CPU
    truncation=True
)

reviews_list = df['clean_review'].tolist()
batch_size = 32
save_interval = 5000  # save every 5000 reviews
temp_path = "/content/drive/MyDrive/cleaned_all_reviews_temp.csv"
all_labels = []
rows_checkpointed = 0  # leading rows of df already written to temp_path

for i in tqdm(range(0, len(reviews_list), batch_size), desc="Sentiment Analysis"):
    batch = reviews_list[i:i + batch_size]
    results = classifier(batch, truncation=True)
    all_labels.extend([r['label'] for r in results])

    # Periodic saving: fires on the first batch that reaches or crosses each
    # multiple of save_interval, and on the last batch.
    # Only the rows labelled since the previous checkpoint are appended, so the
    # cost of a checkpoint stays constant instead of growing with every row
    # processed so far. The temp file content is the same as a full rewrite.
    if (i + batch_size) % save_interval < batch_size or (i + batch_size) >= len(reviews_list):
        new_rows = df.iloc[rows_checkpointed:len(all_labels)].copy()
        new_rows['clean_review_sentiment'] = all_labels[rows_checkpointed:]
        # Labels look like "4 stars"; keep the digit as an integer score.
        new_rows['clean_review_sentiment_num'] = new_rows['clean_review_sentiment'].str.extract(r'(\d)').astype(int)
        first_write = rows_checkpointed == 0
        new_rows.to_csv(
            temp_path,
            mode="w" if first_write else "a",  # truncate on the first checkpoint of a run
            header=first_write,                # header row only once
            index=False,
        )
        rows_checkpointed = len(all_labels)
        print(f"💾 Saved progress at {len(all_labels)} reviews")

# Final save
df['clean_review_sentiment'] = all_labels
df['clean_review_sentiment_num'] = df['clean_review_sentiment'].str.extract(r'(\d)').astype(int)
df.to_csv("/content/drive/MyDrive/cleaned_all_reviews.csv", index=False)
print("✅ Sentiment analysis complete and final CSV saved.")


Device set to use cuda:0
Sentiment Analysis:   0%|          | 10/22843 [00:03<1:38:27,  3.86it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Sentiment Analysis:   1%|          | 157/22843 [01:10<1:51:40,  3.39it/s]

💾 Saved progress at 5024 reviews


Sentiment Analysis:   1%|▏         | 313/22843 [02:08<2:51:38,  2.19it/s]

💾 Saved progress at 10016 reviews


Sentiment Analysis:   2%|▏         | 469/22843 [02:56<2:30:01,  2.49it/s]

💾 Saved progress at 15008 reviews


Sentiment Analysis:   3%|▎         | 625/22843 [03:42<2:18:05,  2.68it/s]

💾 Saved progress at 20000 reviews


Sentiment Analysis:   3%|▎         | 782/22843 [04:28<2:33:19,  2.40it/s]

💾 Saved progress at 25024 reviews


Sentiment Analysis:   4%|▍         | 938/22843 [05:21<3:33:54,  1.71it/s]

💾 Saved progress at 30016 reviews


Sentiment Analysis:   5%|▍         | 1094/22843 [06:05<2:51:06,  2.12it/s]

💾 Saved progress at 35008 reviews


Sentiment Analysis:   5%|▌         | 1250/22843 [06:52<3:24:34,  1.76it/s]

💾 Saved progress at 40000 reviews


Sentiment Analysis:   6%|▌         | 1407/22843 [07:38<3:20:38,  1.78it/s]

💾 Saved progress at 45024 reviews


Sentiment Analysis:   7%|▋         | 1563/22843 [08:24<3:38:12,  1.63it/s]

💾 Saved progress at 50016 reviews


Sentiment Analysis:   8%|▊         | 1719/22843 [09:10<3:28:16,  1.69it/s]

💾 Saved progress at 55008 reviews


Sentiment Analysis:   8%|▊         | 1875/22843 [09:56<4:32:20,  1.28it/s]

💾 Saved progress at 60000 reviews


Sentiment Analysis:   9%|▉         | 2032/22843 [10:45<4:17:57,  1.34it/s]

💾 Saved progress at 65024 reviews


Sentiment Analysis:  10%|▉         | 2188/22843 [11:32<4:10:47,  1.37it/s]

💾 Saved progress at 70016 reviews


Sentiment Analysis:  10%|█         | 2344/22843 [12:20<6:35:26,  1.16s/it]

💾 Saved progress at 75008 reviews


Sentiment Analysis:  11%|█         | 2500/22843 [13:07<4:29:47,  1.26it/s]

💾 Saved progress at 80000 reviews


Sentiment Analysis:  12%|█▏        | 2657/22843 [13:55<4:37:27,  1.21it/s]

💾 Saved progress at 85024 reviews


Sentiment Analysis:  12%|█▏        | 2813/22843 [14:44<7:11:08,  1.29s/it]

💾 Saved progress at 90016 reviews


Sentiment Analysis:  13%|█▎        | 2969/22843 [15:30<4:45:26,  1.16it/s]

💾 Saved progress at 95008 reviews


Sentiment Analysis:  14%|█▎        | 3125/22843 [16:20<5:46:35,  1.05s/it]

💾 Saved progress at 100000 reviews


Sentiment Analysis:  14%|█▍        | 3282/22843 [17:09<7:06:41,  1.31s/it]

💾 Saved progress at 105024 reviews


Sentiment Analysis:  15%|█▌        | 3438/22843 [17:56<5:28:58,  1.02s/it]

💾 Saved progress at 110016 reviews


Sentiment Analysis:  16%|█▌        | 3594/22843 [18:43<6:05:45,  1.14s/it]

💾 Saved progress at 115008 reviews


Sentiment Analysis:  16%|█▋        | 3750/22843 [19:30<6:04:24,  1.15s/it]

💾 Saved progress at 120000 reviews


Sentiment Analysis:  17%|█▋        | 3907/22843 [20:16<5:43:17,  1.09s/it]

💾 Saved progress at 125024 reviews


Sentiment Analysis:  18%|█▊        | 4063/22843 [21:03<7:37:51,  1.46s/it]

💾 Saved progress at 130016 reviews


Sentiment Analysis:  18%|█▊        | 4219/22843 [21:49<5:49:18,  1.13s/it]

💾 Saved progress at 135008 reviews


Sentiment Analysis:  19%|█▉        | 4375/22843 [22:37<8:14:42,  1.61s/it]

💾 Saved progress at 140000 reviews


Sentiment Analysis:  20%|█▉        | 4532/22843 [23:25<6:07:20,  1.20s/it]

💾 Saved progress at 145024 reviews


Sentiment Analysis:  21%|██        | 4688/22843 [24:12<6:45:32,  1.34s/it]

💾 Saved progress at 150016 reviews


Sentiment Analysis:  21%|██        | 4844/22843 [24:59<7:39:00,  1.53s/it]

💾 Saved progress at 155008 reviews


Sentiment Analysis:  22%|██▏       | 5000/22843 [25:45<6:51:26,  1.38s/it]

💾 Saved progress at 160000 reviews


Sentiment Analysis:  23%|██▎       | 5157/22843 [26:34<7:40:18,  1.56s/it]

💾 Saved progress at 165024 reviews


Sentiment Analysis:  23%|██▎       | 5313/22843 [27:22<7:49:36,  1.61s/it]

💾 Saved progress at 170016 reviews


Sentiment Analysis:  24%|██▍       | 5469/22843 [28:09<6:35:22,  1.37s/it]

💾 Saved progress at 175008 reviews


Sentiment Analysis:  25%|██▍       | 5625/22843 [28:56<8:45:40,  1.83s/it]

💾 Saved progress at 180000 reviews


Sentiment Analysis:  25%|██▌       | 5782/22843 [29:44<7:43:32,  1.63s/it]

💾 Saved progress at 185024 reviews


Sentiment Analysis:  26%|██▌       | 5938/22843 [30:32<7:15:04,  1.54s/it]

💾 Saved progress at 190016 reviews


Sentiment Analysis:  27%|██▋       | 6094/22843 [31:21<7:59:41,  1.72s/it]

💾 Saved progress at 195008 reviews


Sentiment Analysis:  27%|██▋       | 6250/22843 [32:11<8:47:53,  1.91s/it]

💾 Saved progress at 200000 reviews


Sentiment Analysis:  28%|██▊       | 6407/22843 [32:59<7:21:55,  1.61s/it]

💾 Saved progress at 205024 reviews


Sentiment Analysis:  29%|██▊       | 6563/22843 [33:48<7:40:40,  1.70s/it]

💾 Saved progress at 210016 reviews


Sentiment Analysis:  29%|██▉       | 6719/22843 [34:39<8:47:26,  1.96s/it]

💾 Saved progress at 215008 reviews


Sentiment Analysis:  30%|███       | 6875/22843 [35:27<8:50:33,  1.99s/it]

💾 Saved progress at 220000 reviews


Sentiment Analysis:  31%|███       | 7032/22843 [36:16<7:37:46,  1.74s/it]

💾 Saved progress at 225024 reviews


Sentiment Analysis:  31%|███▏      | 7188/22843 [37:07<7:52:50,  1.81s/it]

💾 Saved progress at 230016 reviews


Sentiment Analysis:  32%|███▏      | 7344/22843 [37:58<8:44:52,  2.03s/it]

💾 Saved progress at 235008 reviews


Sentiment Analysis:  33%|███▎      | 7500/22843 [38:48<9:17:12,  2.18s/it]

💾 Saved progress at 240000 reviews


Sentiment Analysis:  34%|███▎      | 7657/22843 [39:38<8:53:37,  2.11s/it]

💾 Saved progress at 245024 reviews


Sentiment Analysis:  34%|███▍      | 7813/22843 [40:28<8:18:18,  1.99s/it]

💾 Saved progress at 250016 reviews


Sentiment Analysis:  35%|███▍      | 7969/22843 [41:19<7:41:59,  1.86s/it]

💾 Saved progress at 255008 reviews


Sentiment Analysis:  36%|███▌      | 8125/22843 [42:10<9:01:34,  2.21s/it]

💾 Saved progress at 260000 reviews


Sentiment Analysis:  36%|███▋      | 8282/22843 [43:01<9:09:26,  2.26s/it]

💾 Saved progress at 265024 reviews


Sentiment Analysis:  37%|███▋      | 8438/22843 [43:51<9:02:26,  2.26s/it]

💾 Saved progress at 270016 reviews


Sentiment Analysis:  38%|███▊      | 8594/22843 [44:45<10:15:33,  2.59s/it]

💾 Saved progress at 275008 reviews


Sentiment Analysis:  38%|███▊      | 8750/22843 [45:39<9:12:05,  2.35s/it]

💾 Saved progress at 280000 reviews


Sentiment Analysis:  39%|███▉      | 8907/22843 [46:32<8:58:19,  2.32s/it]

💾 Saved progress at 285024 reviews


Sentiment Analysis:  40%|███▉      | 9063/22843 [47:25<9:02:38,  2.36s/it]

💾 Saved progress at 290016 reviews


Sentiment Analysis:  40%|████      | 9219/22843 [48:17<9:22:26,  2.48s/it]

💾 Saved progress at 295008 reviews


Sentiment Analysis:  41%|████      | 9375/22843 [49:09<9:33:11,  2.55s/it]

💾 Saved progress at 300000 reviews


Sentiment Analysis:  42%|████▏     | 9532/22843 [50:05<9:19:57,  2.52s/it]

💾 Saved progress at 305024 reviews


Sentiment Analysis:  42%|████▏     | 9688/22843 [50:59<8:36:16,  2.35s/it]

💾 Saved progress at 310016 reviews


Sentiment Analysis:  43%|████▎     | 9844/22843 [51:52<8:34:02,  2.37s/it]

💾 Saved progress at 315008 reviews


Sentiment Analysis:  44%|████▍     | 10000/22843 [52:45<8:31:47,  2.39s/it]

💾 Saved progress at 320000 reviews


Sentiment Analysis:  44%|████▍     | 10157/22843 [53:39<9:19:05,  2.64s/it]

💾 Saved progress at 325024 reviews


Sentiment Analysis:  45%|████▌     | 10313/22843 [54:34<9:48:04,  2.82s/it]

💾 Saved progress at 330016 reviews


Sentiment Analysis:  46%|████▌     | 10469/22843 [55:29<10:20:15,  3.01s/it]

💾 Saved progress at 335008 reviews


Sentiment Analysis:  47%|████▋     | 10625/22843 [56:23<9:44:47,  2.87s/it]

💾 Saved progress at 340000 reviews


Sentiment Analysis:  47%|████▋     | 10782/22843 [57:18<9:23:36,  2.80s/it]

💾 Saved progress at 345024 reviews


Sentiment Analysis:  48%|████▊     | 10938/22843 [58:13<9:04:01,  2.74s/it]

💾 Saved progress at 350016 reviews


Sentiment Analysis:  49%|████▊     | 11094/22843 [59:08<9:19:04,  2.86s/it]

💾 Saved progress at 355008 reviews


Sentiment Analysis:  49%|████▉     | 11250/22843 [1:00:02<9:34:15,  2.97s/it]

💾 Saved progress at 360000 reviews


Sentiment Analysis:  50%|████▉     | 11407/22843 [1:01:10<9:33:04,  3.01s/it]

💾 Saved progress at 365024 reviews


Sentiment Analysis:  51%|█████     | 11563/22843 [1:02:05<9:50:46,  3.14s/it]

💾 Saved progress at 370016 reviews


Sentiment Analysis:  51%|█████▏    | 11719/22843 [1:02:59<9:27:04,  3.06s/it]

💾 Saved progress at 375008 reviews


Sentiment Analysis:  52%|█████▏    | 11875/22843 [1:03:57<9:02:26,  2.97s/it]

💾 Saved progress at 380000 reviews


Sentiment Analysis:  53%|█████▎    | 12032/22843 [1:04:52<9:17:08,  3.09s/it]

💾 Saved progress at 385024 reviews


Sentiment Analysis:  53%|█████▎    | 12188/22843 [1:05:46<9:13:11,  3.12s/it]

💾 Saved progress at 390016 reviews


Sentiment Analysis:  54%|█████▍    | 12344/22843 [1:06:42<9:26:16,  3.24s/it]

💾 Saved progress at 395008 reviews


Sentiment Analysis:  55%|█████▍    | 12500/22843 [1:07:38<9:42:19,  3.38s/it]

💾 Saved progress at 400000 reviews


Sentiment Analysis:  55%|█████▌    | 12657/22843 [1:08:35<8:48:24,  3.11s/it]

💾 Saved progress at 405024 reviews


Sentiment Analysis:  56%|█████▌    | 12813/22843 [1:09:32<9:43:59,  3.49s/it]

💾 Saved progress at 410016 reviews


Sentiment Analysis:  57%|█████▋    | 12969/22843 [1:10:28<9:37:14,  3.51s/it]

💾 Saved progress at 415008 reviews


Sentiment Analysis:  57%|█████▋    | 13125/22843 [1:11:23<8:44:05,  3.24s/it]

💾 Saved progress at 420000 reviews


Sentiment Analysis:  58%|█████▊    | 13282/22843 [1:12:21<9:35:26,  3.61s/it]

💾 Saved progress at 425024 reviews


Sentiment Analysis:  59%|█████▉    | 13438/22843 [1:13:18<9:03:47,  3.47s/it]

💾 Saved progress at 430016 reviews


Sentiment Analysis:  60%|█████▉    | 13594/22843 [1:14:14<9:13:01,  3.59s/it]

💾 Saved progress at 435008 reviews


Sentiment Analysis:  60%|██████    | 13750/22843 [1:15:13<9:14:11,  3.66s/it]

💾 Saved progress at 440000 reviews


Sentiment Analysis:  61%|██████    | 13907/22843 [1:16:09<9:02:14,  3.64s/it]

💾 Saved progress at 445024 reviews


Sentiment Analysis:  62%|██████▏   | 14063/22843 [1:17:05<9:07:02,  3.74s/it]

💾 Saved progress at 450016 reviews


Sentiment Analysis:  62%|██████▏   | 14219/22843 [1:18:03<8:46:05,  3.66s/it]

💾 Saved progress at 455008 reviews


Sentiment Analysis:  63%|██████▎   | 14375/22843 [1:19:00<8:53:07,  3.78s/it]

💾 Saved progress at 460000 reviews


Sentiment Analysis:  64%|██████▎   | 14532/22843 [1:19:56<8:30:40,  3.69s/it]

💾 Saved progress at 465024 reviews


Sentiment Analysis:  64%|██████▍   | 14688/22843 [1:20:54<8:45:07,  3.86s/it]

💾 Saved progress at 470016 reviews


Sentiment Analysis:  65%|██████▍   | 14844/22843 [1:21:52<9:34:21,  4.31s/it]

💾 Saved progress at 475008 reviews


Sentiment Analysis:  66%|██████▌   | 15000/22843 [1:22:48<8:11:29,  3.76s/it]

💾 Saved progress at 480000 reviews


Sentiment Analysis:  66%|██████▋   | 15157/22843 [1:23:45<8:14:24,  3.86s/it]

💾 Saved progress at 485024 reviews


Sentiment Analysis:  67%|██████▋   | 15313/22843 [1:24:42<8:08:39,  3.89s/it]

💾 Saved progress at 490016 reviews


Sentiment Analysis:  68%|██████▊   | 15469/22843 [1:25:38<7:53:12,  3.85s/it]

💾 Saved progress at 495008 reviews


Sentiment Analysis:  68%|██████▊   | 15625/22843 [1:26:39<8:24:01,  4.19s/it]

💾 Saved progress at 500000 reviews


Sentiment Analysis:  69%|██████▉   | 15782/22843 [1:27:36<7:54:36,  4.03s/it]

💾 Saved progress at 505024 reviews


Sentiment Analysis:  70%|██████▉   | 15938/22843 [1:28:32<7:43:03,  4.02s/it]

💾 Saved progress at 510016 reviews


Sentiment Analysis:  70%|███████   | 16094/22843 [1:29:31<7:53:26,  4.21s/it]

💾 Saved progress at 515008 reviews


Sentiment Analysis:  71%|███████   | 16250/22843 [1:30:28<7:18:10,  3.99s/it]

💾 Saved progress at 520000 reviews


Sentiment Analysis:  72%|███████▏  | 16407/22843 [1:31:28<7:25:01,  4.15s/it]

💾 Saved progress at 525024 reviews


Sentiment Analysis:  73%|███████▎  | 16563/22843 [1:32:26<7:11:48,  4.13s/it]

💾 Saved progress at 530016 reviews


Sentiment Analysis:  73%|███████▎  | 16719/22843 [1:33:24<7:10:18,  4.22s/it]

💾 Saved progress at 535008 reviews


Sentiment Analysis:  74%|███████▍  | 16875/22843 [1:34:22<7:02:13,  4.24s/it]

💾 Saved progress at 540000 reviews


Sentiment Analysis:  75%|███████▍  | 17032/22843 [1:35:21<6:54:49,  4.28s/it]

💾 Saved progress at 545024 reviews


Sentiment Analysis:  75%|███████▌  | 17188/22843 [1:36:18<6:45:19,  4.30s/it]

💾 Saved progress at 550016 reviews


Sentiment Analysis:  76%|███████▌  | 17344/22843 [1:37:17<6:50:58,  4.48s/it]

💾 Saved progress at 555008 reviews


Sentiment Analysis:  77%|███████▋  | 17500/22843 [1:38:15<6:36:42,  4.45s/it]

💾 Saved progress at 560000 reviews


Sentiment Analysis:  77%|███████▋  | 17657/22843 [1:39:15<6:23:01,  4.43s/it]

💾 Saved progress at 565024 reviews


Sentiment Analysis:  78%|███████▊  | 17813/22843 [1:40:13<6:12:25,  4.44s/it]

💾 Saved progress at 570016 reviews


Sentiment Analysis:  79%|███████▊  | 17969/22843 [1:41:15<6:18:53,  4.66s/it]

💾 Saved progress at 575008 reviews


Sentiment Analysis:  79%|███████▉  | 18125/22843 [1:42:16<6:08:26,  4.69s/it]

💾 Saved progress at 580000 reviews


Sentiment Analysis:  80%|████████  | 18282/22843 [1:43:16<5:38:33,  4.45s/it]

💾 Saved progress at 585024 reviews


Sentiment Analysis:  81%|████████  | 18438/22843 [1:44:17<6:00:21,  4.91s/it]

💾 Saved progress at 590016 reviews


Sentiment Analysis:  81%|████████▏ | 18594/22843 [1:45:16<5:27:55,  4.63s/it]

💾 Saved progress at 595008 reviews


Sentiment Analysis:  82%|████████▏ | 18750/22843 [1:46:17<5:15:06,  4.62s/it]

💾 Saved progress at 600000 reviews


Sentiment Analysis:  83%|████████▎ | 18907/22843 [1:47:19<5:18:53,  4.86s/it]

💾 Saved progress at 605024 reviews


Sentiment Analysis:  83%|████████▎ | 19063/22843 [1:48:19<4:59:46,  4.76s/it]

💾 Saved progress at 610016 reviews


Sentiment Analysis:  84%|████████▍ | 19219/22843 [1:49:17<4:46:12,  4.74s/it]

💾 Saved progress at 615008 reviews


Sentiment Analysis:  85%|████████▍ | 19375/22843 [1:50:17<5:02:04,  5.23s/it]

💾 Saved progress at 620000 reviews


Sentiment Analysis:  86%|████████▌ | 19532/22843 [1:51:16<4:33:30,  4.96s/it]

💾 Saved progress at 625024 reviews


Sentiment Analysis:  86%|████████▌ | 19688/22843 [1:52:15<4:14:42,  4.84s/it]

💾 Saved progress at 630016 reviews


Sentiment Analysis:  87%|████████▋ | 19844/22843 [1:53:15<4:16:24,  5.13s/it]

💾 Saved progress at 635008 reviews


Sentiment Analysis:  88%|████████▊ | 20000/22843 [1:54:13<3:50:06,  4.86s/it]

💾 Saved progress at 640000 reviews


Sentiment Analysis:  88%|████████▊ | 20157/22843 [1:55:15<3:51:47,  5.18s/it]

💾 Saved progress at 645024 reviews


Sentiment Analysis:  89%|████████▉ | 20313/22843 [1:56:14<3:28:39,  4.95s/it]

💾 Saved progress at 650016 reviews


Sentiment Analysis:  90%|████████▉ | 20469/22843 [1:57:14<3:28:29,  5.27s/it]

💾 Saved progress at 655008 reviews


Sentiment Analysis:  90%|█████████ | 20625/22843 [1:58:15<3:04:32,  4.99s/it]

💾 Saved progress at 660000 reviews


Sentiment Analysis:  91%|█████████ | 20782/22843 [1:59:14<2:54:51,  5.09s/it]

💾 Saved progress at 665024 reviews


Sentiment Analysis:  92%|█████████▏| 20938/22843 [2:00:16<2:45:52,  5.22s/it]

💾 Saved progress at 670016 reviews


Sentiment Analysis:  92%|█████████▏| 21094/22843 [2:01:17<2:26:57,  5.04s/it]

💾 Saved progress at 675008 reviews


Sentiment Analysis:  93%|█████████▎| 21250/22843 [2:02:18<2:25:34,  5.48s/it]

💾 Saved progress at 680000 reviews


Sentiment Analysis:  94%|█████████▎| 21407/22843 [2:03:19<2:01:44,  5.09s/it]

💾 Saved progress at 685024 reviews


Sentiment Analysis:  94%|█████████▍| 21563/22843 [2:04:22<1:59:20,  5.59s/it]

💾 Saved progress at 690016 reviews


Sentiment Analysis:  95%|█████████▌| 21719/22843 [2:05:23<1:40:54,  5.39s/it]

💾 Saved progress at 695008 reviews


Sentiment Analysis:  96%|█████████▌| 21875/22843 [2:06:25<1:25:00,  5.27s/it]

💾 Saved progress at 700000 reviews


Sentiment Analysis:  96%|█████████▋| 22032/22843 [2:07:28<1:10:39,  5.23s/it]

💾 Saved progress at 705024 reviews


Sentiment Analysis:  97%|█████████▋| 22188/22843 [2:08:30<58:43,  5.38s/it]

💾 Saved progress at 710016 reviews


Sentiment Analysis:  98%|█████████▊| 22344/22843 [2:09:33<46:05,  5.54s/it]

💾 Saved progress at 715008 reviews


Sentiment Analysis:  98%|█████████▊| 22500/22843 [2:10:36<32:47,  5.74s/it]

💾 Saved progress at 720000 reviews


Sentiment Analysis:  99%|█████████▉| 22657/22843 [2:11:39<16:37,  5.37s/it]

💾 Saved progress at 725024 reviews


Sentiment Analysis: 100%|█████████▉| 22813/22843 [2:12:42<02:42,  5.41s/it]

💾 Saved progress at 730016 reviews


Sentiment Analysis: 100%|██████████| 22843/22843 [2:13:11<00:00,  2.86it/s]

💾 Saved progress at 730945 reviews





✅ Sentiment analysis complete and final CSV saved.


In [None]:
# Read the saved CSV back from Drive into a separate frame (df2).
df2 = pd.read_csv("/content/drive/MyDrive/cleaned_all_reviews.csv")

In [None]:
# Preview the reloaded frame, including the two sentiment columns.
df2.head()

Unnamed: 0,appid,review,word_count,voted_up,author_playtime_forever,name,price,clean_review,price_section,clean_review_sentiment,clean_review_sentiment_num
0,1938090,I really loved the game but itll take you a lo...,26,True,1485,Call of Duty: Modern Warfare II,69.99,really loved game itll take long time learn ta...,HIGH,5 stars,5
1,1938090,i cant join the game for some reason,8,False,346,Call of Duty: Modern Warfare II,69.99,cant join game reason,HIGH,5 stars,5
2,1938090,these new games are just living in the MASSIVE...,133,False,2131,Call of Duty: Modern Warfare II,69.99,new games living massive shadow black ops revi...,HIGH,1 star,1
3,1938090,Its fun of course you have cheaters but beside...,25,True,344,Call of Duty: Modern Warfare II,69.99,fun course cheaters besides game ment fun even...,HIGH,4 stars,4
4,1938090,I play this game as a casual player on PC. I h...,39,False,1746,Call of Duty: Modern Warfare II,69.99,play game casual player mods hack shadow banne...,HIGH,1 star,1


In [None]:
# Check dtypes and non-null counts after the CSV round-trip.
# NOTE(review): clean_review reports fewer non-null rows than the frame has —
# presumably reviews that cleaned down to "" are read back as NaN; confirm
# before using this column from the reloaded file.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730945 entries, 0 to 730944
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   appid                       730945 non-null  int64  
 1   review                      730945 non-null  object 
 2   word_count                  730945 non-null  int64  
 3   voted_up                    730945 non-null  bool   
 4   author_playtime_forever     730945 non-null  int64  
 5   name                        730945 non-null  object 
 6   price                       730945 non-null  float64
 7   clean_review                726486 non-null  object 
 8   price_section               730945 non-null  object 
 9   clean_review_sentiment      730945 non-null  object 
 10  clean_review_sentiment_num  730945 non-null  int64  
dtypes: bool(1), float64(1), int64(4), object(5)
memory usage: 56.5+ MB


In [None]:
# Display the raw sentiment labels (strings such as "5 stars" / "1 star").
df2["clean_review_sentiment"]

Unnamed: 0,clean_review_sentiment
0,5 stars
1,5 stars
2,1 star
3,4 stars
4,1 star
...,...
730940,4 stars
730941,5 stars
730942,5 stars
730943,4 stars


In [None]:
# Make sure both sentiment columns exist (created as all-NA when absent), then
# report how many missing values each one holds.
sentiment_cols = ["clean_review_sentiment", "clean_review_sentiment_num"]
for col in sentiment_cols:
    if col in df2.columns:
        continue
    df2[col] = pd.NA

label_nulls = df2["clean_review_sentiment"].isnull().sum()
score_nulls = df2["clean_review_sentiment_num"].isnull().sum()
print(label_nulls)
print(score_nulls)


0
0
