In [61]:
# importing necessary libraries
from pathlib import Path

import pandas as pd
import spacy
from spacy.tokens import DocBin

In [62]:
# Location of the scraped Steam reviews (semicolon-separated CSV).
# NOTE(review): absolute, machine-specific path -- anyone else running this
# notebook has to point it at their own copy of the file.
fp = r"C:\Users\Tino\Desktop\Python\Steam review scraping\reviews.csv"
# The first CSV column holds the row labels, so it is used as the index.
df = pd.read_csv(filepath_or_buffer=fp, delimiter=";", index_col=0)
# Preview the first five rows.
df.head(5)

Unnamed: 0,Found helpful,Recommend,Hours on record,Date posted,Review text
0,1839,True,46.3,"1 November, 2022","“Don't Be Sorry, Be Better.”"
1,4301,True,42.9,"8 November, 2022",Bring God Of War Ragnarok on PC
2,1860,True,18.2,"14 January, 2022",It's rare to see this much polish (on a consol...
3,7225,True,36.4,"17 January, 2022","No additional account, no unnecessary launcher..."
4,2340,True,11.3,"22 December, 2022",hi


In [63]:
# Count the missing values in each column.
df.isnull().sum()

Found helpful        0
Recommend            0
Hours on record      0
Date posted          0
Review text        154
dtype: int64

In [64]:
# Remove every row that has a missing value, then keep only the two columns needed for NLP.
df = df.dropna(axis="index", how="any").loc[:, ["Recommend", "Review text"]]
# Check how balanced the two classes are.
df.loc[:, "Recommend"].value_counts()

True     36230
False     1276
Name: Recommend, dtype: int64

In [65]:
# Very short reviews may be noise, so inspect them before deciding whether to keep them.
# Boolean mask: True where the review text is fewer than 5 characters long.
mask = df["Review text"].str.len().lt(5)
# Show the distinct texts found among those short reviews.
df.loc[mask, "Review text"].unique()

array(['hi', 'BOY', 'boy', ' BOY', 'Boy.', 'BOY!', '👍', 'boy.', 'boi',
       'BOI', ' ᠌', 'Boi.', 'Boy', ':3', 'Boi!', '`', 'OwO', 'good',
       'yes', 'BOY.', 'ok', 'Yes', 'goty', '^_^', 'Boi', 'Boy!', 'GOTY',
       'yes!', 'fun', 'boi.', 'Buoy', 'angy', 'ong', 'game', '.', 'e',
       'Yes.', 'Good', '1337', 'bOY', 'WoaW', 'gg', '++++', 'boii', 'hjj',
       '!', 'ggo', 'gud', 'nice', 'HI', 'GOOD', 'pog', 'yup', 'Gud',
       'boat', 'Ω', 'WOW', 'øks', 'bald', 'epic', 'Fire', 'real', 'peak',
       'bro', 'l', 'm', 'god', 'GAME', 'jeng', 'kill', '9/10', 'Nice',
       'boy!', 'Hmm', 'ies', '3>', ':)', 'top', '1+', 'GOAT', 'dem',
       'yea', 'Epic', 'BOY+', '👍👍👍', '<3', 'gda', 'ye', 'SEX', 'mhm',
       'NKJH', 'لا', 'w', ':D', 'Sure', 'Bald', 'swag', 'voi¡', 'GUD',
       '5*', 'best', 'yes.', '❤', 'BOI.', ';)', '...', 'YOB', 'WOW!',
       ' boy', '+', 'BOI!', 'yeah', '5/5', 'ujgb', 'GOD', 'fye', 'ggz',
       'Fr', 'A1', 'God.', 'ᛒᛟᛁ', 'Gid.', 'gud!', 'ygh', 'yep', '[',
      

In [66]:
# Reviews shorter than 5 characters carry little training value, so they are dropped.
# The filter is recomputed from the current frame instead of reusing `mask` from the
# inspection cell: once the index has been reset, that older mask no longer lines up
# with `df`, so re-running this cell would fail or drop the wrong rows.
is_too_short = df["Review text"].str.len() < 5
# Keep the remaining rows, put the text column first (the model input is a
# (text, recommend) tuple) and renumber the index.
df = df.loc[~is_too_short, ["Review text", "Recommend"]].reset_index(drop=True)

In [60]:
# Positive reviews vastly outnumber negative ones, so the positives are
# downsampled to the number of negatives.
RANDOM_STATE = 42  # fixed seed so the same sample is drawn on every run

is_recommended = df["Recommend"].astype(bool)
df_neg = df[~is_recommended]
df_pos_all = df[is_recommended]
# Draw the positives at random rather than taking the first rows, which would
# tie the sample to whatever order the CSV happens to be in. min() guards the
# case where there are fewer positives than negatives.
df_pos = df_pos_all.sample(
    n=min(len(df_neg), len(df_pos_all)),
    random_state=RANDOM_STATE,
)
# Combine both classes into a single frame with a fresh 0..n-1 index.
df_balanced = pd.concat([df_neg, df_pos], axis=0).reset_index(drop=True)

In [67]:
# Turn each row of the balanced frame into a plain (text, recommend) tuple.
tuples = [
    (text, recommended)
    for text, recommended in df_balanced.itertuples(index=False, name=None)
]


In [68]:
nlp = spacy.load("en_core_web_md")
training_data = []
# Run every review through the pipeline and attach the two category labels:
# exactly one of them is set to 1, the other to 0.
for doc, is_recommended in nlp.pipe(tuples, as_tuples=True):
    positive = 1 if is_recommended == True else 0
    doc.cats["Recommended"] = positive
    doc.cats["Not recommended"] = 1 - positive
    training_data.append(doc)

In [69]:
# Inspect the category labels attached to the first document.
first_doc = training_data[0]
first_doc.cats

{'Recommended': 0, 'Not recommended': 1}

In [70]:
# Serialize the labelled docs into spaCy's binary training format.
train_path = Path("./Data/train.spacy")
# Create the output folder on demand so the cell also works when "Data"
# has not been created by hand beforehand.
train_path.parent.mkdir(parents=True, exist_ok=True)
doc_bin = DocBin(docs=training_data)
doc_bin.to_disk(train_path)

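# Training the model (TODO)

Run the following in a terminal from the notebook's folder; these are shell commands, not Python. `spacy train` needs both a training and a development corpus: if `config.cfg` does not already set `paths.train` and `paths.dev`, pass them on the command line with `--paths.train` / `--paths.dev`. This notebook only writes `./Data/train.spacy`, so a development set still has to be created.

python -m spacy init fill-config ./base_config.cfg ./config.cfg
python -m spacy train config.cfg --output ./Output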


### Testing the model


In [86]:
# Load the best checkpoint written to the training output folder.
trained_model_dir = "Output/model-best"
nlp = spacy.load(trained_model_dir)


In [100]:
# Score a sample text with the trained model; `.cats` maps each label to its score.
# NOTE(review): in the recorded output the two scores do not sum to 1, which suggests
# the categorizer is scoring the labels independently -- confirm the training config
# matches the intent of mutually exclusive labels.
sample_text = "dog shit ass game dont buy"
review = nlp(sample_text)
review.cats

{'Recommended': 0.5304109454154968, 'Not recommended': 0.49441006779670715}