Combining Meta-data and review data

Subsetting: https://stackoverflow.com/questions/11350770/filter-pandas-dataframe-by-substring-criteria \
replacing values: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html \
Lemmatization: https://stackoverflow.com/questions/47498293/how-to-lemmatize-strings-in-pandas-dataframes \
stopword removal: https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe \
adding own stopwords: https://stackoverflow.com/questions/26826002/adding-words-to-stop-words-list-in-tfidfvectorizer-in-sklearn \
cleaning text: https://www.youtube.com/watch?v=8Fw1nh8lR54


In [None]:
# Data handling
import pandas
import numpy as np
import copy
# scikit-learn's built-in English stop-word list (extended with custom words in the text-cleaning cell)
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
from nltk.stem import WordNetLemmatizer
# NOTE(review): this NLTK lemmatizer is not referenced in the visible cells;
# the lemmatization step uses the spaCy pipeline `nlp` instead.
lemmatizer = WordNetLemmatizer()
# Plotting
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import spacy
# Loads the spaCy pipeline named "en_core_web_sm"; that model package has to be installed for this call to succeed.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load the review dataset and the metadata dataset.
# The Windows paths are written as raw strings: in a normal string literal
# sequences such as "\M", "\B" or "\d" are invalid escape sequences, which
# Python already warns about and plans to reject in a future version.
# NOTE(review): this is an absolute, machine-specific folder - adjust DATA_DIR
# when running the notebook elsewhere.
DATA_DIR = r"F:\Master\Block 3\Thesis proposal\Scripts new"

review1 = pandas.read_csv(DATA_DIR + r"\dataset.csv")
# The metadata file is semicolon-separated.
meta_data = pandas.read_csv(DATA_DIR + r"\Metadata.csv", sep=";")

In [None]:
# Quick look at both raw tables: first the leading five rows of each ...
for _frame in (review1, meta_data):
    print(_frame.head())

# ... then the column names of each, to decide which columns to drop.
for _frame in (review1, meta_data):
    print(_frame.columns)



In [None]:
# Reviews: the "review_votes" column is not needed further on.
review1 = review1.drop(columns=["review_votes"])

# Metadata: keep only the join key ("App ID") and the tag string ("Tags").
meta_data = meta_data.loc[:, ["App ID", "Tags"]]


In [None]:
# Recode the label so that the minority class (negative reviews) becomes the
# positive class:
#   original  1  ->  0
#   original -1  ->  1
# The counts are printed explicitly: a bare expression that is not the last
# line of a cell is evaluated but never displayed.
print(review1["review_score"].value_counts())

# A single mapping replaces both values at once, so the result does not depend
# on the order of two separate replace() calls.
# NOTE: run this cell only once per load - running it again would turn the
# freshly assigned 1s into 0s.
review1["review_score"] = review1["review_score"].replace({1: 0, -1: 1})

print(review1["review_score"].value_counts())

In [None]:
# Attach the tags of each game to its reviews. The inner join keeps only
# reviews whose app_id also occurs as "App ID" in the metadata.
complete = review1.merge(meta_data, how="inner", left_on="app_id", right_on="App ID")

# Keep only reviews of at least 600 characters. Values are converted to text
# before measuring, so missing reviews fall far below the threshold and are
# dropped as well.
review_lengths = complete["review_text"].astype(str).str.len()
Complete_600 = complete[review_lengths >= 600]

In [None]:
# Separate working frames for the two topic models:
# CTM gets its own deep copy because its text is overwritten during cleaning,
# while the BERTopic frame keeps pointing at the uncleaned reviews.
Dataset_CTM = Complete_600.copy(deep=True)
Dataset_Bert = Complete_600



Text Cleaning

In [None]:

# Text cleaning for the CTM frame: lower-case, strip punctuation and digits,
# remove stop words, then lemmatize. A word cloud is drawn before and after
# lemmatization so the effect of that step can be compared.


def _show_wordcloud(texts):
    """Draw a word cloud from all strings in `texts` and return the WordCloud."""
    # Join the documents explicitly. str() of a NumPy array is its printed
    # representation (brackets, quotes and, for large arrays, an ellipsis in
    # place of most elements), so the cloud would not reflect the full corpus.
    cloud = WordCloud().generate(" ".join(texts))
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
    return cloud


# cleaning data: lower-casing
Dataset_CTM["review_text"] = Dataset_CTM["review_text"].str.lower()

# cleaning data: removing punctuation
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would leave the punctuation in place.
Dataset_CTM["review_text"] = Dataset_CTM["review_text"].str.replace(r"[^\w\s]", "", regex=True)
# "_" counts as a word character for \w, so it is removed separately.
Dataset_CTM["review_text"] = Dataset_CTM["review_text"].str.replace("_", "", regex=False)

# cleaning data: removing numbers
Dataset_CTM["review_text"] = Dataset_CTM["review_text"].str.replace(r"\d+", "", regex=True)

# removing stop words (scikit-learn's English list plus domain-specific words)
additional = ["game", "play"]
stop = ENGLISH_STOP_WORDS.union(additional)
Dataset_CTM["review_text"] = Dataset_CTM["review_text"].apply(
    lambda x: " ".join(word for word in x.split() if word not in stop)
)
# NOTE(review): stop words are removed before lemmatization, so inflected forms
# (e.g. "games", "playing") pass this filter and are probably reduced to the
# custom stop words afterwards - consider filtering once more after lemmatizing.

# word cloud of the cleaned text BEFORE lemmatization (baseline for comparison)
text = Dataset_CTM["review_text"].values
wordcloud = _show_wordcloud(text)

# lemmatizing the words
# nlp.pipe processes the reviews as a batched stream, which is considerably
# faster than calling nlp() once per row.
Dataset_CTM["review_text"] = [
    " ".join(token.lemma_ for token in doc)
    for doc in nlp.pipe(Dataset_CTM["review_text"])
]

# word cloud AFTER lemmatization, to check whether lemmatizing was successful
text = Dataset_CTM["review_text"].values
wordcloud = _show_wordcloud(text)



Tag subsetting CTM

In [None]:
# Cleaning leftover NA values in the CTM frame.
# The inspection results are printed explicitly: only the last expression of a
# cell is displayed, so bare expressions above it are silently discarded.
print(Dataset_CTM.head())
print(Dataset_CTM.isna().sum())

# Keep only rows that still have a review text.
Dataset_CTM = Dataset_CTM[Dataset_CTM["review_text"].notna()]

# Number of distinct tag strings left.
Dataset_CTM["Tags"].nunique()

In [None]:
# Sample dataset: draw 4 reviews per distinct tag string. Sampling is done with
# replacement so that tag groups with fewer than 4 reviews do not fail; repeated
# draws of the same review are removed again afterwards.
Dataset_CTM_sample = (
    Dataset_CTM
    .groupby("Tags")
    .sample(n=4, random_state=101, replace=True)
    .drop_duplicates(subset=["review_text", "App ID"])
)

# Reviews per tag string after de-duplication.
Dataset_CTM_sample["Tags"].value_counts()


In [None]:
# JRPG dataset (CTM): reviews whose tag string contains "JRPG" (case-insensitive).
# na=False counts a missing tag string as "no match"; without it a NaN in
# "Tags" puts a NaN into the mask and the boolean selection raises.
# regex=False: the search term is a plain substring, not a pattern.
Dataset_CTM_JRPG = Dataset_CTM[
    Dataset_CTM["Tags"].str.contains("JRPG", case=False, na=False, regex=False)
]

# Printed explicitly - bare expressions that are not the last line of a cell
# are never displayed.
print("Unique games:", Dataset_CTM_JRPG["App ID"].nunique())
print(Dataset_CTM_JRPG["review_score"].value_counts())

# NOTE(review): the BERTopic JRPG cell later writes to the same "JRPG.csv"
# path and overwrites this file; the export cell also saves this frame as
# "JRPG_final.csv". Confirm whether this extra export is still needed.
Dataset_CTM_JRPG.to_csv("JRPG.csv")

In [None]:
# Rogue-lite dataset (CTM): reviews whose tag string contains "rogue-lite"
# (case-insensitive, plain substring match).
# na=False counts a missing tag string as "no match"; without it a NaN in
# "Tags" puts a NaN into the mask and the boolean selection raises.
Dataset_CTM_RL = Dataset_CTM[
    Dataset_CTM["Tags"].str.contains("rogue-lite", case=False, na=False, regex=False)
]

# Printed explicitly - only the last expression of a cell is displayed.
print("Unique games:", Dataset_CTM_RL["App ID"].nunique())
Dataset_CTM_RL["review_score"].value_counts()

In [None]:
# Visual novel dataset (CTM): reviews whose tag string contains "visual novel"
# (case-insensitive).
# na=False counts a missing tag string as "no match"; without it a NaN in
# "Tags" puts a NaN into the mask and the boolean selection raises.
# regex=False: the search term is a plain substring, as in the rogue-lite cell.
Dataset_CTM_novel = Dataset_CTM[
    Dataset_CTM["Tags"].str.contains("visual novel", case=False, na=False, regex=False)
]

# Printed explicitly - only the last expression of a cell is displayed.
print("Unique games:", Dataset_CTM_novel["App ID"].nunique())
Dataset_CTM_novel["review_score"].value_counts()

In [None]:
# Dungeon crawler dataset (CTM): reviews whose tag string contains
# "dungeon crawler" (case-insensitive).
# na=False counts a missing tag string as "no match"; without it a NaN in
# "Tags" puts a NaN into the mask and the boolean selection raises.
# regex=False: the search term is a plain substring, as in the rogue-lite cell.
Dataset_CTM_dungeon = Dataset_CTM[
    Dataset_CTM["Tags"].str.contains("dungeon crawler", case=False, na=False, regex=False)
]

# Printed explicitly - only the last expression of a cell is displayed.
print("Unique games:", Dataset_CTM_dungeon["App ID"].nunique())
Dataset_CTM_dungeon["review_score"].value_counts()

In [None]:
# Write every CTM subset to its own CSV file (row index not included).
ctm_exports = {
    "JRPG_final.csv": Dataset_CTM_JRPG,
    "novel_final.csv": Dataset_CTM_novel,
    "dungeon_final.csv": Dataset_CTM_dungeon,
    "sample.csv": Dataset_CTM_sample,
    "rogue_lite_final.csv": Dataset_CTM_RL,
}
for csv_name, subset in ctm_exports.items():
    subset.to_csv(csv_name, index=False)

Tag subsetting BERTopic

In [None]:
# Cleaning leftover NA values in the BERTopic frame.
# The inspection results are printed explicitly: only the last expression of a
# cell is displayed, so bare expressions above it are silently discarded.
print(Dataset_Bert.head())
print(Dataset_Bert.isna().sum())

# Keep only rows that have a review text.
Dataset_Bert = Dataset_Bert[Dataset_Bert["review_text"].notna()]

# Number of distinct tag strings left.
Dataset_Bert["Tags"].nunique()

In [None]:
# Sample dataset: draw 4 reviews per distinct tag string. Sampling is done with
# replacement so that tag groups with fewer than 4 reviews do not fail; repeated
# draws of the same review are removed again afterwards.
Dataset_Bert_sample = (
    Dataset_Bert
    .groupby("Tags")
    .sample(n=4, random_state=101, replace=True)
    .drop_duplicates(subset=["review_text", "App ID"])
)

# Reviews per tag string after de-duplication.
Dataset_Bert_sample["Tags"].value_counts()

In [None]:
# JRPG dataset (BERTopic): reviews whose tag string contains "JRPG"
# (case-insensitive).
# na=False counts a missing tag string as "no match"; without it a NaN in
# "Tags" puts a NaN into the mask and the boolean selection raises.
# regex=False: the search term is a plain substring, not a pattern.
Dataset_Bert_JRPG = Dataset_Bert[
    Dataset_Bert["Tags"].str.contains("JRPG", case=False, na=False, regex=False)
]

# Printed explicitly - bare expressions that are not the last line of a cell
# are never displayed.
print("Unique games:", Dataset_Bert_JRPG["App ID"].nunique())
print(Dataset_Bert_JRPG["review_score"].value_counts())

# NOTE(review): "JRPG.csv" is also written by the CTM JRPG cell, so this call
# overwrites the cleaned CTM export with the uncleaned BERTopic one. The export
# cell saves this frame again as "JRPG_final_Bert.csv" - confirm whether this
# extra export is still needed.
Dataset_Bert_JRPG.to_csv("JRPG.csv")

In [None]:
# Rogue-lite dataset (BERTopic): reviews whose tag string contains "rogue-lite"
# (case-insensitive, plain substring match).
# na=False counts a missing tag string as "no match"; without it a NaN in
# "Tags" puts a NaN into the mask and the boolean selection raises.
Dataset_Bert_RL = Dataset_Bert[
    Dataset_Bert["Tags"].str.contains("rogue-lite", case=False, na=False, regex=False)
]

# Printed explicitly - only the last expression of a cell is displayed.
print("Unique games:", Dataset_Bert_RL["App ID"].nunique())
Dataset_Bert_RL["review_score"].value_counts()

In [None]:
# Visual novel dataset (BERTopic): reviews whose tag string contains
# "visual novel" (case-insensitive).
# na=False counts a missing tag string as "no match"; without it a NaN in
# "Tags" puts a NaN into the mask and the boolean selection raises.
# regex=False: the search term is a plain substring, as in the rogue-lite cell.
Dataset_Bert_novel = Dataset_Bert[
    Dataset_Bert["Tags"].str.contains("visual novel", case=False, na=False, regex=False)
]

# Printed explicitly - only the last expression of a cell is displayed.
print("Unique games:", Dataset_Bert_novel["App ID"].nunique())
Dataset_Bert_novel["review_score"].value_counts()

In [None]:
# Dungeon crawler dataset (BERTopic): reviews whose tag string contains
# "dungeon crawler" (case-insensitive).
# na=False counts a missing tag string as "no match"; without it a NaN in
# "Tags" puts a NaN into the mask and the boolean selection raises.
# regex=False: the search term is a plain substring, as in the rogue-lite cell.
Dataset_Bert_dungeon = Dataset_Bert[
    Dataset_Bert["Tags"].str.contains("dungeon crawler", case=False, na=False, regex=False)
]

# Printed explicitly - only the last expression of a cell is displayed.
print("Unique games:", Dataset_Bert_dungeon["App ID"].nunique())
Dataset_Bert_dungeon["review_score"].value_counts()

In [None]:
# Write every BERTopic subset to its own CSV file (row index not included).
Dataset_Bert_JRPG.to_csv("JRPG_final_Bert.csv", index=False)
Dataset_Bert_novel.to_csv("novel_final_Bert.csv", index=False)
Dataset_Bert_dungeon.to_csv("dungeon_final_Bert.csv", index=False)
# Fixed file name: it used to be "sample.csv_Bert", which put the suffix after
# the extension so the file was not recognised as a CSV. Anything that still
# reads the old name has to be pointed at "sample_Bert.csv".
Dataset_Bert_sample.to_csv("sample_Bert.csv", index=False)
Dataset_Bert_RL.to_csv("rogue_lite_final_Bert.csv", index=False)