In [27]:
#General libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Sentiment analysis libraries
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\matia\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\matia\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\matia\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\matia\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\matia\AppData\Roaming\nltk_data...
[

True

In [28]:
main_data = pd.read_csv("data/steam.csv")

In [29]:
sentiment_data = pd.read_csv("data/steam_reviews.csv")
sentiment_data.head(5)

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1


In [30]:
#Cuanto na
print(sentiment_data.isna().sum())
sentiment_data = sentiment_data.dropna()

app_id               0
app_name        183234
review_text       7305
review_score         0
review_votes         0
dtype: int64


In [31]:
#Drop the columns that are not needed
sentiment_data = sentiment_data.drop(columns=['app_name', 'review_score', 'review_votes'])
sentiment_data.head(5) 


Unnamed: 0,app_id,review_text
0,10,Ruined my life.
1,10,This will be more of a ''my experience with th...
2,10,This game saved my virginity.
3,10,• Do you like original games? • Do you like ga...
4,10,"Easy to learn, hard to master."


In [32]:
#Preprocess the data for sentiment analysis

#Definir el lemmatizer
lemmatizer = WordNetLemmatizer()

#Definir la lista de stopwords
stop_words = set(stopwords.words('english'))

#Definir la función de tokenización
def tokenize(text):
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words]
    words = [word for word in words if word.isalnum()]
    words = [word for word in words if not word in stop_words]
    return words

#Aplicar la tokenización a la columna question_text
sentiment_data["review_text"] = sentiment_data["review_text"].apply(tokenize)

sentiment_data["review_text"]

0                                             [Ruined, life]
1          [This, experience, game, type, review, saying,...
2                             [This, game, saved, virginity]
3          [Do, like, original, game, Do, like, game, lag...
4                                [Easy, learn, hard, master]
                                 ...                        
6417101    [I, really, ove, game, need, somethings, It, b...
6417102    [Used, play, Puzzel, Pirates, back, wa, Steam,...
6417103    [This, game, wa, aright, though, bit, annoying...
6417104    [I, nice, review, recommend, game, know, purch...
6417105    [The, puzzle, game, fun, pay, basically, anyth...
Name: review_text, Length: 6226728, dtype: object

In [33]:
#Copy the data as a backup
sentiment_data_copy = sentiment_data.copy()

In [35]:
#Create a new column with the sentiment score of each review
sid = SentimentIntensityAnalyzer()
sentiment_data_copy["sentiment_score"] = sentiment_data_copy["review_text"].apply(lambda x: sid.polarity_scores(" ".join(x))["compound"])
sentiment_data_copy

Unnamed: 0,app_id,review_text,sentiment_score
0,10,"[Ruined, life]",-0.4767
1,10,"[This, experience, game, type, review, saying,...",0.9953
2,10,"[This, game, saved, virginity]",0.4215
3,10,"[Do, like, original, game, Do, like, game, lag...",0.7783
4,10,"[Easy, learn, hard, master]",0.3612
...,...,...,...
6417101,99910,"[I, really, ove, game, need, somethings, It, b...",0.7269
6417102,99910,"[Used, play, Puzzel, Pirates, back, wa, Steam,...",0.3670
6417103,99910,"[This, game, wa, aright, though, bit, annoying...",-0.2577
6417104,99910,"[I, nice, review, recommend, game, know, purch...",0.8555


In [36]:
#Drop sentiment_score column
sentiment_data_copy = sentiment_data_copy.drop(columns=["review_text"])

In [39]:
#Group the data by app_id and calculate the average sentiment score for each app
sentiment_data_grouped = sentiment_data_copy.groupby("app_id").mean()
sentiment_data_grouped

Unnamed: 0_level_0,sentiment_score
app_id,Unnamed: 1_level_1
10,0.372587
20,0.384973
30,0.369761
40,0.354543
50,0.443567
...,...
562600,0.508900
562700,0.613630
563180,0.770017
563400,0.463830


In [40]:
#Create a new csv file with the sentiment analysis data
sentiment_data_grouped.to_csv("data/sentiment_analysis.csv")