In [1]:
# Se importan las librerías necesarias.

import nltk, seaborn as sns, pandas as pd, matplotlib.pyplot as plt, re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter

In [2]:
# Load the user reviews dataset from its parquet file.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
reviews_path = r'C:\Users\roylo\OneDrive\Documentos\Data Science\Proyectos Individuales\Machine Learning Operations\Datasets\user_reviews.parquet'
user_reviews = pd.read_parquet(reviews_path)

# Preview the first rows of the loaded frame.
user_reviews.head()

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,2011-11-05,1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,2011-07-15,22200,True,It's unique and worth a playthrough.
2,76561197970982479,2011-04-21,43110,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,2014-06-24,251610,True,I know what you think when you see this title ...
4,js41637,2013-09-08,227300,True,For a simple (it's actually not all that simpl...


Para el análisis de sentimientos se utilizará la clase "SentimentIntensityAnalyzer". Según su documentación, esta clase es más precisa con oraciones simples que con textos completos, por lo que cada review se procesará de la siguiente manera:

1. Se dividirá en oraciones; dicha acción se realizará con "nltk.tokenize.sent_tokenize".
2. Cada una de las oraciones se tokenizará, es decir, se eliminarán los signos de puntuación y las stopwords para obtener las palabras clave, que luego se unirán en una sola "str".
3. Con "SentimentIntensityAnalyzer().polarity_scores" se evalúa cada una de las oraciones y se etiqueta cada oración con la categoría ("pos", "neu" o "neg") de puntaje más alto (en este contexto no se tiene en cuenta "compound"), obteniendo n respuestas, siendo n el número de oraciones de la review.
4. Por cada review se hace un conteo de "pos", "neg" y "neu", y la review se clasifica con la categoría de conteo más alto.
En caso de empate:
    - pos-neu: Se considera positivo.
    - neg-neu: Se considera negativo.
    - pos-neg: Se considera neutro.

In [3]:
def Tokenize(s:str)->list:
    """Split a review into sentences and reduce each one to its keywords.

    The text is lower-cased and split into sentences with NLTK's
    ``sent_tokenize``. Each sentence is word-tokenized, every character that
    is not an ASCII letter is replaced by a space, the result is tokenized
    again, and English stop words are removed.

    Parameters
    ----------
    s : str
        Raw review text.

    Returns
    -------
    list of str
        One space-joined keyword string per sentence. A string may be empty
        when no keyword survives the filtering. ``pd.NA`` is returned instead
        of a list when ``s`` is not a string (e.g. a missing review).
    """
    if not isinstance(s, str):
        return pd.NA

    # A set gives O(1) membership tests in the keyword filter below.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    cleaned_sentences = []
    for sentence in nltk.tokenize.sent_tokenize(s.lower()):
        # Join the tokens with spaces instead of calling str() on the list:
        # the list repr escapes non-printable characters (a BOM becomes the
        # six characters \ufeff), and the letters of that escape would
        # survive the regex below as bogus tokens.
        spaced = " ".join(nltk.tokenize.word_tokenize(sentence))
        letters_only = re.sub('[^a-zA-Z]', " ", spaced)
        keywords = [
            word
            for word in nltk.tokenize.word_tokenize(letters_only)
            if word not in stop_words
        ]
        cleaned_sentences.append(" ".join(keywords))
    return cleaned_sentences
    
# Sanity check: tokenize one review (row label 2) to inspect the output format.
sample_review = user_reviews.loc[2, 'review']
Tokenize(sample_review)

['great atmosphere',
 'gunplay bit chunky times end day game definitely worth hope sequel buy game get sequel']

In [4]:
# Add the tokenized reviews as a new column, then drop every row that has a
# missing value in any column (dropna is called without a subset) and
# renumber the index from zero.
user_reviews = (
    user_reviews
    .assign(review_tokenized=user_reviews['review'].apply(Tokenize))
    .dropna()
    .reset_index(drop=True)
)

user_reviews.head()

Unnamed: 0,user_id,posted,item_id,recommend,review,review_tokenized
0,76561197970982479,2011-11-05,1250,True,Simple yet with great replayability. In my opi...,"[simple yet great replayability, opinion zombi..."
1,76561197970982479,2011-07-15,22200,True,It's unique and worth a playthrough.,[unique worth playthrough]
2,76561197970982479,2011-04-21,43110,True,Great atmosphere. The gunplay can be a bit chu...,"[great atmosphere, gunplay bit chunky times en..."
3,js41637,2014-06-24,251610,True,I know what you think when you see this title ...,[know think see title barbie dreamhouse party ...
4,js41637,2013-09-08,227300,True,For a simple (it's actually not all that simpl...,"[simple actually simple, truck driving simulat..."


In [5]:
def _get_sentiment_analyzer():
    """Return a shared VADER analyzer, creating it on first use."""
    analyzer = getattr(_get_sentiment_analyzer, '_instance', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        _get_sentiment_analyzer._instance = analyzer
    return analyzer


def GetSentiment(t: list) -> int:
    """Classify a tokenized review as positive, neutral or negative.

    Each sentence is scored with VADER's ``polarity_scores`` and labelled with
    whichever of 'pos', 'neu' or 'neg' scores highest ('compound' is ignored).
    The review takes the label that occurs most often among its sentences.

    Tie-breaking between the most frequent labels:
        - pos / neu        -> positive
        - neg / neu        -> negative
        - pos / neg        -> neutral
        - pos / neu / neg  -> neutral

    Parameters
    ----------
    t : list of str
        One keyword string per sentence of the review.

    Returns
    -------
    int
        2 for positive, 1 for neutral, 0 for negative. ``pd.NA`` is returned
        when ``t`` is missing (``None`` / ``pd.NA``) or has no sentences.
    """
    output = {'pos': 2, 'neu': 1, 'neg': 0}

    # Nothing to score: missing value or a review without sentences.
    if t is None or t is pd.NA or len(t) == 0:
        return pd.NA

    # Reuse one analyzer instead of building a new one for every review.
    sia = _get_sentiment_analyzer()
    sentiment_counts = Counter()

    for sentence in t:
        score = sia.polarity_scores(sentence)
        score.pop('compound', None)
        if any(score.values()):
            label = max(score, key=score.get)
        else:
            # All three scores are 0.0 (e.g. the sentence is empty after
            # stop-word removal). max() would return the first key, 'neg',
            # so treat the sentence as neutral instead.
            label = 'neu'
        sentiment_counts[label] += 1

    top_count = max(sentiment_counts.values())
    top_labels = {
        label for label, count in sentiment_counts.items() if count == top_count
    }

    if len(top_labels) == 1:
        return output[next(iter(top_labels))]
    if top_labels == {'pos', 'neu'}:
        return output['pos']
    if top_labels == {'neg', 'neu'}:
        return output['neg']
    # pos/neg tie, or a three-way tie: no side dominates, so neutral.
    return output['neu']

In [6]:
# Score every tokenized review, drop rows with a missing value in any column
# and renumber the index from zero.
user_reviews = (
    user_reviews
    .assign(sentiment_analysis=user_reviews['review_tokenized'].apply(GetSentiment))
    .dropna()
    .reset_index(drop=True)
)

# Move the text columns out of the frame; they stay available as Series.
review_tokenized = user_reviews.pop('review_tokenized')
review = user_reviews.pop('review')

user_reviews.head()

Unnamed: 0,user_id,posted,item_id,recommend,sentiment_analysis
0,76561197970982479,2011-11-05,1250,True,2
1,76561197970982479,2011-07-15,22200,True,1
2,76561197970982479,2011-04-21,43110,True,2
3,js41637,2014-06-24,251610,True,1
4,js41637,2013-09-08,227300,True,1


In [7]:
# Persist the result as a snappy-compressed parquet file.
# NOTE(review): this is the same path the notebook reads at the top, and the
# 'review' column has been removed by now, so the input file is overwritten
# and a second run from the top will not find user_reviews['review'].
output_path = r'C:\Users\roylo\OneDrive\Documentos\Data Science\Proyectos Individuales\Machine Learning Operations\Datasets\user_reviews.parquet'
user_reviews.to_parquet(output_path, compression='snappy')