In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
df = pd.read_csv("../data/output/processed_data.csv")

1. Ile znajduje się w zbiorze cech kategorycznych, a ile numerycznych?

In [3]:
categorical_features = df.select_dtypes(include=["object", "bool"]).columns
numerical_features = df.select_dtypes(include=["int64", "float64"]).columns

print(f"Liczba cech kategorycznych: {len(categorical_features)}")
print(f"Liczba cech numerycznych: {len(numerical_features)}")

Liczba cech kategorycznych: 11
Liczba cech numerycznych: 6


2. Czy zmienna wyjściowa jest kategoryczna, czy numeryczna?

In [None]:
output_variable_type = df["sentiment"].dtype
print(f"Typ zmiennej wyjściowej: {output_variable_type}")

Zmienna wyjściowa jest typu kategorycznego

3. Czy i ile w zbiorze jest brakujących wartości? Dla jakich zmiennych? Co z tego wynika?

In [4]:
missing_values = df.isnull().sum()
missing_values_count = missing_values.sum()
missing_values_per_feature = missing_values[missing_values > 0]

print(f"Liczba brakujących wartości: {missing_values_count}")
print("Brakujące wartości dla poszczególnych cech:")
print(missing_values_per_feature)

Liczba brakujących wartości: 42
Brakujące wartości dla poszczególnych cech:
reviewerName     9
reviewText      22
summary         11
dtype: int64


In [5]:
missing_values

overall                 0
vote                    0
image                   0
verified                0
reviewTime              0
reviewerID              0
asin                    0
reviewerName            9
reviewText             22
summary                11
unixReviewTime          0
category                0
sentiment               0
sentiment_numerical     0
textToSummaryRatio      0
reviewAge               0
hasImage                0
dtype: int64

Może to utrudnić dalszą analizę dodatkowo może to spowodować gorsze wyniki analizy.

4. Czy któreś z cech są skorelowane? Co z tego może wynikać?

In [None]:
correlation_matrix = df.corr()
print(correlation_matrix)

strong_correlation_threshold = 0.7

strong_correlations = correlation_matrix[
    abs(correlation_matrix) > strong_correlation_threshold
]
print(strong_correlations)

5. Czy któraś z cech koreluje ze zmienną wyjściową? Jeśli tak - która? Czy któraś nie koreluje?

In [None]:
df["reviewText"] = df["reviewText"].fillna("")

df["wordCount"] = df["reviewText"].apply(lambda x: len(x.split()))
df["uniqueWordRatio"] = df["reviewText"].apply(
    lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0
)
df["exclamationCount"] = df["reviewText"].apply(lambda x: x.count("!"))
df["questionCount"] = df["reviewText"].apply(lambda x: x.count("?"))
df["spaceCount"] = df["reviewText"].apply(lambda x: x.count(" "))
df["averageWordLength"] = df["reviewText"].apply(
    lambda x: np.mean([len(word) for word in x.split()]) if len(x.split()) > 0 else 0
)

capital_letters_count = sum(1 for c in df["reviewText"] if c.isupper())
total_letters_count = sum(1 for c in df["reviewText"] if c.isalpha())

df["capitalLetterRatio"] = (
    capital_letters_count / total_letters_count if total_letters_count > 0 else 0
)

correlation_matrix = df.corr()
correlations_with_output = correlation_matrix["sentiment_numerical"]
print(correlations_with_output)

Nic nie koreluje ze sobą. Jest overall ale to dlatego że wartość sentiment_numerical bezpośrednio wychodzi z wartości overall

Głębsza analiza:

Częstość występowania słów kluczowych:

In [None]:
from collections import Counter
import nltk

nltk.download("stopwords")
nltk.download("punkt")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("english"))


def get_words(val):
    reviews = df[df["sentiment_numerical"] == val]

    word_freq = Counter()

    for review_text in reviews["reviewText"]:
        words = word_tokenize(review_text.lower())

        filtered_words = [
            word for word in words if word.isalnum() and word not in stop_words
        ]

        word_freq.update(filtered_words)

    return word_freq


negative_words = get_words(-1)
neutral_words = get_words(0)
positive_words = get_words(1)

word_count = df["reviewText"].apply(lambda x: len(word_tokenize(x.lower())))
df["negative_word_freq"] = df["reviewText"].apply(
    lambda x: sum(
        [
            negative_words[word]
            for word in word_tokenize(x.lower())
            if word.isalnum() and word not in stop_words
        ]
    )
)
df["neutral_word_freq"] = df["reviewText"].apply(
    lambda x: sum(
        [
            neutral_words[word]
            for word in word_tokenize(x.lower())
            if word.isalnum() and word not in stop_words
        ]
    )
)
df["positive_word_freq"] = df["reviewText"].apply(
    lambda x: sum(
        [
            positive_words[word]
            for word in word_tokenize(x.lower())
            if word.isalnum() and word not in stop_words
        ]
    )
)

In [None]:
def safe_divide(a, b):
    return a / b if b != 0 else 0


df["negative_word_freq_normalized"] = df.apply(
    lambda x: safe_divide(x["negative_word_freq"], x["wordCount"]), axis=1
)
df["neutral_word_freq_normalized"] = df.apply(
    lambda x: safe_divide(x["neutral_word_freq"], x["wordCount"]), axis=1
)
df["positive_word_freq_normalized"] = df.apply(
    lambda x: safe_divide(x["positive_word_freq"], x["wordCount"]), axis=1
)

df = df.drop(["negative_word_freq", "neutral_word_freq", "positive_word_freq"], axis=1)

In [None]:
negative_word_freq = Counter(negative_words)
neutral_word_freq = Counter(neutral_words)
positive_word_freq = Counter(positive_words)

print(negative_word_freq)
print(neutral_word_freq)
print(positive_word_freq)

In [None]:
correlation_matrix = df.corr()
correlations_with_output = correlation_matrix["sentiment_numerical"]
print(correlations_with_output)

Analiza N-gramów:

In [None]:
from nltk import ngrams


def generate_ngrams(text, n):
    words = word_tokenize(text.lower())
    return list(ngrams(words, n))


def get_ngram_words(val, n):
    reviews = df[df["sentiment_numerical"] == val]
    ngram_freq = Counter()

    for ngrams in reviews[f"{n}-grams"]:
        ngram_freq.update(ngrams)

    return ngram_freq

In [None]:
df["2-grams"] = df["reviewText"].apply(lambda x: generate_ngrams(x, 2))

negative_bigrams = get_ngram_words(-1, 2)
neutral_bigrams = get_ngram_words(0, 2)
positive_bigrams = get_ngram_words(1, 2)

df["negative_bigram_freq"] = df["2-grams"].apply(
    lambda x: sum([negative_bigrams[bigram] for bigram in x])
)
df["neutral_bigram_freq"] = df["2-grams"].apply(
    lambda x: sum([neutral_bigrams[bigram] for bigram in x])
)
df["positive_bigram_freq"] = df["2-grams"].apply(
    lambda x: sum([positive_bigrams[bigram] for bigram in x])
)

In [None]:
df["3-grams"] = df["reviewText"].apply(lambda x: generate_ngrams(x, 3))

negative_trigrams = get_ngram_words(-1, 3)
neutral_trigrams = get_ngram_words(0, 3)
positive_trigrams = get_ngram_words(1, 3)

df["negative_trigram_freq"] = df["3-grams"].apply(
    lambda x: sum([negative_trigrams[trigrams] for trigrams in x])
)
df["neutral_trigram_freq"] = df["3-grams"].apply(
    lambda x: sum([neutral_trigrams[trigrams] for trigrams in x])
)
df["positive_trigram_freq"] = df["3-grams"].apply(
    lambda x: sum([positive_trigrams[trigrams] for trigrams in x])
)

In [None]:
df = df.drop(["2-grams", "3-grams"], axis=1)

correlation_matrix = df.corr()
correlations_with_output = correlation_matrix["sentiment_numerical"]
print(correlations_with_output)

Analiza składniowa:

In [None]:
import spacy

spacy.cli.download("en_core_web_sm")

nlp = spacy.load("en_core_web_sm")


def count_noun_phrases(text):
    doc = nlp(text)
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    return len(noun_phrases)


def count_adjectives(text):
    doc = nlp(text)
    adjectives = [token.text for token in doc if token.pos_ == "ADJ"]
    return len(adjectives)


def count_adj_noun_relations(text):
    doc = nlp(text)
    count = 0
    for token in doc:
        if token.pos_ == "ADJ" and token.head.pos_ == "NOUN":
            count += 1
    return count

In [None]:
# df['noun_phrase_count'] = df['reviewText'].apply(count_noun_phrases)
# df['adjective_count'] = df['reviewText'].apply(count_adjectives)
# df['adj_noun_relation_count'] = df['reviewText'].apply(count_adj_noun_relations)

"""
40 min for results:

overall                          0.944986
verified                         0.212653
unixReviewTime                   0.154403
sentiment_numerical              1.000000
textToSummaryRatio              -0.103077
reviewAge                       -0.154403
hasImage                         0.081214
wordCount                       -0.093744
uniqueWordRatio                  0.154451
exclamationCount                -0.053314
questionCount                   -0.120817
spaceCount                      -0.094012
averageWordLength                0.053776
capitalLetterRatio                    NaN
negative_word_freq_normalized   -0.025649
neutral_word_freq_normalized     0.036695
positive_word_freq_normalized    0.143411
negative_bigram_freq            -0.116719
neutral_bigram_freq             -0.124113
positive_bigram_freq            -0.065687
negative_trigram_freq           -0.131534
neutral_trigram_freq            -0.159608
positive_trigram_freq            0.082245
noun_phrase_count               -0.091599
adjective_count                 -0.073501
adj_noun_relation_count         -0.066093
Name: sentiment_numerical, dtype: float64
"""

In [None]:
# correlation_matrix = df.corr()
# correlations_with_output = correlation_matrix['sentiment_numerical']
# print(correlations_with_output)

overall                          0.944986 \
verified                         0.212653 \
unixReviewTime                   0.154403 \
sentiment_numerical              1.000000 \
textToSummaryRatio              -0.103077 \
reviewAge                       -0.154403 \
hasImage                         0.081214 \
wordCount                       -0.093744 \
uniqueWordRatio                  0.154451 \
exclamationCount                -0.053314 \
questionCount                   -0.120817 \
spaceCount                      -0.094012 \
averageWordLength                0.053776 \
capitalLetterRatio                    NaN \
negative_word_freq_normalized   -0.025649 \
neutral_word_freq_normalized     0.036695 \
positive_word_freq_normalized    0.143411 \
negative_bigram_freq            -0.116719 \
neutral_bigram_freq             -0.124113 \
positive_bigram_freq            -0.065687 \
negative_trigram_freq           -0.131534 \
neutral_trigram_freq            -0.159608 \
positive_trigram_freq            0.082245 \
noun_phrase_count               -0.091599 \
adjective_count                 -0.073501 \
adj_noun_relation_count         -0.066093 \
Name: sentiment_numerical, dtype: float64

Podsumowując korzystając z statystyki nie udało się uzyskać wyników które wskazywały by wpływ danej cechy na wartość sentymentu\

Zalecane jest użycie uczenia maszynowego w kolejnym etapie