In [1]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# NLTK resources needed further down: tokenizer model, stop-word list,
# WordNet (for lemmatization) and the VADER sentiment lexicon.
for nltk_resource in ('punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Review dataset; the preview below shows the columns `reviewText` and `Positive`.
DATA_URL = 'https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv'
df = pd.read_csv(DATA_URL)
df.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


In [3]:
# Loop-invariant NLTK resources. They are created lazily on the first real call
# and then reused, instead of being rebuilt for every single review.
_STOP_WORDS = None
_LEMMATIZER = None


def preprocess_text(text):
    """Tokenize a review, drop English stop words and lemmatize the rest.

    Parameters
    ----------
    text : str or None
        Raw review text. ``None`` and float ``NaN`` (how pandas represents a
        missing cell) are treated as an empty review.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. Tokens keep their
        original casing (only the stop-word comparison is case-insensitive)
        and punctuation tokens are not removed.

    Notes
    -----
    NOTE(review): the English stop-word list presumably contains negations
    such as "not" / "no"; dropping them before a lexicon-based sentiment
    scorer may change the polarity of a review - confirm this is intended.
    """
    global _STOP_WORDS, _LEMMATIZER

    # Missing values would otherwise crash inside the tokenizer.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    # Tokenization
    tokens = word_tokenize(text)

    # Stop-word removal (case-insensitive match, original token is kept)
    filtered_tokens = [w for w in tokens if w.lower() not in _STOP_WORDS]

    # Lemmatization with the lemmatizer's default settings
    lemmatized_tokens = [_LEMMATIZER.lemmatize(w) for w in filtered_tokens]

    # Join the tokens back into a string
    return ' '.join(lemmatized_tokens)

# Clean every review element-wise. Note: this overwrites the raw `reviewText`
# column, so re-running this cell preprocesses already-preprocessed text.
cleaned_reviews = df['reviewText'].map(preprocess_text)
df['reviewText'] = cleaned_reviews
df.head()

Unnamed: 0,reviewText,Positive
0,one best apps acording bunch people agree bomb...,1
1,pretty good version game free . LOTS different...,1
2,really cool game . bunch level find golden egg...,1
3,"silly game frustrating , lot fun definitely re...",1
4,terrific game pad . Hrs fun . grandkids love ....,1


In [4]:
# Sentiment analysis using NLTK's VADER analyzer.
# One shared instance is created here; get_sentiment (defined below) reads it
# as a module-level global.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text, threshold=0.0):
    """Classify a text as positive (1) or negative (0) using VADER.

    Parameters
    ----------
    text : str
        Text to score; passed unchanged to ``analyzer.polarity_scores``.
    threshold : float, optional
        Minimum ``compound`` score for the text to count as positive.
        The default ``0.0`` reproduces the original hard-coded cut-off, which
        means a perfectly neutral text (compound == 0) is labelled positive.
        Pass a stricter value (e.g. ``0.05``) to require a clearer signal.

    Returns
    -------
    int
        ``1`` if ``compound >= threshold``, otherwise ``0``.
    """
    scores = analyzer.polarity_scores(text)
    return int(scores['compound'] >= threshold)

# Predicted label (1 = positive, 0 = negative) for every preprocessed review.
predicted_labels = df['reviewText'].map(get_sentiment)
df['sentiment'] = predicted_labels
df.head()

Unnamed: 0,reviewText,Positive,sentiment
0,one best apps acording bunch people agree bomb...,1,1
1,pretty good version game free . LOTS different...,1,1
2,really cool game . bunch level find golden egg...,1,1
3,"silly game frustrating , lot fun definitely re...",1,1
4,terrific game pad . Hrs fun . grandkids love ....,1,1


In [12]:
from sklearn.metrics import  confusion_matrix

In [10]:
# Model evaluation: per-class precision of the predicted labels.

y_true = df['Positive']
y_pred = df['sentiment']

# Correct predictions per class (true positives / true negatives),
# counted with vectorised boolean masks instead of Python-level loops.
pos_correct = int(((y_true == 1) & (y_pred == 1)).sum())
neg_correct = int(((y_true == 0) & (y_pred == 0)).sum())

# Precision denominators: how many rows were *predicted* as each class
# (not how many rows truly belong to it).
total_pos = int((y_pred == 1).sum())
total_neg = int((y_pred == 0).sum())

# Precision per class; fall back to 0 when a class was never predicted
# to avoid dividing by zero.
pos_precision = pos_correct / total_pos if total_pos else 0
neg_precision = neg_correct / total_neg if total_neg else 0

# Report the results
print("\nClass-wise Performance:")
print(f"Positive Precision: {pos_precision:.2%}")
print(f"Negative Precision: {neg_precision:.2%}")


Class-wise Performance:
Positive Precision: 83.99%
Negative Precision: 64.77%


In [13]:
# Rows are the true labels, columns are the predicted labels.
cm = confusion_matrix(y_true=df['Positive'], y_pred=df['sentiment'])
print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[ 2079  2688]
 [ 1131 14102]]
