In [220]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import re
from sklearn.metrics import accuracy_score, classification_report

In [221]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\milos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\milos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [222]:
train_tweets = pd.read_csv("twitter_training.csv", quotechar='"', names=['tweet_id', 'theme', 'sentiment', 'text'])
test_tweets = pd.read_csv("twitter_validation.csv", quotechar='"', names=['tweet_id', 'theme', 'sentiment', 'text'])
train_tweets

Unnamed: 0,tweet_id,theme,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [223]:
train_tweets['sentiment'].value_counts()

Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: sentiment, dtype: int64

In [224]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

In [225]:
def preprocess_text(text):
    text = str(text).lower()
    text = ' '.join([x for x in text.split(' ') if x not in stop_words])
    text = ' '.join(lemmatizer.lemmatize(x) for x in text.split(' '))
    text = re.sub(r'[^a-zA-Z ]', '', text)
    return text 

In [226]:
train_tweets['text'] = train_tweets['text'].apply(lambda x: preprocess_text(x))
test_tweets['text'] = test_tweets['text'].apply(lambda x: preprocess_text(x))

In [227]:
model = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', (MultinomialNB()))         
])

In [228]:
model.fit(train_tweets['text'].values.astype('U'), train_tweets['sentiment'])

In [229]:
predicted_sentiment = model.predict(test_tweets['text'])

In [230]:
def NaiveBayesSentimentTrain(data):
    count = {}
    data = data[['text', 'sentiment']]
    for i, x in data.iterrows():
        for word in x['text'].split(" "):
            if word.islower():
                if word not in count.keys():
                    count[word] = {"Positive":0, "Negative":0,"Neutral":0,"Irrelevant":0, "Total":0}
                count[word][x['sentiment']] += 1
                count[word]['Total'] += 1
    propabilities = {}
    for x in count:
        propabilities[x] = {"Positive":count[x]['Positive']/count[x]['Total'], "Negative":count[x]['Negative']/count[x]['Total'],"Neutral":count[x]['Neutral']/count[x]['Total'],"Irrelevant":count[x]['Irrelevant']/count[x]['Total']}
    return propabilities 

In [231]:
def NaiveBayesSentimentTest(data, train_data):
    data = data[['text', 'sentiment']]
    data['Positive'] = None
    data['Negative'] = None
    data['Neutral'] = None
    data['Irrelevant'] = None
    data['Guess'] = None
    for i, x in data.iterrows():
        x['Positive'] = 0
        x['Negative'] = 0
        x['Neutral'] = 0
        x['Irrelevant'] = 0
        for word in x['text'].split(" "):
            if word in train_data.keys():
                x['Positive'] += train_data[word]['Positive']
                x['Negative'] += train_data[word]['Negative']
                x['Neutral'] += train_data[word]['Neutral']
                x['Irrelevant'] += train_data[word]['Irrelevant']
        highest_value = max([x['Positive'], x['Negative'], x['Neutral'], x['Irrelevant']])
        if x['Positive'] >= highest_value:
            x['Guess'] = 'Positive'
        elif x['Negative'] >= highest_value:
            x['Guess'] = 'Negative'
        elif x['Neutral'] >= highest_value:
            x['Guess'] = 'Neutral'
        elif x['Irrelevant'] >= highest_value:
            x['Guess'] = 'Irrelevant'
    return data

In [232]:
train_data = NaiveBayesSentimentTrain(train_tweets)

In [233]:
test_results = NaiveBayesSentimentTest(test_tweets, train_data)
test_results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Positive'] = None


Unnamed: 0,text,sentiment,Positive,Negative,Neutral,Irrelevant,Guess
0,mentioned facebook struggling motivation go ru...,Irrelevant,3.458737,6.386487,4.718375,7.436401,Irrelevant
1,bbc news amazon bos jeff bezos reject claim c...,Neutral,2.131213,3.767757,7.163975,0.937055,Neutral
2,microsoft pay word function poorly samsungus c...,Negative,0.949566,3.416201,1.900838,0.733395,Negative
3,csgo matchmaking full closet hacking truly awf...,Negative,1.249302,4.35608,1.076956,1.317662,Negative
4,president slapping american face really commit...,Neutral,1.810249,2.339827,5.959513,1.890411,Neutral
...,...,...,...,...,...,...,...
995,toronto art culture capital canada its wonder...,Irrelevant,4.600595,6.104712,5.871897,8.422796,Irrelevant
996,actually good move tot bring viewersi one peop...,Irrelevant,7.04396,6.368252,4.354924,7.232863,Irrelevant
997,today sucked its time drink wine n play border...,Positive,4.290749,4.093927,3.25179,2.363535,Positive
998,bought fraction microsoft today small wins,Positive,2.389618,1.61361,1.175294,0.821477,Positive


Nasz model:

In [234]:
accuracy_score(test_results['sentiment'], test_results['Guess'])

0.802

In [235]:
print(classification_report(test_results['sentiment'], test_results['Guess']))

              precision    recall  f1-score   support

  Irrelevant       0.95      0.60      0.74       172
    Negative       0.66      0.95      0.78       266
     Neutral       0.93      0.74      0.83       285
    Positive       0.83      0.84      0.84       277

    accuracy                           0.80      1000
   macro avg       0.84      0.79      0.80      1000
weighted avg       0.84      0.80      0.80      1000



Model z biblioteki:

In [236]:
accuracy_score(test_tweets['sentiment'], predicted_sentiment)

0.804

In [237]:
print(classification_report(test_tweets['sentiment'], predicted_sentiment))

              precision    recall  f1-score   support

  Irrelevant       0.99      0.59      0.74       172
    Negative       0.70      0.94      0.80       266
     Neutral       0.94      0.70      0.80       285
    Positive       0.77      0.91      0.83       277

    accuracy                           0.80      1000
   macro avg       0.85      0.79      0.80      1000
weighted avg       0.84      0.80      0.80      1000

