In [86]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from sklearn.metrics import accuracy_score, classification_report

In [87]:
# Fetch the NLTK corpora this notebook reads through `stopwords` and
# `WordNetLemmatizer`.
for nltk_resource in ('stopwords', 'wordnet'):
    # quiet=True suppresses the progress log, which otherwise writes the local
    # nltk_data path into the saved notebook output.
    # nltk.download returns False when the fetch fails; stop here instead of
    # hitting a LookupError later when the corpus is first used.
    if not nltk.download(nltk_resource, quiet=True):
        raise RuntimeError(f"Could not download NLTK resource '{nltk_resource}'")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\milos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\milos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [88]:
# Both CSV files are read with the same explicit column names, training file
# first and validation file second.
tweet_columns = ['tweet_id', 'theme', 'sentiment', 'text']
train_tweets, test_tweets = (
    pd.read_csv(csv_path, quotechar='"', names=tweet_columns)
    for csv_path in ("twitter_training.csv", "twitter_validation.csv")
)
train_tweets

Unnamed: 0,tweet_id,theme,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [89]:
# Number of training rows per sentiment label.
sentiment_counts = train_tweets['sentiment'].value_counts()
sentiment_counts

Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: sentiment, dtype: int64

In [90]:
# Shared lemmatizer instance used by preprocess_text.
lemmatizer = WordNetLemmatizer()
# Stored as a set: preprocess_text checks every token of every tweet against
# this collection, and set membership is O(1) where a list scan is O(n).
stop_words = set(stopwords.words('english'))

In [91]:
def preprocess_text(text):
    """Lower-case a tweet, drop stop words and lemmatize the remaining tokens.

    Relies on the module-level `stop_words` collection and `lemmatizer` object.

    Parameters
    ----------
    text : object
        Raw tweet text. Non-string values are converted with ``str()``;
        ``None`` and float NaN are treated as an empty document.

    Returns
    -------
    str
        The kept tokens, lemmatized and joined by single spaces. Empty string
        when the input is missing or contains only whitespace.
    """
    # str() would turn a missing value into the literal word "none"/"nan",
    # which the vectorizer would then learn as a real token.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # split() with no argument breaks on any whitespace run, so tabs and
    # newlines no longer stay attached to tokens and hide stop words.
    tokens = str(text).lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

In [92]:
# Clean the text column of both frames with the same function; the cleaned
# text overwrites the raw column.
for tweets in (train_tweets, test_tweets):
    tweets['text'] = tweets['text'].apply(preprocess_text)

In [93]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
# NOTE(review): the first step is labelled "tri_grams", but TfidfVectorizer()
# is constructed without an ngram_range argument here — confirm whether
# n-gram features were intended.
pipeline_steps = [
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', MultinomialNB()),
]
model = Pipeline(steps=pipeline_steps)

In [94]:
# preprocess_text always returns str, so the text column can be passed as-is,
# the same way the predict call does. The former `.values.astype('U')` built a
# fixed-width unicode array padded to the longest tweet, which only cost memory.
model.fit(train_tweets['text'], train_tweets['sentiment'])

In [95]:
# Predict a sentiment label for each row of the validation frame.
validation_texts = test_tweets['text']
predicted_sentiment = model.predict(validation_texts)

In [96]:
# Fraction of validation rows whose predicted label equals the recorded one.
accuracy_score(
    y_true=test_tweets['sentiment'],
    y_pred=predicted_sentiment,
)

0.816

In [97]:
# Per-class precision / recall / F1 on the validation frame. The report is a
# multi-line string, so it is printed rather than shown as a repr.
validation_report = classification_report(
    y_true=test_tweets['sentiment'],
    y_pred=predicted_sentiment,
)
print(validation_report)

              precision    recall  f1-score   support

  Irrelevant       0.98      0.63      0.77       172
    Negative       0.72      0.95      0.82       266
     Neutral       0.92      0.73      0.81       285
    Positive       0.79      0.89      0.84       277

    accuracy                           0.82      1000
   macro avg       0.85      0.80      0.81      1000
weighted avg       0.84      0.82      0.81      1000

