In [10]:
import pandas as pd

df = pd.read_csv('./news_data.csv', encoding='ISO-8859-1', header=None, names=['sentiment', 'text'])

df

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [13]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if not word in stop_words]
    porter = PorterStemmer()
    stemmed_words = [porter.stem(word) for word in words]
    preprocessed_text = ' '.join(stemmed_words)
    return preprocessed_text

df['text'] = df['text'].apply(preprocess_text)

df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,sentiment,text
0,neutral,accord gran compani plan move product russia a...
1,neutral,technopoli plan develop stage area less squar ...
2,negative,intern electron industri compani elcoteq laid ...
3,positive,new product plant compani would increas capac ...
4,positive,accord compani updat strategi year baswar targ...
...,...,...
4841,negative,london marketwatch share price end lower londo...
4842,neutral,rinkuskiai beer sale fell per cent million lit...
4843,negative,oper profit fell eur mn eur mn includ vessel s...
4844,negative,net sale paper segment decreas eur mn second q...


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.4, random_state=42)

tfidf = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf.fit_transform(X_train).toarray()

X_test_tfidf = tfidf.transform(X_test).toarray()

In [26]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report

nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

nb_pred = nb.predict(X_test_tfidf)

gnb = GaussianNB()
gnb.fit(X_train_tfidf, y_train)

gnb_pred = gnb.predict(X_test_tfidf)

print('Multinomial Naive Bayes report:')
print(classification_report(y_test, nb_pred))

print('Gaussian Naive Bayes report:')
print(classification_report(y_test, gnb_pred))

Multinomial Naive Bayes report:
              precision    recall  f1-score   support

    negative       0.82      0.06      0.11       231
     neutral       0.68      0.97      0.80      1149
    positive       0.68      0.34      0.45       559

    accuracy                           0.68      1939
   macro avg       0.73      0.46      0.45      1939
weighted avg       0.70      0.68      0.62      1939

Gaussian Naive Bayes report:
              precision    recall  f1-score   support

    negative       0.31      0.50      0.39       231
     neutral       0.70      0.56      0.62      1149
    positive       0.36      0.42      0.39       559

    accuracy                           0.51      1939
   macro avg       0.46      0.50      0.47      1939
weighted avg       0.56      0.51      0.53      1939



In [32]:
import joblib

joblib.dump(nb, 'nb.pkl')

['nb.pkl']

In [None]:
# Test using unseen data

import pandas as pd

new_data = pd.read_csv('question12.csv', encoding='ISO-8859-1', header=None, names=['sentiment', 'text'])

new_data['text'] = new_data['text'].apply(preprocess_text)

new_data_features = tfidf.transform(new_data)

predictions = nb.predict(new_data_features)

print('Predictions:')
print(predictions)