# Sentiment Analysis with Text Processing

In [58]:
#Load necessary libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Download required NLTK data.
# 'punkt' backs word_tokenize on older NLTK releases; NLTK >= 3.9 loads the
# 'punkt_tab' resource instead, so both are fetched to keep a fresh
# "Restart & Run All" working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stopword lists used by stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [59]:
# Load the news dataset from a CSV file (path is relative to the working directory)
news_data = pd.read_csv("news_sentiment_analysis.csv")
# Preview the first rows; as the last expression it is rendered as the cell output
news_data.head()

Unnamed: 0,Source,Author,Title,Description,URL,Published At,Sentiment,Type
0,stgnews,Bridger Palmer,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",https://www.stgeorgeutah.com/news/archive/2024...,2024-07-12T23:45:25+00:00,positive,Business
1,Zimbabwe Mail,Staff Reporter,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",https://www.thezimbabwemail.com/business/busin...,2024-07-12T22:59:42+00:00,neutral,Business
2,4-traders,,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,https://www.marketscreener.com/business-leader...,2024-07-12T22:52:55+00:00,positive,Business
3,4-traders,,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,https://www.marketscreener.com/quote/stock/MCD...,2024-07-12T22:41:01+00:00,negative,Business
4,PLANET,,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,https://www.npr.org/2024/07/12/1197961036/roof...,2024-07-12T22:28:19+00:00,positive,Business


In [72]:
# Initialize tools
# English stopwords held in a set; preprocess_text tests membership against it
stop_words = set(stopwords.words('english'))
# Porter stemmer instance that preprocess_text applies to every kept token
stemmer = PorterStemmer()

# Function to clean and preprocess text
def preprocess_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Non-string input (for example a missing value) produces an empty string.
    Otherwise the text is tokenized with ``word_tokenize``; tokens that are
    not purely alphabetic, or whose lowercase form is in ``stop_words``, are
    dropped, and each remaining token is lowercased and stemmed.
    """
    # Anything that is not a string has no text to process
    if not isinstance(text, str):
        return ""

    kept = []
    for token in word_tokenize(text):
        # Skip punctuation, numbers and mixed alphanumeric tokens
        if not token.isalpha():
            continue
        lowered = token.lower()
        # Skip stopwords (compared on the lowercase form)
        if lowered in stop_words:
            continue
        kept.append(stemmer.stem(lowered))

    # Re-assemble the surviving stems into a single string
    return ' '.join(kept)

# Apply preprocessing to the dataset: stores the result of preprocess_text for
# each 'Description' value in a new 'Processed Description' column of news_data
news_data['Processed Description'] = news_data['Description'].apply(preprocess_text)

In [62]:
# Function to extract features from text
def extract_features(text):
    """Build a bag-of-words feature dict from a piece of text.

    Args:
        text: Text to tokenize with ``word_tokenize``.

    Returns:
        dict mapping each distinct token to ``True``. Non-string or blank
        input yields an empty dict instead of raising, consistent with how
        ``preprocess_text`` maps non-string input to an empty string.
    """
    # Guard before tokenizing: missing values (None/NaN) cannot be tokenized,
    # and blank text has no tokens to contribute.
    if not isinstance(text, str) or not text.strip():
        return {}
    return {token: True for token in word_tokenize(text)}

In [63]:
# Prepare training data: one (feature dict, sentiment label) pair per row
feature_sets = [
    (extract_features(processed_text), label)
    for processed_text, label in zip(news_data['Processed Description'], news_data['Sentiment'])
]

In [64]:
# Split the dataset into training and testing sets
# (test_size=0.2 holds out 20% of the pairs; random_state fixes the shuffle)
train_set, test_set = train_test_split(feature_sets, test_size=0.2, random_state=42)

In [65]:
# Train NLTK's NaiveBayesClassifier on the (feature dict, label) training pairs
classifier = NaiveBayesClassifier.train(train_set)

In [76]:
# Convert the test set to a DataFrame
df_test_set = pd.DataFrame(test_set, columns=['Features', 'Labels'])

# Pull the feature dicts and the labels back out as plain Python lists
test_features = df_test_set['Features'].to_list()
test_labels = df_test_set['Labels'].to_list()

# Predict a sentiment for every test example, in test-set order
y_pred = [classifier.classify(feature_dict) for feature_dict in test_features]

In [77]:
# Evaluate the model
# Overall accuracy, computed by NLTK directly on the (features, label) test pairs
print("Accuracy:", accuracy(classifier, test_set))
# Per-class report from scikit-learn, comparing test_labels with the predictions in y_pred
print("\nClassification Report:\n", classification_report(test_labels, y_pred))

Accuracy: 0.7428571428571429

Classification Report:
               precision    recall  f1-score   support

    negative       0.55      0.78      0.65       131
     neutral       0.78      0.54      0.64       147
    positive       0.82      0.80      0.81       422

    accuracy                           0.74       700
   macro avg       0.72      0.71      0.70       700
weighted avg       0.76      0.74      0.74       700



In [85]:
# Create a DataFrame for comparison
# NOTE: test_set holds (feature dict, label) pairs, so the 'Description'
# column contains each example's feature dict rather than the original text.
description = [features for features, _ in test_set]

comparison = pd.DataFrame({
    'Description': description,
    'Actual Sentiment': test_labels,
    'Predicted Sentiment': y_pred
})
# True where the prediction agrees with the ground-truth label
comparison['Match'] = comparison['Actual Sentiment'] == comparison['Predicted Sentiment']

# Print sample of comparison results.
# The sample size is capped so a test set with fewer than 10 rows does not
# raise, and the draw is seeded so re-running the cell shows the same rows.
print("\nSample of Actual vs Predicted:")
print(comparison.sample(min(10, len(comparison)), random_state=42))


Sample of Actual vs Predicted:
                                           Description Actual Sentiment  \
617  {'nobodi': True, 'want': True, 'bitten': True,...         positive   
290  {'vancouv': True, 'british': True, 'columbia':...         positive   
268  {'bend': True, 'ktvz': True, 'high': True, 'de...         positive   
61   {'allspr': True, 'global': True, 'invest': Tru...         positive   
96   {'pape': True, 'alé': True, 'niang': True, 'di...         positive   
667  {'depart': True, 'defens': True, 'select': Tru...         negative   
542  {'sikhumbuzo': True, 'moyo': True, 'smoyo': Tr...         negative   
355  {'samsung': True, 'frame': True, 'tv': True, '...         positive   
470  {'health': True, 'startup': True, 'prenuvo': T...         negative   
601  {'brunswick': True, 'resid': True, 'leann': Tr...         positive   

    Predicted Sentiment  Match  
617            negative  False  
290            positive   True  
268            positive   True  
61        

In [86]:
# Save comparison results to a new CSV file (index=False leaves the row index out)
comparison.to_csv('sentiment_comparison_results.csv', index=False)

In [88]:
# Read the saved CSV back and display it to confirm what was written
result = pd.read_csv('sentiment_comparison_results.csv')
result

Unnamed: 0,Description,Actual Sentiment,Predicted Sentiment,Match
0,"{'photo': True, 'former': True, 'presid': True...",positive,positive,True
1,"{'capt': True, 'timothi': True, 'fillmor': Tru...",positive,negative,False
2,"{'date': True, 'creat': True, 'titl': True, 't...",neutral,positive,False
3,"{'skip': True, 'bayless': True, 'leav': True, ...",positive,positive,True
4,"{'manitoba': True, 'govern': True, 'provid': T...",positive,positive,True
...,...,...,...,...
695,"{'yearli': True, 'survey': True, 'medic': True...",neutral,neutral,True
696,"{'delta': True, 'chang': True, 'uniform': True...",negative,negative,True
697,"{'connectm': True, 'common': True, 'stock': Tr...",negative,negative,True
698,"{'one': True, 'interest': True, 'question': Tr...",positive,positive,True
