<a href="https://colab.research.google.com/github/Sam-Wadmare/ML-LAB/blob/main/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples

# Load positive and negative tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

print(f"Number of positive tweets: {len(positive_tweets)}")
print(f"Number of negative tweets: {len(negative_tweets)}")

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


Number of positive tweets: 5000
Number of negative tweets: 5000


In [5]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Added to download the missing resource


# Initialize stemmer and stop words
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove user mentions
    text = re.sub(r'@\w+', '', text)
    # Remove special characters and punctuation
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words and stem
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(processed_tokens)

# Apply preprocessing to positive and negative tweets
processed_positive_tweets = [preprocess_text(tweet) for tweet in positive_tweets]
processed_negative_tweets = [preprocess_text(tweet) for tweet in negative_tweets]

print("Preprocessing complete.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Preprocessing complete.


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Combine positive and negative tweets with labels
all_tweets = processed_positive_tweets + processed_negative_tweets
labels = [1] * len(processed_positive_tweets) + [0] * len(processed_negative_tweets) # 1 for positive, 0 for negative

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_tweets, labels, test_size=0.2, random_state=42)

# Convert text data to numerical feature vectors using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000) # Limit features to top 5000
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Multinomial Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.7475
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.78      0.75       988
           1       0.77      0.72      0.74      1012

    accuracy                           0.75      2000
   macro avg       0.75      0.75      0.75      2000
weighted avg       0.75      0.75      0.75      2000



In [7]:
import pandas as pd

# Create a DataFrame to display sample predictions
sample_data = {'Tweet': [X_test[i] for i in range(10)],
               'Actual Sentiment': [y_test[i] for i in range(10)],
               'Predicted Sentiment': [y_pred[i] for i in range(10)]}
sample_df = pd.DataFrame(sample_data)

# Map numerical labels to sentiment words for better readability
sample_df['Actual Sentiment'] = sample_df['Actual Sentiment'].map({1: 'Positive', 0: 'Negative'})
sample_df['Predicted Sentiment'] = sample_df['Predicted Sentiment'].map({1: 'Positive', 0: 'Negative'})

print("Sample Predictions:")
display(sample_df)

Sample Predictions:


Unnamed: 0,Tweet,Actual Sentiment,Predicted Sentiment
0,love feel emm think,Negative,Negative
1,thank guy,Positive,Positive
2,love lord better life lt 3,Positive,Positive
3,yeah better use offici account like,Positive,Positive
4,ok good night wish troy ugli met today ok toda...,Positive,Positive
5,surpris sound hellish would thing,Negative,Positive
6,dri hot scorch summer ff,Positive,Positive
7,sad pray,Negative,Negative
8,popol day,Negative,Positive
9,song week ducktail surreal exposur sotw jingli...,Positive,Positive
