In [None]:
import pandas as pd
import os

# Location of the healthcare-tweets sentiment CSV inside the Kaggle input mount.
file_path = '/kaggle/input/datasett/_Tweetsinhealthcareofsentimentanalysis_Sentiment-Analysis.csv'

# The file is decoded as latin-1, as requested through the encoding argument.
data = pd.read_csv(file_path, encoding='latin-1')

# Preview the leading rows as a quick check of the load.
print("First few rows of the dataset:", data.head(), sep="\n")

# Tally how many rows carry each value of the 'Sentiment' column.
sentiment_distribution = data['Sentiment'].value_counts()
print("\nSentiment Distribution:", sentiment_distribution, sep="\n")

# Dataset source: https://www.kaggle.com/datasets/gunjansanjaykadam/datasett/data

In [None]:
# Show the column labels of the loaded frame; later cells index it by
# 'tweetsORG' and 'Sentiment', so this is a quick check that both exist.
print(data.columns)


In [None]:
import re
from collections import Counter
import matplotlib.pyplot as plt


# Extract every hashtag (without the leading '#') from each tweet.
# str(x) guards against non-string cells such as NaN.
data['hashtags'] = data['tweetsORG'].apply(lambda x: re.findall(r"#(\w+)", str(x)))

# Flatten the per-tweet lists into one list so the tags can be counted globally.
all_hashtags = [hashtag for hashtags_list in data['hashtags'] for hashtag in hashtags_list]

hashtag_counts = Counter(all_hashtags)
most_common_hashtags = hashtag_counts.most_common(10)

print("\nTop 10 Most Common Hashtags:")
print(most_common_hashtags)

# zip(*[]) yields nothing, so unpacking into two names would raise ValueError
# when the corpus contains no hashtags at all; only plot when there is data.
if most_common_hashtags:
    hashtags, counts = zip(*most_common_hashtags)
    plt.figure(figsize=(10, 5))
    plt.barh(hashtags, counts, color='purple')
    plt.xlabel("Count")
    plt.title("Top 10 Most Common Hashtags")
    # barh draws the first entry at the bottom; invert so the most common tag is on top.
    plt.gca().invert_yaxis()
    plt.show()
else:
    print("No hashtags found; skipping the hashtag chart.")


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Concatenate all tweets of each sentiment into one space-separated string;
# astype(str) makes every cell joinable.
positive_tweets = ' '.join(data.loc[data['Sentiment'] == 'Positive', 'tweetsORG'].astype(str))
negative_tweets = ' '.join(data.loc[data['Sentiment'] == 'Negative', 'tweetsORG'].astype(str))

# Both clouds share the same canvas settings.
cloud_options = dict(width=800, height=400, background_color='white')
positive_wordcloud = WordCloud(**cloud_options).generate(positive_tweets)
negative_wordcloud = WordCloud(**cloud_options).generate(negative_tweets)

# Draw the two clouds side by side in a 1x2 grid.
plt.figure(figsize=(16, 8))
cloud_panels = (
    ("Positive Sentiment Word Cloud", positive_wordcloud),
    ("Negative Sentiment Word Cloud", negative_wordcloud),
)
for panel_position, (panel_title, cloud_image) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, panel_position)
    plt.title(panel_title)
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis("off")

plt.show()


In [None]:
# Character count of every tweet, measured on its string representation.
tweet_text = data['tweetsORG'].astype(str)
data['tweet_length'] = tweet_text.map(len)

# Mean character count within each sentiment group.
length_by_sentiment = data.groupby('Sentiment')['tweet_length'].mean()

print("\nAverage Tweet Length by Sentiment:", length_by_sentiment, sep="\n")


In [None]:
from collections import Counter

# Case-insensitive token tallies; tokens are whatever str.split() yields on whitespace.
positive_tokens = positive_tweets.lower().split()
negative_tokens = negative_tweets.lower().split()
positive_word_counts = Counter(positive_tokens)
negative_word_counts = Counter(negative_tokens)

# Report the ten most frequent tokens for each sentiment.
for heading, word_counter in (
    ("\nMost Frequent Words in Positive Tweets:", positive_word_counts),
    ("\nMost Frequent Words in Negative Tweets:", negative_word_counts),
):
    print(heading)
    print(word_counter.most_common(10))


In [None]:
import matplotlib.pyplot as plt

# Recompute the per-label tweet counts so this cell does not rely on an earlier one.
sentiment_distribution = data['Sentiment'].value_counts()

# One bar per sentiment label, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
sentiment_distribution.plot(kind='bar', ax=ax, color=['skyblue', 'lightgreen', 'salmon'])
ax.set_title("Sentiment Distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Number of Tweets")
plt.xticks(rotation=45)
plt.show()


In [None]:
import seaborn as sns

# Distribution of tweet character counts, one box per sentiment label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='Sentiment', y='tweet_length', palette="Set2", ax=ax)
ax.set_title("Tweet Length by Sentiment")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Tweet Length (Character Count)")
plt.show()


In [None]:
import matplotlib.pyplot as plt
# Tweet character count plotted against the frame's row index.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(data.index, data['tweet_length'])
ax.set_title('Tweet Length Over Time')
ax.set_xlabel('Tweet Index')
ax.set_ylabel('Tweet Length')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Features are the original tweets; targets are the sentiment labels.
X, y = data['tweetsORG'], data['Sentiment']

# 80/20 split with a fixed random_state so repeated runs produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for size_label, subset in (
    ("Training data size:", X_train),
    ("Testing data size:", X_test),
):
    print(size_label, len(subset))

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the NLTK resources used by the preprocessing step.
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# make word_tokenize look up the 'punkt_tab' tables; without it tokenization
# fails with a LookupError on those versions.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# English stop words as a set for O(1) membership tests during filtering.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def preprocess_text(text):
    """Normalize one tweet into a space-joined string of stemmed tokens.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric or that appear in ``stop_words`` are dropped,
    and each remaining token is stemmed with ``stemmer``.

    Args:
        text: Raw tweet. Non-string values are converted with ``str()``;
            ``None`` and float NaN (missing cells) yield an empty string.

    Returns:
        The processed tokens joined by single spaces ("" for missing input).
    """
    # A missing cell has no .lower(); NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    tokens = word_tokenize(str(text).lower())
    return " ".join(
        stemmer.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )

# Run the text normalizer over every tweet and store the result next to the original.
data['processed_tweets'] = [preprocess_text(tweet) for tweet in data['tweetsORG']]

# Show original and processed text side by side for the first rows.
preview_columns = ['tweetsORG', 'processed_tweets']
print(data[preview_columns].head())

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Inputs are the processed tweets; targets are the sentiment labels encoded
# as integer class ids through the mapping below.
label_to_id = {'Positive': 0, 'Negative': 1, 'Neutral': 2}
X = data['processed_tweets']
y = data['Sentiment'].map(label_to_id)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One vocabulary cap shared by the tokenizer and the embedding's input dimension.
vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size)
# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(X_train)

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to the same length so they stack into one array.
max_sequence_length = 100
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_sequence_length)
    for sequences in (X_train_seq, X_test_seq)
)

# Embedding -> LSTM -> 3-way softmax, one output unit per sentiment class id.
model = Sequential()
for layer in (
    Embedding(vocab_size, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),
):
    model.add(layer)

# The sparse loss variant takes the integer class ids directly.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
epochs = 5
batch_size = 32

# Fit on the padded training sequences; validation_split=0.1 holds part of
# them out for per-epoch validation.
history = model.fit(
    X_train_pad,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

# evaluate returns the loss followed by the compiled metrics.
test_metrics = model.evaluate(X_test_pad, y_test)
loss, accuracy = test_metrics
print('Test accuracy:', accuracy)

In [None]:
import matplotlib.pyplot as plt

# Training curves: one panel per metric recorded in `history`.
# The second curve in each panel is the `val_*` series that Keras computes on
# the validation_split hold-out, so it is labelled 'Validation' -- it is not
# the test set used by model.evaluate.
plt.figure(figsize=(12, 4))
for panel_position, (metric_key, axis_label) in enumerate(
    (('accuracy', 'Accuracy'), ('loss', 'Loss')), start=1
):
    plt.subplot(1, 2, panel_position)
    plt.plot(history.history[metric_key])
    plt.plot(history.history[f'val_{metric_key}'])
    plt.title(f'Model {metric_key}')
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

plt.show()

In [None]:
# Derive hard class predictions from the trained model: predict returns one
# row of softmax scores per test tweet, and argmax picks the highest-scoring
# class id. Without this assignment `y_pred` is undefined on a fresh kernel.
y_pred = model.predict(X_test_pad).argmax(axis=1)

# Cast both label vectors to strings so they share one type when compared.
y_test = y_test.astype(str)
y_pred = y_pred.astype(str)


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Use the sorted union of true and predicted labels for both the matrix and
# the tick labels. Taking ticks from y_test alone would mislabel the axes
# whenever the model predicts a class that is absent from y_test.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(8, 6))
# Rows are true labels and columns are predicted labels, matching the axis titles below.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


