In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Download NLTK stopwords
nltk.download('stopwords')

# Load the Twitter sentiment dataset
data = pd.read_csv('twitter_data.csv')

# Drop rows with missing values
data.dropna(inplace=True)

# Preprocess the data
stop_words = set(stopwords.words('english'))

# Remove stopwords and perform lowercasing
data['Cleaned_Text'] = data['clean_text'].apply(lambda x: ' '.join([word.lower() for word in str(x).split() if word.lower() not in stop_words]))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['Cleaned_Text'], data['category'], test_size=0.2, random_state=42)

# Convert text to numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the Naive Bayes classifier
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_vectorized, y_train)

# Predict on the test set
y_pred_NB = naive_bayes.predict(X_test_vectorized)

# Generate classification report
report_NB = classification_report(y_test, y_pred_NB)
print(report_NB)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

        -1.0       0.73      0.56      0.63      2011
         0.0       0.85      0.55      0.67      2941
         1.0       0.63      0.89      0.74      3668

    accuracy                           0.70      8620
   macro avg       0.74      0.67      0.68      8620
weighted avg       0.73      0.70      0.69      8620



In [2]:
data.head()

Unnamed: 0,clean_text,category,Cleaned_Text
0,when modi promised “minimum government maximum...,-1.0,modi promised “minimum government maximum gove...
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,1.0,say vote modi welcome bjp told rahul main camp...
3,asking his supporters prefix chowkidar their n...,1.0,asking supporters prefix chowkidar names modi ...
4,answer who among these the most powerful world...,1.0,answer among powerful world leader today trump...


In [3]:
'''from sklearn.tree import DecisionTreeClassifier
clf=DecisionTreeClassifier()
clf.fit(X_train_vectorized,y_train)
y_pred_DT=clf.predict(X_test_vectorized)
report_DT = classification_report(y_test, y_pred_DT)
print(report_DT)'''

'from sklearn.tree import DecisionTreeClassifier\nclf=DecisionTreeClassifier()\nclf.fit(X_train_vectorized,y_train)\ny_pred_DT=clf.predict(X_test_vectorized)\nreport_DT = classification_report(y_test, y_pred_DT)\nprint(report_DT)'

In [4]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [5]:
# Load the dataset
dataset = pd.read_csv('twitter_data.csv')
dataset['category'] = dataset['category'].map({-1: 0, 0: 1, 1: 2})
# Preprocess the dataset
sentences = dataset['clean_text'].astype(str).values  # Convert to string
labels = dataset['category'].values

# Split the dataset into training and testing sets
train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Tokenize the sentences into words
train_words = [sentence.lower().split() for sentence in train_sentences]
test_words = [sentence.lower().split() for sentence in test_sentences]

In [6]:
# Train Word2Vec model
w2v_model = Word2Vec(train_words, vector_size=100, window=5, min_count=1)

# Get the vocabulary size
vocab_size = len(w2v_model.wv.index_to_key)

# Convert words to indices
train_sequences = [[w2v_model.wv.key_to_index[word] for word in sentence if word in w2v_model.wv.key_to_index] for sentence in train_words]
test_sequences = [[w2v_model.wv.key_to_index[word] for word in sentence if word in w2v_model.wv.key_to_index] for sentence in test_words]

# Pad sequences to have the same length
max_length = max(max(len(train_seq), len(test_seq)) for train_seq, test_seq in zip(train_sequences, test_sequences))
train_sequences = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_sequences = pad_sequences(test_sequences, maxlen=max_length, padding='post')

In [7]:
unique_labels = np.unique(train_labels)
print(unique_labels)

[ 0.  1.  2. nan]


In [8]:
valid_indices = np.where(train_labels < 3)  # Filter out invalid labels
train_labels = train_labels[valid_indices]
train_sequences = train_sequences[valid_indices]

In [9]:
# Define the model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length, trainable=True))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(3, activation='softmax'))  # Assuming 3 classes: negative, neutral, positive

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Convert labels to one-hot vectors
train_labels_one_hot = to_categorical(train_labels, num_classes=3)
test_labels_one_hot = to_categorical(test_labels, num_classes=3)

# Train the model
model.fit(train_sequences, train_labels_one_hot, validation_data=(test_sequences, test_labels_one_hot), epochs=10, batch_size=16384)

IndexError: index -2147483648 is out of bounds for axis 1 with size 3

In [None]:
y_hat_LSTM=model.predict(test_sequences)

In [None]:
y_hat_LSTM

In [None]:
y_pred_LSTM=[]
for i in range(len(y_hat_LSTM)):
    y_pred_LSTM.append(np.argmax(y_hat_LSTM[i]))

In [None]:
# Calculate accuracy
accuracy = accuracy_score(test_labels, y_pred_LSTM)
print('Accuracy:', accuracy)

# Generate classification report
class_labels = ['negative', 'neutral', 'positive']
report = classification_report(test_labels, y_pred_LSTM, target_names=class_labels)
print('Classification Report:')
print(report)

# Generate confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
confusion_mat = confusion_matrix(test_labels, y_pred_LSTM)
sns.heatmap(confusion_mat,annot=True)

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import logging
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
df=pd.read_csv('twitter_data.csv')
df['category'] = df['category'].map({-1: 0, 0: 1, 1: 2})

In [None]:
token_var = ""
def token_text(text):
  marked_text = "[CLS] " + text + " [SEP]"

  # Tokenize our sentence with the BERT tokenizer.
  tokenized_text = tokenizer.tokenize(marked_text)

  # Print out the tokens.
 # print (tokenized_text)

  # Display the words with their indeces.
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
  for tup in zip(tokenized_text, indexed_tokens):
      print('{:<12} {:>6,}'.format(tup[0], tup[1])) 
  segments_ids = [1] * len(tokenized_text)

  print (segments_ids)

  # Convert inputs to PyTorch tensors
  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensors = torch.tensor([segments_ids])
  # Load pre-trained model (weights)
  model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
  model.eval()
  with torch.no_grad():

    outputs = model(tokens_tensor, segments_tensors)
    hidden_states = outputs[2]

  

In [None]:
token_text(df['clean_text'][0])

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Load and preprocess the dataset
dataset = ...  # Load your financial_news dataset here
sentences = df['clean_text'].tolist()
sentiment_labels = df['category'].tolist()

# Define the parameters
max_seq_length = 100  # Maximum sequence length
num_classes = 3  # Number of sentiment classes (positive, negative, neutral)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the sentences
input_ids = []
attention_masks = []

for sentence in sentences:
    encoded_dict = tokenizer.encode_plus(
        sentences,
        add_special_tokens=True,
        max_length=max_seq_length,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='tf'
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = tf.concat(input_ids, axis=0)
attention_masks = tf.concat(attention_masks, axis=0)
sentiment_labels = tf.one_hot(sentiment_labels, depth=num_classes)

# Split the dataset into training and testing sets
split_ratio = 0.8  # 80% for training, 20% for testing
split_index = int(len(input_ids) * split_ratio)

train_input_ids = input_ids[:split_index]
train_attention_masks = attention_masks[:split_index]
train_labels = sentiment_labels[:split_index]
test_input_ids = input_ids[split_index:]
test_attention_masks = attention_masks[split_index:]
test_labels = sentiment_labels[split_index:]

# Load the BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Freeze the BERT layers
bert_model.trainable = False

# Define the model architecture
input_ids_layer = Input(shape=(max_seq_length,), dtype=tf.int32)
attention_masks_layer = Input(shape=(max_seq_length,), dtype=tf.int32)
bert_output = bert_model(input_ids_layer, attention_mask=attention_masks_layer)[0]
pooling_layer = tf.keras.layers.GlobalAveragePooling1D()(bert_output)
output_layer = Dense(units=num_classes, activation='softmax')(pooling_layer)

model = Model(inputs=[input_ids_layer, attention_masks_layer], outputs=output_layer)
model.summary()

# Compile and train the model
model.compile(optimizer=Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(
    [train_input_ids, train_attention_masks],
    train_labels,
    epochs=3,
    batch_size=32,
    validation_data=([test_input_ids, test_attention_masks], test_labels)
)