In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [59]:
data = pd.read_csv("twitter_training.csv", header=None)

In [60]:
data.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [61]:
data.columns = ['candidate', 'region', 'sentiment', 'text']

In [62]:
data.isna().sum()

candidate      0
region         0
sentiment      0
text         686
dtype: int64

In [63]:
def preprocess_text(text):
    
    if isinstance(text, str):
        text = text.lower()
        # Add additional cleaning steps as needed
        # Remove punctuation, stop words, etc.
    else:
        # If the text is NaN, replace it with an empty string or handle as per your requirements
        text = ''
    return text

data['text'] = data['text'].apply(preprocess_text)

In [64]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenize and pad the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
MAX_SEQUENCE_LENGTH = 100

X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=MAX_SEQUENCE_LENGTH)
X_val = pad_sequences(tokenizer.texts_to_sequences(val_data['text']), maxlen=MAX_SEQUENCE_LENGTH)

# Map sentiment labels to numerical values
sentiment_mapping = {'Positive': 0, 'Neutral': 1, 'Negative': 2, 'Irrelevant':3}
train_labels = train_data['sentiment'].map(sentiment_mapping)
val_labels = val_data['sentiment'].map(sentiment_mapping)

In [65]:
data['sentiment'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [66]:
# Define the deep learning model
EMBEDDING_DIM = 50
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100))
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display the model summary
model.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 50)           1612200   
                                                                 
 spatial_dropout1d_3 (Spati  (None, 100, 50)           0         
 alDropout1D)                                                    
                                                                 
 lstm_3 (LSTM)               (None, 100)               60400     
                                                                 
 dense_3 (Dense)             (None, 3)                 303       
                                                                 
Total params: 1672903 (6.38 MB)
Trainable params: 1672903 (6.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [68]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Unique sentiment classes in the entire dataset
unique_sentiments = data['sentiment'].unique()

# Map sentiment labels to numerical values
sentiment_mapping = {sentiment: idx for idx, sentiment in enumerate(unique_sentiments)}
train_labels = train_data['sentiment'].map(sentiment_mapping).astype(int)
val_labels = val_data['sentiment'].map(sentiment_mapping).astype(int)

# Convert labels to categorical format
from tensorflow.keras.utils import to_categorical
num_classes = len(unique_sentiments)

train_labels = to_categorical(train_labels, num_classes=num_classes)
val_labels = to_categorical(val_labels, num_classes=num_classes)

# deep learning model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100))
model.add(Dense(num_classes, activation='softmax'))  # Adjust the output layer

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, train_labels, epochs=10, batch_size=64,
                    validation_data=(X_val, val_labels), callbacks=[EarlyStopping(patience=3)])

# Evaluate the model
eval_metrics = model.evaluate(X_val, val_labels)
print(f"Validation Accuracy: {eval_metrics[1]*100:.2f}%")


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Validation Accuracy: 88.79%
