In [3]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 1: Load Data
train_data = pd.read_csv(r'C:\Users\RAJVI\MACHINE LEARNING\multiple_LR\Homework\train_tweet.csv')
test_data = pd.read_csv(r'C:\Users\RAJVI\MACHINE LEARNING\multiple_LR\Homework\test_tweets.csv')

# Step 2: Data Preprocessing
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.strip()  # Remove extra spaces
    return text

train_data['cleaned_text'] = train_data['tweet'].apply(preprocess_text)
test_data['cleaned_text'] = test_data['tweet'].apply(preprocess_text)

# Step 3: Tokenization and Padding
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data['cleaned_text'])

X_train = tokenizer.texts_to_sequences(train_data['cleaned_text'])
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')
y_train = np.array(train_data['label'])

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 4: Create a Simple Model
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 5: Train the Model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

# Step 6: Evaluate the Model
val_predictions = (model.predict(X_val) > 0.5).astype(int)
accuracy = accuracy_score(y_val, val_predictions)
print(f'Validation Accuracy: {accuracy}')


Epoch 1/5




[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 25ms/step - accuracy: 0.9285 - loss: 0.2703 - val_accuracy: 0.9287 - val_loss: 0.2658
Epoch 2/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.9303 - loss: 0.2557 - val_accuracy: 0.9287 - val_loss: 0.2572
Epoch 3/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 27ms/step - accuracy: 0.9330 - loss: 0.2464 - val_accuracy: 0.9287 - val_loss: 0.2571
Epoch 4/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 29ms/step - accuracy: 0.9296 - loss: 0.2552 - val_accuracy: 0.9287 - val_loss: 0.2579
Epoch 5/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 32ms/step - accuracy: 0.9282 - loss: 0.2590 - val_accuracy: 0.9287 - val_loss: 0.2574
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step
Validation Accuracy: 0.9286719849835758
