# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
# Load the tweet dataset from the CSV file into a DataFrame
df = pd.read_csv('/content/Twitter_Data.csv')

# Replace the numeric sentiment codes of the dependent variable with
# readable class labels (codes missing from the mapping become NaN)
df['category'] = df['category'].map({
    0: 'Neutral',
    -1: 'Negative',
    1: 'Positive',
})

# Report the number of missing values in each column
print(df.isna().sum())

# Keep only the rows in which every column has a value
df = df.dropna(axis=0, how='any')

# Clean the tweet text: strip symbols, lower-case, then drop English stopwords
import re

# The corpus reader is imported under an alias so that the `stopwords` set
# built below does not shadow it.
from nltk.corpus import stopwords as _stopwords_corpus

# Build the English stopword set, downloading the corpus only when it is not
# installed yet (a fresh environment would otherwise raise LookupError).
try:
    stopwords = set(_stopwords_corpus.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopwords = set(_stopwords_corpus.words('english'))

# Matches every character that is neither a word character (letter, digit,
# underscore) nor whitespace; compiled once and reused for every tweet.
_NON_WORD_CHARS = re.compile(r'[^\w\s]')


def _clean_tweet(text):
    """Return *text* without symbols, lower-cased and with stopwords removed.

    The steps run in the same order as before: symbol removal, lower-casing,
    then whitespace tokenisation and stopword filtering. The surviving words
    are re-joined with single spaces.
    """
    # NOTE(review): apostrophes are already stripped at this point, so any
    # contraction entries in the stopword list (e.g. "don't") cannot match
    # their stripped form ("dont") -- confirm whether that is intended.
    text = _NON_WORD_CHARS.sub('', text).lower()
    return ' '.join(word for word in text.split() if word not in stopwords)


# astype(str) guards against non-string cells that read_csv may have inferred;
# missing values were already dropped, so no 'nan' strings are introduced.
df['Tweets'] = df['Tweets'].astype(str).apply(_clean_tweet)

# Create a new column holding the number of words left in each cleaned tweet
df['sentence_length'] = df['Tweets'].str.split().str.len()

# Split the data into independent (text) and dependent (label) variables
X = df['Tweets']
y = df['category']

# Integer-encode the text data: the tokenizer learns a word -> index
# vocabulary from the tweets and turns every tweet into a list of word
# indices, with the vocabulary capped by num_words=10000.
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)

# Pad the sequences at the end (padding='post') with zeros so that every
# row has exactly 250 entries; longer sequences are truncated to 250.
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_encoded = pad_sequences(
    tokenizer.texts_to_sequences(X),
    padding='post',
    maxlen=250,
)

# The corpus used to be tokenised a second time and the already padded array
# padded again, which produced exactly the same values; reuse the result.
X_padded = X_encoded

# Build an LSTM classifier: word embedding -> LSTM -> 3-way softmax
model = tf.keras.models.Sequential([
    # input_dim=10000 matches the tokenizer's num_words. mask_zero=True marks
    # index 0 (the value the sequences are padded with) as padding, so the
    # LSTM skips those timesteps instead of running over the long run of
    # trailing zeros before its final state reaches the classifier.
    tf.keras.layers.Embedding(input_dim=10000, output_dim=128, mask_zero=True),
    # 128 LSTM units with 20% dropout applied to the layer inputs
    tf.keras.layers.LSTM(128, dropout=0.2),
    # One probability per sentiment class
    tf.keras.layers.Dense(3, activation='softmax'),
])

# Compile the model; categorical cross-entropy expects one-hot encoded targets
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# One-hot encode the dependent variable (one column per sentiment label).
# An explicit float dtype is requested because recent pandas versions return
# boolean dummy columns by default, while the loss works on numeric targets.
y_dummy = pd.get_dummies(y, dtype='float32')

# Hold out 25% of the samples as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_dummy,
    test_size=0.25,
    random_state=42,
)

# Train the model on the training portion for 10 epochs
model.fit(X_train, y_train, epochs=10)

