In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('/content/training.csv')

In [3]:
df.shape

(16000, 2)

In [4]:
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [5]:
df = df[df['label'].isin([0, 1])]

In [6]:
df.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [7]:
df.dropna(inplace=True)
df.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [9]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,5362
0,4666


In [10]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import download

# Download necessary resources
download('stopwords')
download('wordnet')
download('omw-1.4')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
critical_terms = {"sad", "cry", "depressed", "hopeless",'am'}  # Add domain-specific terms
stop_words = stop_words - critical_terms

# Add domain-specific stopwords (example: 'life')
additional_stopwords = {'life', 'something', 'anything','aand','abt','ability', 'academic', 'able','account','advance'}  # Add more if needed
stop_words.update(additional_stopwords)

# Preprocessing Function
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # Remove URLs
    # Remove non-alphabetic characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove underscores
    text = re.sub(r'_+', '', text)
    # Remove excessive repeated characters (e.g., "aaaa" -> "aa")
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    # Remove short words (e.g., single characters)
    words = [word for word in words if len(word) > 2]

    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join cleaned words into a single string
    return ' '.join(words).strip()

# Apply Preprocessing to the Entire Dataset
corpus = [preprocess_text(text) for text in df['text']]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


NEW 1

In [22]:
# prompt: create me an lstm model with this dataset

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)  # Adjust num_words as needed
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

# Pad sequences to ensure uniform length
max_length = max([len(seq) for seq in sequences]) # Get the length of the longest sequence
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Prepare labels
labels = np.array(df['label'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


# Define the LSTM model
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size
embedding_dim = 100  # Embedding dimension (adjust as needed)

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(LSTM(128))  # LSTM layer with 128 units
model.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid activation for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.1)  # Adjust epochs and batch_size


# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Test Accuracy:', accuracy)

# Example prediction (you'll need to preprocess new text)
# new_text = "This is a sample text for testing"
# new_sequence = tokenizer.texts_to_sequences([preprocess_text(new_text)])
# padded_new_sequence = pad_sequences(new_sequence, maxlen=max_length, padding='post')
# prediction = model.predict(padded_new_sequence)
# print(f"Prediction for new text: {prediction}")

Epoch 1/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.5404 - loss: 0.6875 - val_accuracy: 0.8867 - val_loss: 0.3065
Epoch 2/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9378 - loss: 0.1888 - val_accuracy: 0.9390 - val_loss: 0.1610
Epoch 3/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9918 - loss: 0.0347 - val_accuracy: 0.9402 - val_loss: 0.1568
Epoch 4/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9931 - loss: 0.0280 - val_accuracy: 0.9440 - val_loss: 0.2427
Epoch 5/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9973 - loss: 0.0129 - val_accuracy: 0.9452 - val_loss: 0.1733
Epoch 6/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9974 - loss: 0.0120 - val_accuracy: 0.9402 - val_loss: 0.3044
Epoch 7/20
[1m113/113[0m 

In [42]:
# prompt: function to accept string as input and give prediction along with input from user

def predict_sentiment(text):
    """
    Predicts the sentiment of a given text using the trained LSTM model.
    """
    new_sequence = tokenizer.texts_to_sequences([preprocess_text(text)])
    padded_new_sequence = pad_sequences(new_sequence, maxlen=max_length, padding='post')
    prediction = model.predict(padded_new_sequence)
    sentiment = "Positive" if prediction[0][0] > 0.5 else "Negative"  # Classify as positive or negative
    return sentiment, prediction[0][0] # Return sentiment and prediction probability


# Example usage:
user_input = input("Enter a text: ")
sentiment, probability = predict_sentiment(user_input)
print(f"Input Text: {user_input}")
print(f"Predicted Sentiment: {sentiment}")
print(f"Prediction Probability: {probability:.4f}") # Display probability with 4 decimal places

Enter a text: Nothing seems to work out for me lately.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Input Text: Nothing seems to work out for me lately.
Predicted Sentiment: Negative
Prediction Probability: 0.2721
