<a href="https://colab.research.google.com/github/UdayBattula/Sentiment-analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
nltk.download('stopwords')

# Dataset URLs (IMDb reviews and Twitter sentiment datasets)
imdb_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
twitter_url = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/train_text.txt"

# Loading IMDb dataset (simplified)
!wget {imdb_url} -O imdb_data.tar.gz
!tar -xzf imdb_data.tar.gz  # Extracts dataset

# Loading Twitter sentiment dataset
twitter_data = pd.read_csv(twitter_url, delimiter='\t', names=['text'])
print("Twitter Dataset Sample:", twitter_data.head())

# Preprocessing example function
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in text.split() if word.lower() not in stop_words]
    return ' '.join(tokens)

# Applying preprocessing on Twitter dataset
twitter_data['cleaned_text'] = twitter_data['text'].apply(preprocess_text)
print("Preprocessed Twitter Data Sample:", twitter_data.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


--2024-10-30 08:01:07--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘imdb_data.tar.gz’


2024-10-30 08:01:19 (7.23 MB/s) - ‘imdb_data.tar.gz’ saved [84125825/84125825]

Twitter Dataset Sample:                                                 text
0  QT @user In the original draft of the 7th book...
1  Ben Smith / Smith (concussion) remains out of ...
2  Sorry bout the stream last night I crashed out...
3  Chase Headley's RBI double in the 8th inning o...
4  @user Alciato: Bee will invest 150 million in ...
Preprocessed Twitter Data Sample:                                                 text  \
0  QT @user In the original draft of the 7th book...   
1  Ben Smith / Smith (concussion) remains out of ...   
2  Sorry bout the stream

In [None]:
# Import necessary libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

# Assuming you have loaded `twitter_data` DataFrame with 'text' and 'cleaned_text' columns

# Step 1: Add a dummy 'label' column (0 = negative, 1 = positive) for demonstration purposes
# (Replace with actual labels if available)
import numpy as np
twitter_data['label'] = np.random.randint(0, 2, size=len(twitter_data))

# Step 2: Splitting the Twitter dataset into training and testing
X = twitter_data['cleaned_text']
y = twitter_data['label']

# Vectorizing for Naive Bayes
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Naive Bayes Model Training
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# Evaluation for Naive Bayes
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))

# Preparing data for LSTM
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=100)

# Splitting padded sequences
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# LSTM Model Definition
lstm_model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=100),
    LSTM(units=128),
    Dense(1, activation='sigmoid')
])

# Compiling and Training LSTM Model
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_lstm, y_train_lstm, epochs=5, batch_size=32, validation_data=(X_test_lstm, y_test_lstm))

# Evaluation for LSTM
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_lstm, y_test_lstm)
print("LSTM Accuracy:", lstm_accuracy)


Naive Bayes Accuracy: 0.4981443298969072
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.49      0.49      3634
           1       0.50      0.50      0.50      3641

    accuracy                           0.50      7275
   macro avg       0.50      0.50      0.50      7275
weighted avg       0.50      0.50      0.50      7275





Epoch 1/5
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 160ms/step - accuracy: 0.4952 - loss: 0.6936 - val_accuracy: 0.5053 - val_loss: 0.6933
Epoch 2/5
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 157ms/step - accuracy: 0.5717 - loss: 0.6797 - val_accuracy: 0.4907 - val_loss: 0.7045
Epoch 3/5
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 155ms/step - accuracy: 0.6465 - loss: 0.6260 - val_accuracy: 0.4965 - val_loss: 0.7458
Epoch 4/5
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 156ms/step - accuracy: 0.7058 - loss: 0.5657 - val_accuracy: 0.4959 - val_loss: 0.7835
Epoch 5/5
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 156ms/step - accuracy: 0.7520 - loss: 0.4963 - val_accuracy: 0.4980 - val_loss: 0.8897
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 53ms/step - accuracy: 0.4966 - loss: 0.9018
LSTM Accuracy: 0.4980068802833557


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example Twitter data creation (replace this with your actual data loading code)
data = {
    'cleaned_text': ["I love this!", "This is bad", "Amazing work", "Not good at all", "I'm happy", "I am sad"],
    'label': [1, 0, 1, 0, 1, 0]  # 1 for positive, 0 for negative
}
twitter_data = pd.DataFrame(data)

# Data Preparation
X = twitter_data['cleaned_text']
y = twitter_data['label']

# Tokenization and Padding for GRU Model
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=100)

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# GRU Model Definition
gru_model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=100),
    GRU(units=128),
    Dense(1, activation='sigmoid')
])

# Compiling the GRU Model
gru_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training the GRU Model
gru_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluation for GRU
loss, accuracy = gru_model.evaluate(X_test, y_test)
print("GRU Model Accuracy:", accuracy)
print("GRU Model Loss:", loss)

# Optional: Classification Report
y_pred = (gru_model.predict(X_test) > 0.5).astype("int32")
print("Classification Report:\n", classification_report(y_test, y_pred))


Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.5000 - loss: 0.6909 - val_accuracy: 0.0000e+00 - val_loss: 0.6947
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - accuracy: 0.7500 - loss: 0.6832 - val_accuracy: 0.0000e+00 - val_loss: 0.6958
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - accuracy: 1.0000 - loss: 0.6754 - val_accuracy: 0.0000e+00 - val_loss: 0.6970
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - accuracy: 1.0000 - loss: 0.6674 - val_accuracy: 0.0000e+00 - val_loss: 0.6984
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - accuracy: 1.0000 - loss: 0.6589 - val_accuracy: 0.0000e+00 - val_loss: 0.6998
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - accuracy: 1.0000 - loss: 0.6498 - val_accuracy: 0.0000e+00 - val_loss: 0.7014
Epoch 7/10
[1m1/1[0m [32m━━

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import requests
import tarfile
import os

# Ensure nltk stopwords are downloaded
nltk.download('stopwords')

# Dataset URLs (IMDb reviews and Twitter sentiment datasets)
imdb_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
twitter_url = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/train_text.txt"

# Function to download and extract the IMDb dataset
def download_and_extract_imdb(url):
    if not os.path.exists('aclImdb'):
        response = requests.get(url)
        with open('imdb_data.tar.gz', 'wb') as file:
            file.write(response.content)
        with tarfile.open('imdb_data.tar.gz', 'r:gz') as tar:
            tar.extractall()
        print("IMDb dataset downloaded and extracted.")
    else:
        print("IMDb dataset already exists.")

# Downloading the IMDb dataset
download_and_extract_imdb(imdb_url)

# Load IMDb dataset
imdb_train_data = pd.read_csv('aclImdb/train/labeledBow.feat', sep='\t', header=None)

# Initialize empty lists for labels and text
labels = []
texts = []

# Process each row to extract labels and the corresponding text
for index, row in imdb_train_data.iterrows():
    parts = row[0].split(' ')
    label = int(parts[0])  # The first element is the label
    text = ' '.join(parts[1:])  # The rest is the feature representation

    labels.append(1 if label > 0 else 0)  # 1 for positive, 0 for negative
    texts.append(text)

# Create a DataFrame with the extracted labels and text
imdb_train_data = pd.DataFrame({'label': labels, 'text': texts})

# Check the processed data
print("Processed IMDb Dataset Sample:\n", imdb_train_data.head())

# Loading Twitter sentiment dataset
twitter_data = pd.read_csv(twitter_url, delimiter='\t', names=['text', 'label'])
twitter_data['label'] = twitter_data['label'].map({'positive': 1, 'negative': 0})

# Combining datasets
combined_data = pd.concat([imdb_train_data, twitter_data], ignore_index=True)

# Preprocessing example function
def preprocess_text(text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [word for word in text.lower().split() if word.isalnum() and word not in stop_words]
    return ' '.join(tokens)

# Applying preprocessing on combined dataset
combined_data['cleaned_text'] = combined_data['text'].apply(preprocess_text)

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(combined_data['cleaned_text'], combined_data['label'], test_size=0.2, random_state=42)

# Tokenization and padding
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

# Building the GRU model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
model.add(GRU(128, return_sequences=True))
model.add(GlobalAveragePooling1D())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the model
model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Evaluating the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


IMDb dataset already exists.
Processed IMDb Dataset Sample:
    label                                               text
0      1  0:9 1:1 2:4 3:4 4:6 5:4 6:2 7:2 8:4 10:4 12:2 ...
1      1  0:7 1:4 2:2 3:2 5:4 6:1 8:2 9:2 14:1 16:1 18:1...
2      1  0:4 1:4 2:4 3:7 4:2 5:1 6:1 7:1 9:1 10:1 13:1 ...
3      1  0:10 1:2 2:2 4:3 5:2 6:4 7:2 9:1 10:4 11:1 16:...
4      1  0:13 1:9 2:6 3:4 4:2 5:5 6:10 7:6 9:2 10:3 11:...
Epoch 1/5




[1m1381/1381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 175ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 2/5
[1m1381/1381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 172ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 3/5
[1m1381/1381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 172ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 4/5
[1m1381/1381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 172ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 5/5
[1m1381/1381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 172ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
[1m384/384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 34ms/step - accuracy: 0.0000e+00 - loss: nan
Test Accuracy: 0.0000


In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding, Dropout, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from google.colab import files

# Ensure nltk stopwords and punkt tokenizer are downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Upload the dataset
uploaded = files.upload()

# Get the filename from the uploaded files
filename = next(iter(uploaded))

# Load the dataset from the uploaded file
imdb_train_data = pd.read_csv(filename)

# Display the first few rows and column names to understand the structure
print(imdb_train_data.head())
print("Column names:", imdb_train_data.columns)

# Define column names based on the actual DataFrame structure
label_column = 'sentiment'  # Column containing the labels
text_column = 'text'        # Column containing the text

# Ensure the specified columns exist in the DataFrame
if label_column not in imdb_train_data.columns or text_column not in imdb_train_data.columns:
    raise ValueError(f"Check your column names: {imdb_train_data.columns}")

# Extract labels and features
labels = imdb_train_data[label_column].map({'positive': 1, 'negative': 0})  # Assuming sentiments are 'positive' and 'negative'
texts = imdb_train_data[text_column]

# Create DataFrame
imdb_train_data = pd.DataFrame({'label': labels, 'text': texts})

# Text preprocessing function
def preprocess_text(text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = nltk.word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(tokens)

# Preprocess the dataset
imdb_train_data['cleaned_text'] = imdb_train_data['text'].apply(preprocess_text)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(imdb_train_data['cleaned_text'], imdb_train_data['label'], test_size=0.2, random_state=42)

# Tokenization and padding
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

# Building the GRU model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
model.add(GRU(128, return_sequences=True))
model.add(Dropout(0.5))  # Add dropout for regularization
model.add(GlobalAveragePooling1D())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the model
model.fit(X_train_pad, y_train, epochs=10, batch_size=64, validation_split=0.1)

# Evaluating the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Saving sentiment_analysis.csv to sentiment_analysis (6).csv
   Year  Month  Day Time of Tweet  \
0  2018      8   18       morning   
1  2018      8   18          noon   
2  2017      8   18         night   
3  2022      6    8       morning   
4  2022      6    8          noon   

                                                text sentiment     Platform  
0              What a great day!!! Looks like dream.  positive    Twitter    
1     I feel sorry, I miss you here in the sea beach  positive    Facebook   
2                                     Don't angry me  negative     Facebook  
3  We attend in the class just for listening teac...  negative    Facebook   
4                  Those who want to go, let them go  negative   Instagram   
Column names: Index(['Year', 'Month', 'Day', 'Time of Tweet', 'text', 'sentiment',
       'Platform'],
      dtype='object')
Epoch 1/10




[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 303ms/step - accuracy: 0.2752 - loss: nan - val_accuracy: 0.2000 - val_loss: nan
Epoch 2/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 244ms/step - accuracy: 0.2718 - loss: nan - val_accuracy: 0.2000 - val_loss: nan
Epoch 3/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 241ms/step - accuracy: 0.2593 - loss: nan - val_accuracy: 0.2000 - val_loss: nan
Epoch 4/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 242ms/step - accuracy: 0.2466 - loss: nan - val_accuracy: 0.2000 - val_loss: nan
Epoch 5/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 241ms/step - accuracy: 0.2332 - loss: nan - val_accuracy: 0.2000 - val_loss: nan
Epoch 6/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 456ms/step - accuracy: 0.2457 - loss: nan - val_accuracy: 0.2000 - val_loss: nan
Epoch 7/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 261ms/step - a