In [4]:
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import LabelEncoder
#from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
df = pd.read_csv("Tweets.csv")
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,5.70306e+17,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,24/02/2015 11:35,,Eastern Time (US & Canada)
1,5.70301e+17,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,24/02/2015 11:15,,Pacific Time (US & Canada)
2,5.70301e+17,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,24/02/2015 11:15,Lets Play,Central Time (US & Canada)
3,5.70301e+17,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,24/02/2015 11:15,,Pacific Time (US & Canada)
4,5.70301e+17,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,24/02/2015 11:14,,Pacific Time (US & Canada)


In [6]:
# Taking the neccessary columns for sentiment analysis
df = df[['text','negativereason','airline_sentiment']]
df.head()

Unnamed: 0,text,negativereason,airline_sentiment
0,@VirginAmerica What @dhepburn said.,,neutral
1,@VirginAmerica plus you've added commercials t...,,positive
2,@VirginAmerica I didn't today... Must mean I n...,,neutral
3,@VirginAmerica it's really aggressive to blast...,Bad Flight,negative
4,@VirginAmerica and it's a really big bad thing...,Can't Tell,negative


In [7]:
# creating new column tweet using text and negative reason column of dataset
df['Tweet'] = df['text'] + df['negativereason'].fillna(' ')
df.head()

Unnamed: 0,text,negativereason,airline_sentiment,Tweet
0,@VirginAmerica What @dhepburn said.,,neutral,@VirginAmerica What @dhepburn said.
1,@VirginAmerica plus you've added commercials t...,,positive,@VirginAmerica plus you've added commercials t...
2,@VirginAmerica I didn't today... Must mean I n...,,neutral,@VirginAmerica I didn't today... Must mean I n...
3,@VirginAmerica it's really aggressive to blast...,Bad Flight,negative,@VirginAmerica it's really aggressive to blast...
4,@VirginAmerica and it's a really big bad thing...,Can't Tell,negative,@VirginAmerica and it's a really big bad thing...


In [8]:
# Function the remove the special charectors stop words and clean the tweets
def clean(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^\w\s]','',text)
    text = text.lower()
    return text

In [9]:
df['cleaned_tweet'] = df['Tweet'].apply(clean)

In [10]:
df.head()

Unnamed: 0,text,negativereason,airline_sentiment,Tweet,cleaned_tweet
0,@VirginAmerica What @dhepburn said.,,neutral,@VirginAmerica What @dhepburn said.,what said
1,@VirginAmerica plus you've added commercials t...,,positive,@VirginAmerica plus you've added commercials t...,plus youve added commercials to the experienc...
2,@VirginAmerica I didn't today... Must mean I n...,,neutral,@VirginAmerica I didn't today... Must mean I n...,i didnt today must mean i need to take anothe...
3,@VirginAmerica it's really aggressive to blast...,Bad Flight,negative,@VirginAmerica it's really aggressive to blast...,its really aggressive to blast obnoxious ente...
4,@VirginAmerica and it's a really big bad thing...,Can't Tell,negative,@VirginAmerica and it's a really big bad thing...,and its a really big bad thing about itcant tell


In [11]:
# Encoding categorical values using LabelEncoder
encoder = LabelEncoder()
df['airline_sentiment'] = encoder.fit_transform(df['airline_sentiment'])

In [12]:
df.head()

Unnamed: 0,text,negativereason,airline_sentiment,Tweet,cleaned_tweet
0,@VirginAmerica What @dhepburn said.,,1,@VirginAmerica What @dhepburn said.,what said
1,@VirginAmerica plus you've added commercials t...,,2,@VirginAmerica plus you've added commercials t...,plus youve added commercials to the experienc...
2,@VirginAmerica I didn't today... Must mean I n...,,1,@VirginAmerica I didn't today... Must mean I n...,i didnt today must mean i need to take anothe...
3,@VirginAmerica it's really aggressive to blast...,Bad Flight,0,@VirginAmerica it's really aggressive to blast...,its really aggressive to blast obnoxious ente...
4,@VirginAmerica and it's a really big bad thing...,Can't Tell,0,@VirginAmerica and it's a really big bad thing...,and its a really big bad thing about itcant tell


In [13]:
# setting Feature and Target Variable to x and y
x = df['cleaned_tweet']
y = df['airline_sentiment']

In [14]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [15]:
vectorizer = CountVectorizer()
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

In [19]:
inputshape=x_train_vec.shape[1]
print(inputshape)

16067


In [18]:
x_train_vc= x_train_vec.toarray()
x_test_vc= x_test_vec.toarray()

In [20]:
model= Sequential([
    Dense(1000,activation='relu',input_shape=(inputshape,)),
    Dense(500,activation='relu'),
    Dense(100,activation='relu'),
    Dense(3,activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
model.summary()

In [23]:
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [25]:
model.fit(x_train_vc,y_train,epochs=10,batch_size=100,validation_data=(x_test_vc,y_test))

Epoch 1/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 471ms/step - accuracy: 0.8976 - loss: 0.2875 - val_accuracy: 0.9037 - val_loss: 0.2641
Epoch 2/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 455ms/step - accuracy: 0.9757 - loss: 0.0722 - val_accuracy: 0.9020 - val_loss: 0.3213
Epoch 3/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 460ms/step - accuracy: 0.9923 - loss: 0.0271 - val_accuracy: 0.8876 - val_loss: 0.3996
Epoch 4/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 474ms/step - accuracy: 0.9961 - loss: 0.0173 - val_accuracy: 0.8859 - val_loss: 0.4209
Epoch 5/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 457ms/step - accuracy: 0.9957 - loss: 0.0133 - val_accuracy: 0.8938 - val_loss: 0.4126
Epoch 6/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 463ms/step - accuracy: 0.9958 - loss: 0.0117 - val_accuracy: 0.8952 - val_loss: 0.4620
Epoch 7/10

<keras.src.callbacks.history.History at 0x7cf0b9c4a350>