In [1]:
import pandas as pd
import numpy as np
import re # Regex Library for removing tags and hyperlinks in tweets
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords # Stopwords corpus from nltk library
from nltk.stem import PorterStemmer # Stemmer to perform stemming on tokens
from nltk.tokenize import word_tokenize # Tokenizer to make tokens
import pickle # To save vectorizer in form of pickle file
from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF Vectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rudrransh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rudrransh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rudrransh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the original CSV file
df = pd.read_csv('Tweet_Data\original_tweet_data.csv', encoding='latin-1', header = None)

In [3]:
# Have a look at df
df.head

<bound method NDFrame.head of          0           1                             2         3   
0        0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  \
1        0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2        0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3        0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4        0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
...     ..         ...                           ...       ...   
1599995  4  2193601966  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599996  4  2193601969  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599997  4  2193601991  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599998  4  2193602064  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599999  4  2193602129  Tue Jun 16 08:40:50 PDT 2009  NO_QUERY   

                       4                                                  5  
0        _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1          scotthamil

In [4]:
# See columns of the df
df.columns

Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [5]:
# Set of stopwords
stopwords = set(stopwords.words('english'))

# Stemmer object
stemmer = PorterStemmer()

# Function to preprocess input tweets and return tokens
def preprocess_text(text):
    # Remove '@user' tags
    text = re.sub(r'@[^\s]+', '', text)

    # Remove hyperlinks
    text = re.sub(r'http\S+', '', text)

    # Lowercasing
    text = text.lower()

    # Punctuation removal
    text = re.sub(r'[^\w\s]', '', text)

    # Stopword removal and stemming
    tokens = word_tokenize(text)
    #tokens = [token for token in tokens if token not in stopwords]
    tokens = [stemmer.stem(token) for token in tokens] 

    return ' '.join(tokens)


In [6]:
# Apply preprocessing to 'text' column
df[6] = df[5].apply(preprocess_text)

# Save the tokens and 'target' column to a new CSV file
df[[0,6]].to_csv('Tweet_Data\\tokens_vs_target.csv', index=False)

In [7]:
# Deletes previous dataframe object to free up memory
del df

In [8]:
# Read the tokens file
df = pd.read_csv("Tweet_Data\\tokens_vs_target.csv", header = None)

In [9]:
# Having a look at tokens
df.head

<bound method NDFrame.head of          0                                                  1
0        0                                                  6
1        0  awww that a bummer you shoulda got david carr ...
2        0  is upset that he cant updat hi facebook by tex...
3        0  i dive mani time for the ball manag to save 50...
4        0       my whole bodi feel itchi and like it on fire
...     ..                                                ...
1599996  4  just woke up have no school is the best feel ever
1599997  4   thewdbcom veri cool to hear old walt interview â
1599998  4  are you readi for your mojo makeov ask me for ...
1599999  4  happi 38th birthday to my boo of alll time tup...
1600000  4                               happi charitytuesday

[1600001 rows x 2 columns]>

In [10]:
# Drop rows with empty token values
df = df.dropna(subset=[1])

df.shape

(1596865, 2)

In [11]:
# Retrieve column 1 and convert it to a list of token lists
token_lists = df[1]

token_lists

0                                                          6
1          awww that a bummer you shoulda got david carr ...
2          is upset that he cant updat hi facebook by tex...
3          i dive mani time for the ball manag to save 50...
4               my whole bodi feel itchi and like it on fire
                                 ...                        
1599996    just woke up have no school is the best feel ever
1599997     thewdbcom veri cool to hear old walt interview â
1599998    are you readi for your mojo makeov ask me for ...
1599999    happi 38th birthday to my boo of alll time tup...
1600000                                 happi charitytuesday
Name: 1, Length: 1596865, dtype: object

In [12]:
# Perform TF-IDF encoding and create vectors of size 50
vectorizer = TfidfVectorizer(max_features=50)
vectorizer.fit(token_lists)

# Save the fitted vectorizer using pickle for future use
with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)


In [13]:
# Transform the tokens to vectors
tfidf_vectors = vectorizer.transform(token_lists).toarray()

# Create a new DataFrame with the vectors and target
new_df = pd.DataFrame(tfidf_vectors)
new_df.insert(0, "target", df[0])

# Save the new DataFrame to a new CSV file
new_df.to_csv('Tweet_Data\\vectors_vs_tokens.csv', index=False)

In [14]:
# Having a look at vectors dataframe
new_df

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.400474,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.311545,0.000000
2,0.0,0.0,0.331252,0.000000,0.0,0.0,0.0,0.0,0.506489,0.000000,...,0.499945,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
3,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
4,0.0,0.0,0.395538,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1596860,4.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.466747,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1596861,4.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1596862,4.0,0.0,0.000000,0.427625,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.312176,0.420081
1596863,4.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000


In [15]:
# Deletes dataframe objects to free up memory
del df
del new_df

In [16]:
# Reads the vectors datafile
df = pd.read_csv('Tweet_Data\\vectors_vs_tokens.csv')

In [17]:
# Having a look at vectors dataframe
df

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.400474,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.311545,0.000000
2,0.0,0.0,0.331252,0.000000,0.0,0.0,0.0,0.0,0.506489,0.000000,...,0.499945,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
3,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
4,0.0,0.0,0.395538,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1596860,4.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.466747,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1596861,4.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1596862,4.0,0.0,0.000000,0.427625,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.312176,0.420081
1596863,4.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000


In [18]:
# Checking columns
df.columns

Index(['target', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35',
       '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47',
       '48', '49'],
      dtype='object')

In [19]:
# Extract the 'target' column as y_data in int type
y_data = df['target'].values.astype(int)

# Extract the remaining 50 columns as x_data
x_data = df.drop('target', axis=1).values


In [20]:
y_data

array([0, 0, 0, ..., 4, 4, 4])

In [21]:
# There were some garbage values in y_data when converting every value to int. Hence they were removed and marked as 2(neutral)
for i in range(len(y_data)):
    if y_data[i] not in [0, 2, 4]:
        y_data[i] = 2

In [22]:
y_data.shape

(1596865,)

In [23]:
x_data.shape

(1596865, 50)

In [24]:
x_data

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.31154506,
        0.        ],
       [0.        , 0.33125169, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.42762522, ..., 0.        , 0.31217624,
        0.42008136],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [25]:
# Split the data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, random_state=0)


In [26]:
x_train.shape

(1277492, 50)

In [27]:
y_train.shape

(1277492,)

In [28]:
# Defined the model used. 
model = Sequential()
model.add(BatchNormalization()) # Normalise the input vectors
model.add(Dense(256, activation='relu',input_shape = [50])) # Layer with 256 Neurons
model.add(Dropout(0.3)) # Leaves out 30 % data to prevent overfitting
model.add(BatchNormalization()) # Normalize the input vectors
model.add(Dense(256,activation='relu')) # Layer with 256 neurons
model.add(Dense(5, activation = 'softmax')) # Gives output in form of probability of being a value between 0 to 4

# Compile the model
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics=['accuracy'])



In [29]:
# Train the model
model.fit(x_train, y_train, batch_size=1000, epochs=10)

# Saves the model for future use
model.save('trained_model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
# Loads the saved model
model = keras.models.load_model('trained_model.h5')

In [31]:
# Make predictions on validation data
y_pred = model.predict(x_val)



In [32]:
# Convert probabilities to predicted labels
y_pred_labels = np.argmax(y_pred, axis=1)

# Print the resulting array
y_pred_labels

array([4, 4, 0, ..., 4, 0, 4], dtype=int64)

In [33]:
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred_labels)

# Calculate precision
precision = precision_score(y_val, y_pred_labels, average='weighted')

# Calculate recall
recall = recall_score(y_val, y_pred_labels, average='weighted')

# Calculate F1-score
f1 = f1_score(y_val, y_pred_labels, average='weighted')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.6571563657541495
Precision: 0.6559409706992578
Recall: 0.6571563657541495
F1-score: 0.656521095440594


  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
# Testing with a random input
input_text = 'I had a bad day at college today.'
# Preprocess the input text
input_tokens = preprocess_text(input_text)
# Display tokens
input_tokens

'i had a bad day at colleg today'

In [35]:
# Load the saved vectorizer from the file
with open('vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)
# Transform tokens to vector    
input_vector = vectorizer.transform([input_tokens]).toarray()
# Display the input vector
input_vector

array([[0.        , 0.        , 0.        , 0.54776373, 0.        ,
        0.        , 0.        , 0.        , 0.56198932, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.61977649, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

In [36]:
# Makes prediction on the input vector
output = np.argmax(model.predict([input_vector]), axis=1)
# Display output label
output



array([4], dtype=int64)