In [1]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rudrransh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rudrransh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rudrransh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the original CSV file
df = pd.read_csv('Tweet_Data\original_tweet_data.csv', encoding='latin-1', header = None)

In [3]:
df.head

<bound method NDFrame.head of          0           1                             2         3   
0        0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  \
1        0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2        0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3        0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4        0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
...     ..         ...                           ...       ...   
1599995  4  2193601966  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599996  4  2193601969  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599997  4  2193601991  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599998  4  2193602064  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599999  4  2193602129  Tue Jun 16 08:40:50 PDT 2009  NO_QUERY   

                       4                                                  5  
0        _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1          scotthamil

In [4]:
df.columns

Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [5]:
# Perform preprocessing steps
stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()


In [6]:
def preprocess_text(text):
    # Remove '@user' tags
    text = re.sub(r'@[^\s]+', '', text)

    # Remove hyperlinks
    text = re.sub(r'http\S+', '', text)

    # Lowercasing
    text = text.lower()

    # Punctuation removal
    text = re.sub(r'[^\w\s]', '', text)

    # Stopword removal and stemming
    tokens = word_tokenize(text)
    #tokens = [token for token in tokens if token not in stopwords]
    tokens = [stemmer.stem(token) for token in tokens] 

    return ' '.join(tokens)


In [7]:
# Apply preprocessing to 'text' column
df[6] = df[5].apply(preprocess_text)

# Save the tokens and 'target' column to a new CSV file
df[[0,6]].to_csv('Tweet_Data\\tokens_vs_target.csv', index=False)

In [8]:
del df

In [9]:
df = pd.read_csv("Tweet_Data\\tokens_vs_target.csv", header = None)

In [10]:
df.head

<bound method NDFrame.head of          0                                                  1
0        0                                                  6
1        0  awww that bummer shoulda got david carr third day
2        0  upset cant updat facebook text might cri resul...
3        0    dive mani time ball manag save 50 rest go bound
4        0                    whole bodi feel itchi like fire
...     ..                                                ...
1599996  4                         woke school best feel ever
1599997  4           thewdbcom cool hear old walt interview â
1599998  4                       readi mojo makeov ask detail
1599999  4  happi 38th birthday boo alll time tupac amaru ...
1600000  4                               happi charitytuesday

[1600001 rows x 2 columns]>

In [13]:
# Drop rows with empty values in column 1
df = df.dropna(subset=[1])

df.shape

(1593209, 2)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Retrieve column 1 and convert it to a list of token lists
token_lists = df[1]

token_lists

0                                                          6
1          awww that bummer shoulda got david carr third day
2          upset cant updat facebook text might cri resul...
3            dive mani time ball manag save 50 rest go bound
4                            whole bodi feel itchi like fire
                                 ...                        
1599996                           woke school best feel ever
1599997             thewdbcom cool hear old walt interview â
1599998                         readi mojo makeov ask detail
1599999    happi 38th birthday boo alll time tupac amaru ...
1600000                                 happi charitytuesday
Name: 1, Length: 1593209, dtype: object

In [15]:
# Perform TF-IDF encoding and create vectors of size 50
vectorizer = TfidfVectorizer(max_features=50)
vectorizer.fit(token_lists)


In [16]:
tfidf_vectors = vectorizer.transform(token_lists).toarray()

# Create a new DataFrame with the vectors and column 0
new_df = pd.DataFrame(tfidf_vectors)
new_df.insert(0, "target", df[0])

# Save the new DataFrame to a new CSV file
new_df.to_csv('Tweet_Data\\vectors_vs_tokens.csv', index=False)

In [17]:
new_df

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.506993,0.0,0.00000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.711692,0.0,0.000000,0.0,0.00000,0.0,0.000000,...,0.000000,0.702491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,0.637816,...,0.770189,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.74615,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593204,4.0,0.0,0.0,0.000000,0.0,0.000000,0.0,1.00000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1593205,4.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1593206,4.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1593207,4.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,0.000000,...,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
del df
del new_df

In [19]:
df = pd.read_csv('Tweet_Data\\vectors_vs_tokens.csv')

df.head

<bound method NDFrame.head of          target    0    1         2    3         4    5        6    7   
0           0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.00000  0.0  \
1           0.0  0.0  0.0  0.000000  0.0  0.506993  0.0  0.00000  0.0   
2           0.0  0.0  0.0  0.711692  0.0  0.000000  0.0  0.00000  0.0   
3           0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.00000  0.0   
4           0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.74615  0.0   
...         ...  ...  ...       ...  ...       ...  ...      ...  ...   
1593204     4.0  0.0  0.0  0.000000  0.0  0.000000  0.0  1.00000  0.0   
1593205     4.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.00000  0.0   
1593206     4.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.00000  0.0   
1593207     4.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.00000  0.0   
1593208     4.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.00000  0.0   

                8  ...        40        41   42   43   44   45   46   47   48   
0        0.0

In [20]:
df.columns

Index(['target', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35',
       '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47',
       '48', '49'],
      dtype='object')

In [34]:
# Extract the 'target' column as y_data
y_data = df['target'].values.astype(int)

# Extract the remaining 50 columns as x_data
x_data = df.drop('target', axis=1).values


In [35]:
y_data

array([0, 0, 0, ..., 4, 4, 4])

In [49]:
for i in range(len(y_data)):
    if y_data[i] not in [0, 2, 4]:
        y_data[i] = 2

In [50]:
y_data.shape

(1593209,)

In [51]:
x_data.shape

(1593209, 50)

In [52]:
x_data

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.71169227, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [53]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, random_state=0)


In [54]:
x_train.shape

(1274567, 50)

In [55]:
y_train.shape

(1274567,)

In [56]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout

In [57]:
model = Sequential()
model.add(BatchNormalization())
model.add(Dense(256, activation='relu',input_shape = [50]))
model.add(Dropout(0.3))
model.add(BatchNormalization())
model.add(Dense(256,activation='relu'))
model.add(Dense(5, activation = 'softmax'))  

# Compile the model
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics=['accuracy'])



In [58]:
# Train the model
model.fit(x_train, y_train, batch_size=1000, epochs=10)

model.save('trained_model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [59]:
model = keras.models.load_model('trained_model.h5')

model.evaluate(x_val,y_val)



[0.6473031044006348, 0.6292830109596252]

In [60]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model.predict(x_val)



In [61]:
y_pred

array([[4.7987479e-01, 1.2874833e-07, 4.3951799e-03, 1.6359846e-07,
        5.1572967e-01],
       [5.1490748e-01, 1.0580252e-07, 4.0214318e-03, 1.5711383e-07,
        4.8107085e-01],
       [5.5279094e-01, 8.4532239e-08, 4.0918449e-03, 1.6516394e-07,
        4.4311696e-01],
       ...,
       [3.3980045e-01, 7.0389277e-08, 3.5985666e-03, 1.4605416e-07,
        6.5660077e-01],
       [5.0041711e-01, 4.5668354e-08, 2.6664031e-03, 6.6730671e-08,
        4.9691635e-01],
       [4.6689385e-01, 8.4905523e-08, 3.4474891e-03, 5.5547364e-08,
        5.2965856e-01]], dtype=float32)

In [62]:
y_val

array([4, 0, 4, ..., 4, 0, 4])

In [63]:
# Convert probabilities to predicted labels
y_pred_labels = np.argmax(y_pred, axis=1)

# Print the resulting array
y_pred_labels

array([4, 0, 0, ..., 4, 0, 4], dtype=int64)

In [64]:
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred_labels)

# Calculate precision
precision = precision_score(y_val, y_pred_labels, average='weighted')

# Calculate recall
recall = recall_score(y_val, y_pred_labels, average='weighted')

# Calculate F1-score
f1 = f1_score(y_val, y_pred_labels, average='weighted')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.6292830198153414
Precision: 0.6328407193670639
Recall: 0.6292830198153414
F1-score: 0.624059564340885


  _warn_prf(average, modifier, msg_start, len(result))


In [101]:
input_text = 'hand is itchy'

input_tokens = preprocess_text(input_text)

input_tokens


'hand itchi'

In [102]:
input_vector = vectorizer.transform([input_tokens]).toarray()

input_vector

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]])

In [103]:
output = np.argmax(model.predict([input_vector]), axis=1)

output



array([4], dtype=int64)