In [10]:
import pandas as pd
import sys
import pickle
from fasttext import load_model
import gensim
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
import collections
import numpy as np

In [11]:
# Load the pre-tokenized review dataset.
# Forward slashes keep this relative path valid on Windows as well as on
# Linux/macOS, where a backslash is an ordinary filename character and the
# original r"..\Dataset1\..." literal would not resolve to a directory.
data = pd.read_csv("../Dataset1/AFF_Reviews_tokenized.csv")

In [12]:
# Number of rows in the raw dataset.
data.shape[0]

145150

In [13]:
# Preview the first 20 rows of the raw dataset.
data.iloc[:20]

Unnamed: 0,Text,Score
0,long time fan original chip ahoy cooky like si...,3
1,really tasty chip lifetime say delectable one ...,5
2,vendor clearly attempt take advantage consumer...,1
3,order original beef jerky ounce bags pack get ...,1
4,try find drink replace soda drink daily like t...,3
5,best cooky ever eat would never know sugar fre...,4
6,far favorite brand soynut butter taste yummy i...,5
7,little disappointed particular offering usuall...,3
8,really much confidence commercial pet food ing...,3
9,saw available vine program compel order simply...,1


In [14]:
# Count reviews whose 'Text' is missing in the raw data.
text_is_missing = data['Text'].isnull()
nan_count = text_is_missing.sum()
print(f"Number of NaN values in 'Text' column: {nan_count}")

Number of NaN values in 'Text' column: 1


In [15]:
# Drop every row whose 'Text' is missing; `data` itself is left untouched.
df_cleaned = data.dropna(axis='index', subset=['Text'])

In [16]:
# Confirm that no missing 'Text' values remain after cleaning.
text_is_missing = df_cleaned['Text'].isnull()
nan_count = text_is_missing.sum()
print(f"Number of NaN values in 'Text' column: {nan_count}")

Number of NaN values in 'Text' column: 0


In [17]:
# Print how many reviews carry each score from 1 to 5, then the total row count.
for i in range(1, 6):
    has_score = df_cleaned['Score'].eq(i)
    print(has_score.sum())

print(df_cleaned.shape[0])

29030
29030
29030
29030
29029
145149


In [18]:
# Balance the classes: randomly discard rows from each score (1-5) until at
# most `maxRowsToKeep` reviews remain per score.
maxRowsToKeep = 29000
for i in range(1,6):
    rows_with_value = df_cleaned[df_cleaned['Score'] == i]
    # Clamp at 0 so a class that already has <= maxRowsToKeep rows is left
    # alone instead of passing a negative `n` to sample(), which raises.
    n_excess = max(len(rows_with_value) - maxRowsToKeep, 0)
    # Fixed seed (same value the train/test splits use) so the discarded rows,
    # and therefore the vocabulary and the splits built from the remainder,
    # are identical on every Restart & Run All.
    rows_to_remove = rows_with_value.sample(n=n_excess, random_state=42)
    df_cleaned = df_cleaned.drop(rows_to_remove.index)

In [19]:
# Re-check the per-score counts (scores 1-5) and the total after balancing.
for i in range(1, 6):
    matches_score = df_cleaned['Score'].eq(i)
    print(matches_score.sum())

print(df_cleaned.shape[0])

29000
29000
29000
29000
29000
145000


In [20]:
def find_unique_words(strings):
    """Return the set of distinct whitespace-separated words in `strings`.

    All strings are joined with a single space and the result is split on
    whitespace, so words are never merged across neighbouring strings.

    Args:
        strings: Iterable of str (anything `str.join` accepts).

    Returns:
        set[str]: every distinct word; empty when there are no words.
    """
    return set(' '.join(strings).split())

# Size of the vocabulary: distinct whitespace-separated words across all reviews.
unique_words = find_unique_words(df_cleaned.loc[:, "Text"])
print(len(unique_words))

53527


In [21]:
# Load the pre-trained word vectors (word2vec text format).
# Forward slashes keep the relative path valid on Windows and on Linux/macOS;
# the original backslash literal only resolves on Windows.
vectorizer = gensim.models.KeyedVectors.load_word2vec_format('../Vectorizer/FastText-300d-1M.vec')

In [22]:
# Build the Keras word -> index vocabulary from every review in the balanced
# dataset. This happens before the train/val/test split, so the vocabulary
# covers the words of all three splits.
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts=df_cleaned['Text'])

In [23]:
# Number of distinct words the tokenizer learned.
vocab_size = len(tokenizer.word_index)
vocab_size

53527

In [24]:
# Dimensionality of each word vector.
embedding_dim = 300

# One row per tokenizer index, plus one extra row to account for the padding token.
num_tokens = 1 + len(tokenizer.word_index)

# Start from all zeros (vocabulary x embedding dimension); words that the
# vectorizer does not know keep their zero row.
embedding_matrix = np.zeros(shape=(num_tokens, embedding_dim))

for word, i in tokenizer.word_index.items():
    if not vectorizer.has_index_for(word):
        continue
    # Assigning into the row copies the values into embedding_matrix.
    embedding_matrix[i] = vectorizer[word]

In [25]:
# Sanity check: show the first 50 embedding components stored for the word 'great'.
great_index = tokenizer.word_index['great']
print(embedding_matrix[great_index][:50])

[-0.0129 -0.0311  0.0133  0.0051 -0.0395 -0.0044 -0.0218 -0.0483  0.021
  0.0186 -0.0313  0.0012  0.0194 -0.0124  0.0116 -0.0149  0.0489  0.0029
  0.0437 -0.0069 -0.0129  0.0165 -0.0162  0.0322  0.0181 -0.01    0.0173
 -0.0312  0.0552 -0.0006 -0.0004 -0.0177  0.0048 -0.0616  0.0065 -0.0015
  0.0203 -0.0142 -0.0047  0.0054  0.0096  0.0071 -0.0081 -0.0085 -0.0088
  0.0129  0.0017 -0.0259  0.0174  0.0354]


In [26]:
# Persist the embedding matrix.
# Forward slashes keep the relative path valid on Windows and on Linux/macOS;
# the original backslash literal only resolves on Windows.
with open('../Dataset1/embeddingMatrixDS1.pkl', 'wb') as f:
    pickle.dump(embedding_matrix, f)

In [27]:
# Hold out 15% of the balanced reviews as the test set; the fixed seed makes
# the split repeatable for a given df_cleaned.
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned['Text'],
    np.array(df_cleaned['Score']),
    test_size=0.15,
    random_state=42,
)

In [28]:
# Carve 20% of the remaining training data off as the validation set.
# NOTE: X_train / y_train are rebound here, so re-running this cell on its own
# splits the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=42,
)

In [29]:
# Show the label distribution of the train, validation and test splits, in that order.
for split_labels in (y_train, y_val, y_test):
    print(collections.Counter(split_labels))

Counter({1: 19811, 3: 19773, 2: 19729, 4: 19667, 5: 19620})
Counter({4: 4996, 5: 4962, 2: 4929, 1: 4909, 3: 4854})
Counter({5: 4418, 3: 4373, 2: 4342, 4: 4337, 1: 4280})


In [30]:
# Convert each split's review texts into lists of integer token ids.
X_train_seq, X_val_seq, X_test_seq = map(
    tokenizer.texts_to_sequences,
    (X_train, X_val, X_test),
)

In [31]:
# Peek at the first validation review as a list of token ids.
first_val_seq = X_val_seq[0]
print(first_val_seq)

[115, 28, 169, 2, 3, 219, 1082, 148, 314, 5, 314, 119, 8, 662, 9, 589]


In [32]:
# Map the first five token ids of that review back to their words.
list(map(tokenizer.index_word.__getitem__, X_val_seq[0][:5]))

['k', 'cup', 'delicious', 'taste', 'good']

In [33]:
# Decode the first validation sequence back into text and show at most 300 characters.
decoded_first_val = tokenizer.sequences_to_texts(X_val_seq[:1])[0]
decoded_first_val[:300]

'k cup delicious taste good save alot money cream product cream right coffee plan buy soon'

In [34]:
def count_words(text):
    """Return how many whitespace-separated tokens `text` contains.

    `text` is converted with `str()` first, so non-string values are counted
    by their string representation (e.g. None -> "None" -> 1).
    """
    tokens = str(text).split()
    return len(tokens)

# Store each review's word count in a new 'Word_Count' column.
df_cleaned['Word_Count'] = df_cleaned['Text'].apply(count_words)

# Pick the row holding the largest word count.
longest_review_label = df_cleaned['Word_Count'].idxmax()
max_word_count_row = df_cleaned.loc[longest_review_label]

print("Row with the highest word count:")
print(f"Score: {max_word_count_row['Score']}")
print(f"Word Count: {max_word_count_row['Word_Count']}")
print(f"Text (first 100 characters): {max_word_count_row['Text'][:100]}...")

# Collect every row that reaches the maximum, so ties can be reported too.
max_word_count = df_cleaned['Word_Count'].max()
is_longest = df_cleaned['Word_Count'] == max_word_count
max_word_count_rows = df_cleaned[is_longest]

# Only list the rows individually when more than one review shares the maximum.
if max_word_count_rows.shape[0] > 1:
    print(f"\nThere are {len(max_word_count_rows)} rows with the maximum word count of {max_word_count}.")
    for index, row in max_word_count_rows.iterrows():
        print(f"\nScore: {row['Score']}")
        print(f"Word Count: {row['Word_Count']}")
        print(f"Text (first 100 characters): {row['Text'][:100]}...")

Row with the highest word count:
Score: 2
Word Count: 249
Text (first 100 characters): leave general mill undo nature make decision ingredient iexcl ingredient vanilla ingredient roll oat...


In [35]:
# Fixed sequence length for every split; the longest review found by the
# word-count scan has 249 words.
MAX_TOKENS_LEN = 250


def _pad_to_max_len(sequences):
    """Pad `sequences` at the end ('post') to MAX_TOKENS_LEN entries each."""
    return keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=MAX_TOKENS_LEN,
        padding='post',
    )


X_train_padded = _pad_to_max_len(X_train_seq)
X_val_padded = _pad_to_max_len(X_val_seq)
X_test_padded = _pad_to_max_len(X_test_seq)

In [36]:
# Persist the padded feature matrices and the label arrays, one pickle per split.
# Forward slashes keep the relative paths valid on Windows and on Linux/macOS
# (the original backslash literals only resolve on Windows), and one loop
# replaces six copy-pasted `with open(...)` blocks. File names are unchanged:
# X_train.pkl, X_val.pkl, X_test.pkl, y_train.pkl, y_val.pkl, y_test.pkl.
splits_dir = '../Dataset1/Dataset1Splits'
splits_to_save = {
    'X_train': X_train_padded,
    'X_val': X_val_padded,
    'X_test': X_test_padded,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}

for split_name, split_array in splits_to_save.items():
    with open(f'{splits_dir}/{split_name}.pkl', 'wb') as f:
        pickle.dump(split_array, f)