<a href="https://colab.research.google.com/github/Poorya0071/NLP_TensorFlow/blob/main/NLP_oversampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import string
import tensorflow as tf
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

In [3]:
# Load the reviews file (one JSON record per line).
raw_data = pd.read_json('Musical_Instruments_5.json', lines=True)

# Build the model input from the review body plus its summary.
# A space separator is required: joining with "" glued the last word of the
# review to the first word of the summary, creating a spurious token.
# Missing fields are treated as empty text so the concatenation never yields NaN.
X = raw_data['reviewText'].fillna('') + " " + raw_data['summary'].fillna('')

# Collapse the 1-5 rating into two classes: 1-3 -> 0, 4-5 -> 1.
raw_data['overall'] = raw_data['overall'].map({1: 0, 2: 0, 3: 0, 4: 1, 5: 1})
print(raw_data['overall'].value_counts())

1    9022
0    1239
Name: overall, dtype: int64


In [4]:
# Store the combined text next to the labels, then shuffle the rows once with a fixed seed.
# NOTE: `X` is rebound at the end of this cell, so re-running this cell without
# re-running the loading cell would assign already-shuffled text back onto raw_data.
raw_data["text"] = X
data = raw_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Features and targets taken from the shuffled frame.
X, y = data['text'], data['overall']
print(X)

0        I've been using these on my acoustic guitars (...
1        Sounds like a great concept and they seem well...
2        I recently ordered a wide variety of picks to ...
3        I have two of these stands, the electric guita...
4        This guitar sounds awesome and stays in tune v...
                               ...                        
10256    A year ago, I wrote a lengthy comparison of th...
10257    Okay well I lied in subject line, bad singers ...
10258    This mic is strong, reliable and produces the ...
10259    I'm an Irish-style DADGAD guitarist.  These pi...
10260    This is a great little amp especially for the ...
Name: text, Length: 10261, dtype: object


In [5]:
# Hold out 20% of the samples as the validation set; the fixed seed keeps the split reproducible.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Random oversampler: 'not majority' resamples every class except the majority one,
# and random_state=0 makes the drawn duplicates reproducible.
ros = RandomOverSampler(sampling_strategy='not majority', random_state=0)

In [7]:

# The sampler is fed a 2-D feature array, so the text Series is reshaped
# into a single column before resampling the training split only.
X_res, y_res = ros.fit_resample(
    train_sentences.to_numpy().reshape(-1, 1),
    train_labels,
)
print(y_res.value_counts())

1    7213
0    7213
Name: overall, dtype: int64


In [10]:

# Empty frame that later cells fill with the oversampled `text` and `target` columns.
train_df = pd.DataFrame()



In [15]:
# Preview the oversampled texts: squeeze() removes the single-column axis added before resampling.
pd.Series(X_res.squeeze())

0        I like this product. The screen is big and eas...
1        I wanted a guitar I could travel with, somethi...
2        i got pink cause nobody would steal it, and we...
3        This clamp on pair of miniature lamps does an ...
4        This string set offers typical D'Addario quali...
                               ...                        
14421    I bought this capo thinking it would look slic...
14422    Before I purchased the FBV I scoured the web t...
14423    These have the correct thickness and texture t...
14424    I just got these in and I have to say, very di...
14425    The strap locks do work... But they are kind o...
Length: 14426, dtype: object

In [16]:
# Store the oversampled texts (flattened with squeeze()) as the `text` column.
train_df['text'] = X_res.squeeze()

In [17]:
# Attach the resampled labels as the `target` column.
train_df['target'] = y_res

In [18]:
# Quick look at the first rows of the assembled training frame.
train_df.head()

Unnamed: 0,text,target
0,I like this product. The screen is big and eas...,1
1,"I wanted a guitar I could travel with, somethi...",1
2,"i got pink cause nobody would steal it, and we...",1
3,This clamp on pair of miniature lamps does an ...,1
4,This string set offers typical D'Addario quali...,1


In [19]:
# Display the validation texts; this split is not passed through the oversampler.
val_sentences

2507    Like it better than the other tuner I bought, ...
5159    I actually wasn't aware of this gauge of strin...
932     I haven't used this yet, but it seems solid an...
1190    Shipped in time. Just what I have expected. lo...
2619    Works just fine. Except the little knob that's...
                              ...                        
400     I picked up a couple of these straps, one for ...
2956    I purchased several of thes Fender Mini stands...
3614    In our acoustic guitar band, I often play intr...
3501    I needed a cheap mic stand that didn't break t...
6671    Simple. Effective.  Just what I wanted.  Seems...
Name: text, Length: 2053, dtype: object

In [20]:
# Text preprocessing / embedding hyper-parameters.
vocab_size = 40000
embedding_dim = 128
max_length = 94
trunc_type = 'post'
pad_type = 'post'
oov_tok = "<OOV>"

# Fit the vocabulary on the (oversampled) training text only, so no
# validation vocabulary leaks into the tokenizer.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_df['text'])
word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(train_df['text'])
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                truncating=trunc_type, padding=pad_type)

# Validation must be padded/truncated exactly like training. Without these
# arguments pad_sequences falls back to 'pre' for both, so long validation
# reviews kept their last max_length tokens while training reviews kept
# their first max_length tokens.
validation_sequences = tokenizer.texts_to_sequences(val_sentences)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length,
                                  truncating=trunc_type, padding=pad_type)

training_labels_final = np.array(train_df['target'])
validation_labels_final = np.array(val_labels)

# Binary classifier: embed the tokens, average over the sequence axis,
# then a single sigmoid unit.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# Train on the oversampled data and evaluate on the untouched validation split after every epoch.
num_epochs = 20
history = model.fit(
    training_padded,
    training_labels_final,
    epochs=num_epochs,
    validation_data=(validation_padded, validation_labels_final),
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 94, 128)           5120000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 5,120,129
Trainable params: 5,120,129
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import string
import tensorflow as tf
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler


# Load the multi-class text dataset and inspect its class balance.
data_math = pd.read_csv('raw_text.csv')
print(data_math.head())
print(data_math['label'].value_counts())
X, y = data_math['text'], data_math['label']

# Hold out 20% of the samples as the validation set; the fixed seed keeps the split reproducible.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(train_labels.value_counts())

from imblearn.over_sampling import SMOTE,RandomOverSampler
# Target number of training samples per class after oversampling.
# NOTE(review): these counts are hard-coded for the current split; re-check them
# against train_labels.value_counts() whenever the data or the split changes.
sampling_strategy = {
    'Linear Algebra': 126,
    'Probability': 104,
    'CS': 100,
    'Diff. Eq.': 100,
    'Algorithms': 100,
    'Statistics': 100,
    'Calculus': 100,
    'Data Structures': 100,
    'AI': 100,
    'Math for Eng.': 100,
    'NLP': 100,
}

# Seed the sampler: without random_state the duplicated rows differ on every run,
# so the training set (and every metric downstream) was not reproducible.
oversample = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=0)

# The sampler is fed a 2-D feature array, hence the single-column reshape of the text.
X_train, y_train = oversample.fit_resample(
    train_sentences.to_numpy().reshape(-1, 1),
    train_labels,
)

print(y_train.value_counts())

# Map the string class labels to integer ids; the encoder is fitted on the
# oversampled training labels and then reused for the validation labels.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(y_train.to_numpy())
val_labels_encoded = label_encoder.transform(val_labels.to_numpy())
print(train_labels_encoded)

# Class names in encoder order, and how many there are (sizes the output layer).
class_names = label_encoder.classes_
num_classes = len(class_names)
print(class_names)
print(num_classes)

# Flatten the single-column resampled feature array back into a Series of texts.
train_text = pd.Series(X_train.squeeze())

# Text preprocessing / embedding hyper-parameters.
vocab_size = 40000
embedding_dim = 128
max_length = 94
trunc_type = 'post'
pad_type = 'post'
oov_tok = "<OOV>"

# Fit the vocabulary on the (oversampled) training text only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_text)
word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(train_text)
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                truncating=trunc_type, padding=pad_type)

# Validation must be padded/truncated exactly like training. Without these
# arguments pad_sequences falls back to 'pre' for both, so long validation
# texts kept their last max_length tokens while training texts kept their
# first max_length tokens.
validation_sequences = tokenizer.texts_to_sequences(val_sentences)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length,
                                  truncating=trunc_type, padding=pad_type)

# Multi-class classifier: embed the tokens, average over the sequence axis,
# then a softmax over the encoded classes. Labels are integer ids, hence the
# sparse categorical cross-entropy loss.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(num_classes, activation='softmax'),
    ]
)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# Train on the oversampled data and evaluate on the untouched validation split after every epoch.
num_epochs = 20
history = model.fit(
    training_padded,
    train_labels_encoded,
    epochs=num_epochs,
    validation_data=(validation_padded, val_labels_encoded),
)