# Natural Language Processing Project

## Add dependencies


In [71]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk


## Load the dataset

In [72]:
data = pd.read_csv('./TRAINING_DATA.txt', sep='\t', header=None)
data.columns = ['label', 'text']
print(data.head())



   label                                               text
0      1  Cuando conocí a Janice en 2013 , una familia n...
1      0  Hwang habló en Sur de este año por Southwest M...
2      1  Usted podría pensar Katy Perry y Robert Pattin...
3      1  Cualquiera que haya volado los cielos del crea...
4      1  Bueno , este cantante tendrá un LARGO tiempo p...


## download NLTK data

In [73]:
# Ensure you have the required NLTK data 
nltk.download('punkt') 
nltk.download('wordnet') 
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaime\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jaime\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaime\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Start preprocessing

In [74]:
# Preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stop words
    stop_words = set(stopwords.words('spanish'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Display the cleaned text
print(data[['text', 'cleaned_text']].head())

                                                text  \
0  Cuando conocí a Janice en 2013 , una familia n...   
1  Hwang habló en Sur de este año por Southwest M...   
2  Usted podría pensar Katy Perry y Robert Pattin...   
3  Cualquiera que haya volado los cielos del crea...   
4  Bueno , este cantante tendrá un LARGO tiempo p...   

                                        cleaned_text  
0  conoc janice familia necesitaba puntos promedi...  
1  hwang habl sur ao southwest music and medium c...  
2  usted podra pensar katy perry robert pattinson...  
3  cualquiera volado cielos creador escuchado act...  
4  bueno cantante tendr largo tiempo sentir an m ...  


## Feature Extraction using TF-IDF

In [75]:
# create a TfidfVectorizer object
vectorizer = TfidfVectorizer(ngram_range=(1, 3))

# Veectorize the cleaned text
X = vectorizer.fit_transform(data['cleaned_text'])
y = data['label']

print(X.shape)


(14924, 271825)


In [76]:
# Split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Train the MultinomialMB Classifier

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Define parameter grid
param_grid = {'alpha': [0.1, 0.5, 1, 5, 10]}

# Initialize the Grid Search with cross-validation
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='f1')

# Fit Grid Search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Train and evaluate the model with best parameters
best_classifier = MultinomialNB(alpha=best_params['alpha'])
best_classifier.fit(X_train, y_train)
y_pred = best_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Best Parameters: {'alpha': 10}
Accuracy: 0.33
Precision: 0.30
Recall: 0.26
F1 Score: 0.28


## Train another model Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
classifier.fit(X_train, y_train)

# Make predictions and evaluate the classifier
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Accuracy: 0.32
Precision: 0.34
Recall: 0.40
F1 Score: 0.37


## Gradient Boosting

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the Gradient Boosting classifier
classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train the classifier
classifier.fit(X_train, y_train)

# Make predictions and evaluate the classifier
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Accuracy: 0.50
Precision: 0.50
Recall: 0.81
F1 Score: 0.61


## Gradient Boosting using Grid Search

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0]
}

# Initialize the Grid Search with cross-validation
grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5, scoring='f1')

# Fit Grid Search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Train and evaluate the model with best parameters
best_classifier = GradientBoostingClassifier(**best_params)
best_classifier.fit(X_train, y_train)
y_pred = best_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


KeyboardInterrupt: 

## XGBoost

In [37]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Assuming X contains your TF-IDF or other features and y contains your labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, data['label'], test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Train the classifier
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.46
Precision: 0.47
Recall: 0.61
F1 Score: 0.53


## Neural Network

In [77]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, data['label'], test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build an improved neural network
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],), kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.01, l2=0.02)))
model.add(BatchNormalization())
model.add(Dropout(0.6))
model.add(Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.01, l2=0.02)))
model.add(BatchNormalization())
model.add(Dropout(0.6))
model.add(Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.01, l2=0.02)))
model.add(BatchNormalization())
model.add(Dropout(0.6))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks for early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5)

# Train the model with callbacks
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_test_scaled, y_test), callbacks=[early_stopping, reduce_lr])

# Evaluate the model
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print(f'Accuracy: {accuracy:.2f}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m374/374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 206ms/step - accuracy: 0.5006 - loss: 182.1793 - val_accuracy: 0.5025 - val_loss: 170.0955 - learning_rate: 0.0010
Epoch 2/10
[1m374/374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 204ms/step - accuracy: 0.5072 - loss: 142.6664 - val_accuracy: 0.4935 - val_loss: 129.4114 - learning_rate: 0.0010
Epoch 3/10
[1m374/374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 207ms/step - accuracy: 0.4950 - loss: 110.2241 - val_accuracy: 0.4965 - val_loss: 71.6368 - learning_rate: 0.0010
Epoch 4/10
[1m374/374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 214ms/step - accuracy: 0.4894 - loss: 60.3074 - val_accuracy: 0.5022 - val_loss: 42.8375 - learning_rate: 0.0010
Epoch 5/10
[1m374/374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 217ms/step - accuracy: 0.5003 - loss: 39.1881 - val_accuracy: 0.4961 - val_loss: 36.5137 - learning_rate: 0.0010
Epoch 6/10
[1m374/374[0m [32m━━━━━━━━

## Save the Model

In [79]:


# Save the entire model to a HDF5 file
model.save('my_model2.h5')








## Transfer Learning

In [13]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name)

# Prepare the data
def encode_examples(ds, limit=-1):
    input_ids_list, token_type_ids_list, attention_mask_list, label_list = [], [], [], []

    for index, row in ds.iterrows():
        inputs = tokenizer.encode_plus(
            row['text'], max_length=128, padding='max_length', truncation=True, add_special_tokens=True)
        input_ids_list.append(inputs['input_ids'])
        token_type_ids_list.append(inputs['token_type_ids'])
        attention_mask_list.append(inputs['attention_mask'])
        label_list.append(row['label'])

    return tf.data.Dataset.from_tensor_slices(({
        'input_ids': tf.constant(input_ids_list),
        'token_type_ids': tf.constant(token_type_ids_list),
        'attention_mask': tf.constant(attention_mask_list)
    }, tf.constant(label_list)))

# Split the data
train_ds, val_ds = train_test_split(data, test_size=0.2, random_state=42)

# Encode the examples
train_dataset = encode_examples(train_ds).shuffle(100).batch(32).prefetch(tf.data.AUTOTUNE)
val_dataset = encode_examples(val_ds).batch(32).prefetch(tf.data.AUTOTUNE)

# Compile the model with explicit loss function and stronger regularization
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model
model.fit(train_dataset, epochs=10, validation_data=val_dataset)

# Evaluate the model
loss, accuracy = model.evaluate(val_dataset)
print(f'Accuracy: {accuracy:.2f}')


  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

## Loading the model

In [80]:
from tensorflow.keras.models import load_model 
# Load the saved model 
l_model = load_model('./my_model2.h5')



## Preprocess new data

In [81]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer



def load_txt_file(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            label, text = line.strip().split('\t', 1)
            data.append([int(label), text])
    return pd.DataFrame(data, columns=['label', 'text'])


# Read the new data from the text file
real_data = load_txt_file('REAL_DATA.txt')


real_data['cleaned_text'] = real_data['text'].apply(preprocess_text)

# Using the same vectorizer I used up top
test = vectorizer.fit_transform(real_data['cleaned_text'])

# Using the scaler I used up top
test_scaled = scaler.fit_transform(test)




## Prediction

In [82]:
from tensorflow.keras.models import load_model
import numpy as np



# Make predictions
predictions = l_model.predict(test_scaled)

# Convert probabilities to class labels (assuming binary classification with threshold 0.5)
predicted_labels = (predictions > 0.5).astype("int32")

# Add predictions to the original data
real_data['label'] = predicted_labels

# Save the predictions to a new file
real_data.to_csv('predictions_with_labels.txt', sep='\t', index=False, header=False) 
print("Predictions saved to 'predictions_with_labels.txt'")



ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_22" is incompatible with the layer: expected axis -1 of input shape to have value 271825, but received input with shape (32, 48165)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 48165), dtype=float32)
  • training=False
  • mask=None