# Natural Language Processing Project

## Add dependencies


In [40]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk


## Load the dataset

In [41]:
data = pd.read_csv('./TRAINING_DATA.txt', sep='\t', header=None)
data.columns = ['label', 'text']
print(data.head())



   label                                               text
0      1  Cuando conocí a Janice en 2013 , una familia n...
1      0  Hwang habló en Sur de este año por Southwest M...
2      1  Usted podría pensar Katy Perry y Robert Pattin...
3      1  Cualquiera que haya volado los cielos del crea...
4      1  Bueno , este cantante tendrá un LARGO tiempo p...


## download NLTK data

In [42]:
# Ensure you have the required NLTK data 
nltk.download('punkt') 
nltk.download('wordnet') 
nltk.download('stopwords')

import stanza # Download the Spanish model 
stanza.download('es')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaime\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jaime\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaime\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 7.18MB/s]                    
INFO:stanza:Downloaded file to C:\Users\jaime\stanza_resources\resources.json
INFO:stanza:Downloading default packages for language: es (Spanish) ...
Downloading https://huggingface.co/stanfordnlp/stanza-es/resolve/v1.10.0/models/default.zip: 100%|██████████| 642M/642M [03:15<00:00, 3.29MB/s] 
INFO:stanza:Downloaded file to C:\Users\jaime\stanza_resources\es\default.zip
INFO:stanza:

## Start preprocessing

In [43]:
import re
import stanza


# Initialize the Spanish pipeline once
nlp = stanza.Pipeline('es')

# Preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stop words
    stop_words = set(stopwords.words('spanish'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Lemmatization using stanza
    def lemmatize_text(text):
        doc = nlp(text)
        return ' '.join([word.lemma for sent in doc.sentences for word in sent.words])

    text = lemmatize_text(text)
    return text

# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Display the cleaned text
print(data[['text', 'cleaned_text']].head())


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 8.12MB/s]                    
INFO:stanza:Downloaded file to C:\Users\jaime\stanza_resources\resources.json
INFO:stanza:Loading these models for language: es (Spanish):
| Processor    | Package           |
------------------------------------
| tokenize     | combined          |
| mwt          | combined          |
| pos          | combined_charlm   |
| lemma        | combined_nocharlm |
| constituency | combined_charlm   |
| depparse     | combined_charlm   |
| sentiment    | tass2020_charlm   |
| ner          | conll02           |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:s

KeyboardInterrupt: 

## Feature Extraction using TF-IDF

In [None]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming vectorizer is already fitted
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X = vectorizer.fit_transform(data['cleaned_text'])  # Example fit_transform on your training data

# Save the vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

y = data['label']

print(X.shape)


In [None]:
# Split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(y_train.value_counts())

## Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, data['label'], test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build an improved neural network
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],), kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.01, l2=0.02)))
model.add(BatchNormalization())
model.add(Dropout(0.6))
model.add(Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.01, l2=0.02)))
model.add(BatchNormalization())
model.add(Dropout(0.6))
model.add(Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.01, l2=0.02)))
model.add(BatchNormalization())
model.add(Dropout(0.6))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks for early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5)

# Train the model with callbacks
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_test_scaled, y_test), callbacks=[early_stopping, reduce_lr])

# Evaluate the model
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print(f'Accuracy: {accuracy:.2f}')


## Save the Model

In [None]:


# Save the entire model to a HDF5 file
model.save('my_model3.h5')






## Loading the model

In [None]:
from tensorflow.keras.models import load_model 
# Load the saved model 
l_model = load_model('./my_model3.h5')

## Preprocess new data

In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer



def load_txt_file(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            label, text = line.strip().split('\t', 1)
            data.append([int(label), text])
    return pd.DataFrame(data, columns=['label', 'text'])


# Read the new data from the text file
real_data = load_txt_file('REAL_DATA.txt')


real_data['cleaned_text'] = real_data['text'].apply(preprocess_text)

# Load the saved vectorizer 
with open('vectorizer.pkl', 'rb') as f: 
    vectorizer = pickle.load(f) 
# Transform the new data using the loaded vectorizer 
test = vectorizer.transform(real_data['cleaned_text'])

# Using the scaler I used up top
test_scaled = scaler.fit_transform(test)




## Prediction

In [None]:
from tensorflow.keras.models import load_model
import numpy as np



# Make predictions
predictions = l_model.predict(test_scaled)

# Convert probabilities to class labels (assuming binary classification with threshold 0.5)
predicted_labels = (predictions > 0.5).astype("int32")

# Add predictions to the original data
real_data['label'] = predictions

# Save the predictions to a new file
real_data.to_csv('predictions_with_labels_test.txt', sep='\t', index=False, header=False) 
print("Predictions saved to 'predictions_with_labels.txt'")
real_data['label'].value_counts()



## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the Gradient Boosting classifier
classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train the classifier
classifier.fit(X_train, y_train)

# Make predictions and evaluate the classifier
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


In [28]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer



def load_txt_file(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            label, text = line.strip().split('\t', 1)
            data.append([int(label), text])
    return pd.DataFrame(data, columns=['label', 'text'])


# Read the new data from the text file
real_data = load_txt_file('REAL_DATA.txt')


real_data['cleaned_text'] = real_data['text'].apply(preprocess_text)

# Load the saved vectorizer 
with open('vectorizer.pkl', 'rb') as f: 
    vectorizer = pickle.load(f) 
# Transform the new data using the loaded vectorizer 
test = vectorizer.transform(real_data['cleaned_text'])






In [None]:
from tensorflow.keras.models import load_model
import numpy as np



# Make predictions
new_predictions = classifier.predict(test)




# Add predictions to the original data
real_data['label'] = new_predictions
# Save the predictions to a new file
real_data.to_csv('REAL_DATA_WITH_PREDICTIONS.txt', sep='\t', index=False, header=False) 
print("Predictions saved to 'predictions_with_labels.txt'")
real_data['label'].value_counts()