<a href="https://colab.research.google.com/github/NxrFesdac/bourbaki-nlp-avanzado/blob/main/modulo1/TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Source dataset: https://huggingface.co/datasets/nyu-mll/glue/tree/main/cola
# The parquet file is read from the Colab runtime's local sample_data folder.
df = pd.read_parquet('/content/sample_data/train-00000-of-00001.parquet')

# Quick sanity check: preview the first rows, then list the available columns.
print("First 5 rows of the DataFrame:", df.head(), sep="\n")
print("\nColumn names of the DataFrame:", df.columns, sep="\n")

First 5 rows of the DataFrame:
                                            sentence  label  idx
0  Our friends won't buy this analysis, let alone...      1    0
1  One more pseudo generalization and I'm giving up.      1    1
2   One more pseudo generalization or I'm giving up.      1    2
3     The more we study verbs, the crazier they get.      1    3
4          Day by day the facts are getting murkier.      1    4

Column names of the DataFrame:
Index(['sentence', 'label', 'idx'], dtype='object')


In [None]:
import nltk

# Resources needed by the preprocessing cell:
#   - stopwords          -> nltk.corpus.stopwords
#   - wordnet, omw-1.4   -> WordNetLemmatizer
#   - punkt, punkt_tab   -> nltk.tokenize.word_tokenize (Punkt tokenizer models;
#                           recent NLTK releases look up punkt_tab, older ones punkt)
# Without the Punkt data, word_tokenize raises LookupError on a fresh kernel.
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)

print("NLTK stopwords and wordnet downloaded.")

NLTK stopwords and wordnet downloaded.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Names of the DataFrame columns used as target and as input text
target_column = 'label'
text_column = 'sentence'

# Shared NLTK helpers used by preprocess_text: a lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalize a sentence into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, replace every character that is not an ASCII
    letter or whitespace with a space, tokenize with ``word_tokenize``, drop
    tokens present in ``stop_words``, and lemmatize the remaining tokens with
    ``lemmatizer``.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when no
        token survives the filtering).
    """
    text = text.lower()
    # Substitute a space instead of deleting the character: deletion fuses
    # contractions and hyphenated words ("won't" -> "wont", "i'm" -> "im"), and
    # the fused tokens no longer match the stop-word list. With a space the
    # pieces ("won", "t", "i", "m") are ordinary tokens that the stop-word
    # filter below can handle. The text is already lowercase, so a-z suffices.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    # Filter stop words and lemmatize the remaining tokens in a single pass.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean every sentence and store the result next to the raw text
df['preprocessed_sentence'] = df[text_column].apply(preprocess_text)

# Features are the cleaned sentences; the target is the label column
X, y = df['preprocessed_sentence'], df[target_column]

# Hold out 20% of the rows for testing; stratify on the label and fix the seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(
    f"Training data shape: {X_train.shape}",
    f"Testing data shape: {X_test.shape}",
    sep="\n",
)

# TF-IDF vectorizer with default settings (no max_features cap is applied)
tfidf_vectorizer = TfidfVectorizer()

# Fit on the training split only, then apply the fitted vectorizer to the test split
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(
    f"TF-IDF transformed training data shape: {X_train_tfidf.shape}",
    f"TF-IDF transformed testing data shape: {X_test_tfidf.shape}",
    sep="\n",
)


Training data shape: (6840,)
Testing data shape: (1711,)
TF-IDF transformed training data shape: (6840, 4424)
TF-IDF transformed testing data shape: (1711, 4424)


In [None]:
from sklearn.linear_model import LogisticRegression

# Build the logistic regression classifier (L2 penalty is the default) and
# train it on the TF-IDF training matrix; fit() returns the estimator itself.
model = LogisticRegression(random_state=42).fit(X_train_tfidf, y_train)

print("Logistic Regression model trained successfully.")

Logistic Regression model trained successfully.


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict labels for the held-out TF-IDF matrix
y_pred = model.predict(X_test_tfidf)

# Overall accuracy, rounded to two decimals for display
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision/recall/F1, followed by the confusion matrix
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.70
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.06      0.10       506
           1       0.71      0.97      0.82      1205

    accuracy                           0.70      1711
   macro avg       0.56      0.51      0.46      1711
weighted avg       0.62      0.70      0.61      1711


Confusion Matrix:
[[  28  478]
 [  40 1165]]
