In [1]:
!pip install "tensorflow-text==2.19.*"



## **Importar dataset**

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_text as tf_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import collections

In [3]:
from google.colab import drive

drive.mount("/content/gdrive")
!pwd  # show current path

Mounted at /content/gdrive
/content


In [4]:
# Switch the working directory to the project folder on the mounted Drive,
# then list its contents to confirm the expected files are there.
%cd "/content/gdrive/MyDrive/ESCUELA/IRS/7MO/IA-2/Modulo-2.2/EVIDENCIA"
!ls  # show current directory

/content/gdrive/MyDrive/ESCUELA/IRS/7MO/IA-2/Modulo-2.2/EVIDENCIA
ENT.ipynb      modelo_final.keras	   PRED
ent_xd.ipynb   PortafolioImp-ETL.ipynb	   prepared_datasets
ETL.ipynb      PortafolioImp-Modelo.ipynb  prueba_xd
fake_news.csv  PortafolioImp-Pred.ipynb


In [5]:
import pandas as pd

# Absolute location of the raw CSV on the mounted Drive.
CSV_PATH = "/content/gdrive/MyDrive/ESCUELA/IRS/7MO/IA-2/Modulo-2.2/EVIDENCIA/fake_news.csv"

# Load the raw dataset into a DataFrame.
df = pd.read_csv(CSV_PATH)

## **Transformaciones de los datos**

Vemos cómo está estructurado el dataset

In [6]:
# Preview the first 10 rows to see which columns the raw CSV provides.
df.head(10)

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,2619,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,"July 22, 2017",1.0
1,16043,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,"Jun 19, 2017",0.0
2,876,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,"November 2, 2017",1.0
3,19963,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,"Sep 17, 2016",0.0
4,10783,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,"May 26, 2017",0.0
5,18522,EU's Tusk appealed to Rajoy to avoid escalatio...,European Council President Donald Tusk appeale...,worldnews,"October 2, 2017",1.0
6,270,Country Guitarist Who Survived Vegas Shooting ...,"Caleb Keeter, a lifelong proponent of the Seco...",News,"October 2, 2017",0.0
7,7628,Clinton says 'there is no case here' in FBI em...,Democratic presidential candidate Hillary Clin...,politicsNews,"October 31, 2016",1.0
8,9599,ABC NEWS REPORTS: Las Vegas Massacre Suspect’s...,The investigation into the Las Vegas massacre ...,politics,"Oct 25, 2017",0.0
9,11234,BONKERS BERNIE SANDERS: Prioritizing Jobs Over...,https://www.youtube.com/watch?v=GPqQIlWksbgVer...,politics,"Apr 1, 2017",0.0


Vemos qué tipo de datos tenemos y si hay valores nulos

In [7]:
# Print the per-column dtype and non-null count summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30016 entries, 0 to 30015
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  30016 non-null  object 
 1   title       30016 non-null  object 
 2   text        30016 non-null  object 
 3   subject     30000 non-null  object 
 4   date        29984 non-null  object 
 5   label       29984 non-null  float64
dtypes: float64(1), object(5)
memory usage: 1.4+ MB


Contamos cuántos valores nulos tiene cada atributo

In [8]:
# Per-column count of missing values.
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
subject,16
date,32
label,32


Quitamos valores nulos de los datos

In [9]:
# Drop the rows that contain missing values; the result gets its own name
# so this cell does not overwrite the frame it reads from.
df_clean = df.dropna()

Creamos un nuevo dataframe con los atributos que vamos a necesitar

In [10]:
# Keep only the model input (`text`) and the target (`label`).
# Note: this rebinds `df`, so earlier cells that display `df` will show the
# reduced frame if they are re-run afterwards.
needed_columns = ['text', 'label']
df = df_clean[needed_columns].dropna()

Vemos cómo quedó la estructura de este nuevo dataframe

In [11]:
# Display the reduced frame (only the `text` and `label` columns were selected).
df

Unnamed: 0,text,label
0,Former CIA director John Brennan on Friday cri...,1.0
1,How did this man come to OWN this store? There...,0.0
2,President Donald Trump on Thursday tapped Fede...,1.0
3,Hillary Clinton ally David Brock is offering t...,0.0
4,Pleading ignorance is a perfect ploy for Nancy...,0.0
...,...,...
30011,The chief executive of the U.S. Aerospace Indu...,1.0
30012,The following are highlights of the maiden pol...,1.0
30013,If there s one thing President Barack Obama is...,0.0
30014,The Syrian army and its allies have taken full...,1.0


## **División del dataframe**

In [12]:
# Shuffle all rows with a fixed seed, then renumber the index from 0.
shuffled_rows = df.sample(frac=1, random_state=42)
df_shuffle = shuffled_rows.reset_index(drop=True)

Definimos que la división sea train (70%), validation (15%) y test (15%); nos aseguramos de mezclar (shuffle) las observaciones y de que los 3 datasets estén balanceados (estratificados por etiqueta).

In [13]:
# First cut: 70% train, 30% held out. Stratify on the label so the class
# proportions are preserved.
df_train, df_temp = train_test_split(
    df_shuffle,
    test_size=0.3,
    random_state=42,
    stratify=df_shuffle["label"],
)

# Second cut: halve the held-out part into validation and test, again stratified.
df_val, df_test = train_test_split(
    df_temp,
    test_size=0.5,
    random_state=42,
    stratify=df_temp["label"],
)

Revisamos las dimensiones finales de cada uno

In [14]:
# (rows, columns) of the train, validation and test frames, in that order.
tuple(split.shape for split in (df_train, df_val, df_test))

((20988, 2), (4498, 2), (4498, 2))

## **Vectorización de los datos**

En este caso la vectorización la vamos a hacer con `'int'` para enumerar los tokens

In [15]:
from tensorflow.keras.layers import TextVectorization

MAX_SEQUENCE_LENGTH = 250  # passed to the layer as output_sequence_length
VOCAB_SIZE = 20000  # passed to the layer as max_tokens

# Vectorization layer configured to emit integer token ids ('int' mode).
int_vectorize_layer = TextVectorization(
    output_mode="int",
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)

Separamos el texto y las etiquetas de cada dataset

In [16]:
def _texts_and_labels(frame):
  """Return the `text` column (cast to str) and the `label` column of `frame` as arrays."""
  return frame["text"].astype(str).values, frame["label"].values


texts, labels = _texts_and_labels(df_train)
texts_test, labels_test = _texts_and_labels(df_test)
texts_val, labels_val = _texts_and_labels(df_val)

Ahora convertimos los datasets a tensorflow.data.Dataset y los agrupamos en batches

In [17]:
BATCH_SIZE = 32


def _batched_dataset(features, targets):
  """Wrap (features, targets) in a tf.data.Dataset grouped into BATCH_SIZE batches."""
  return tf.data.Dataset.from_tensor_slices((features, targets)).batch(BATCH_SIZE)


train_ds = _batched_dataset(texts, labels)
test_ds = _batched_dataset(texts_test, labels_test)
val_ds = _batched_dataset(texts_val, labels_val)

Para construir el vocabulario usamos únicamente el dataset de train, de modo que el modelo solo entrene con ese vocabulario y no se filtre información de validation ni de test

In [18]:
# Keep only the text component of each (text, label) batch, then let the
# vectorization layer learn its vocabulary from the training texts.
train_text = train_ds.map(lambda text_batch, _label_batch: text_batch)
int_vectorize_layer.adapt(train_text)

A partir de la tokenización de train, tokenizamos también validation y test

In [19]:
def int_vectorize_text(text, label):
  """Vectorize `text` with `int_vectorize_layer` and pass `label` through.

  Args:
    text: string tensor; a trailing axis is appended before it is handed to
      the vectorization layer.
    label: returned unchanged.

  Returns:
    A `(vectorized_text, label)` tuple.
  """
  text_column = tf.expand_dims(text, -1)
  token_ids = int_vectorize_layer(text_column)
  return token_ids, label

In [20]:
# Apply the vectorization to the three splits.
# Note: the names are rebound in place, so re-running this cell would map the
# already-vectorized datasets again; re-run the dataset-creation cell first.
train_ds, val_ds, test_ds = (
    split_ds.map(int_vectorize_text)
    for split_ds in (train_ds, val_ds, test_ds)
)

Con esto mantenemos los datos en caché en memoria y se prepara el siguiente lote mientras el modelo entrena con el actual

In [21]:
AUTOTUNE = tf.data.AUTOTUNE


def configure_dataset(dataset):
  """Return `dataset` with caching enabled followed by AUTOTUNE-sized prefetching."""
  cached = dataset.cache()
  return cached.prefetch(buffer_size=AUTOTUNE)

In [22]:
# Apply the cache + prefetch configuration to each vectorized split.
int_train_ds, int_val_ds, int_test_ds = (
    configure_dataset(split_ds) for split_ds in (train_ds, val_ds, test_ds)
)

Sacamos el vocabulario que obtuvimos del dataset de train

In [23]:
# Vocabulary held by the vectorization layer after it was adapted on the training texts.
vocab = int_vectorize_layer.get_vocabulary()

## **Exportación de datasets y vocabulario**

In [24]:
import os
import tensorflow as tf

# Output folder on the mounted Drive; created if it does not exist yet.
base_dir = "/content/gdrive/MyDrive/ESCUELA/IRS/7MO/IA-2/Modulo-2.2/EVIDENCIA/prepared_datasets"
os.makedirs(base_dir, exist_ok=True)

# One sub-directory per split.
train_dir = os.path.join(base_dir, "train_dataset")
val_dir = os.path.join(base_dir, "val_dataset")
test_dir = os.path.join(base_dir, "test_dataset")

# Persist each prepared split to its own directory.
# NOTE(review): these datasets were wrapped with cache()/prefetch(); presumably
# only their elements are written, so the loading notebook may need to
# re-apply that configuration. Confirm.
for prepared_ds, target_dir in (
    (int_train_ds, train_dir),
    (int_val_ds, val_dir),
    (int_test_ds, test_dir),
):
    prepared_ds.save(target_dir)

print(f" Datasets guardados correctamente en:\n {base_dir}")

 Datasets guardados correctamente en:
 /content/gdrive/MyDrive/ESCUELA/IRS/7MO/IA-2/Modulo-2.2/EVIDENCIA/prepared_datasets


In [25]:
# Write the vocabulary as UTF-8 text, one term per line (no trailing newline).
# NOTE(review): get_vocabulary() presumably includes the padding token ('') and
# the OOV token first, so the file would start with an empty line. Confirm that
# the notebook loading vocab.txt expects those entries.
vocab_path = os.path.join(base_dir, "vocab.txt")
vocab_text = "\n".join(vocab)
with open(vocab_path, "w", encoding="utf-8") as vocab_file:
    vocab_file.write(vocab_text)
print("Vocabulario guardado")

Vocabulario guardado
