In [3]:
# This cell appears above the notebook's import cell, so pandas is imported here
# to keep a top-to-bottom "Restart & Run All" from failing with NameError on `pd`.
import pandas as pd

# Load the labeled news dataset; the path is relative to the working directory.
data_load = pd.read_csv('news_labeled_dataset_II.csv')

In [2]:
import re # regular expressions to handle with text patterns
import string # to help with cleaning punctuiations
import pandas as pd 

from sklearn.feature_extraction.text import TfidfVectorizer # convert text into numeric values
from sklearn.model_selection import train_test_split # training and test samples
from sklearn.neighbors import KNeighborsClassifier # the KNN from Scikit-Learn 
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score # to evaluate how accurate the predictions are 
from nltk.corpus import stopwords 


def read_data(file_name):
    """Read a comma-separated file into a pandas DataFrame.

    Parameters
    ----------
    file_name : str, path object or file-like object
        Anything accepted as the first argument of ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``file_name``.
    """
    # The comma separator is spelled out explicitly, as in the original code.
    return pd.read_csv(file_name, sep=',')

def clean_text(text):
    """Normalize a raw value into lower-case, punctuation-free text.

    Steps, in order:
      1. convert the value to ``str`` and lower-case it;
      2. delete every character listed in ``string.punctuation``;
      3. collapse each run of whitespace into one space and trim both ends.

    Parameters
    ----------
    text : any
        Value to clean. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(text).lower()
    # re.escape is required here: placed raw inside a character class, the
    # punctuation string contains "\]", which the regex engine reads as an
    # escaped "]", so a literal backslash was never removed.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # \s+ matches spaces, tabs and newlines alike.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_text_data(data, custom_stopwords=None):
    """Build a TF-IDF feature matrix and a label vector from the news DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'titulo'``, ``'texto_noticia'`` and
        ``'label'``. A ``'full_text'`` column (cleaned title + body) is
        added to, or overwritten on, this DataFrame as a side effect.
    custom_stopwords : iterable of str, str or None, optional
        Extra words to ignore in addition to the NLTK Portuguese stop words.
        They are lower-cased before use. ``None`` (default) adds nothing.

    Returns
    -------
    X : sparse matrix
        TF-IDF features (at most 1000 terms), one row per labeled article.
    Y : pandas.Series
        The ``'label'`` values of the labeled rows, keeping the original index.
    vectorizer : TfidfVectorizer
        The fitted vectorizer, needed to transform new text the same way.

    Notes
    -----
    Rows whose label is missing are excluded from ``X`` and ``Y``; filling them
    with ``''`` would add a spurious extra class of a different type.
    ``stopwords.words`` needs the NLTK stop-word corpus to be available locally.
    NOTE(review): the vectorizer is fitted on every labeled row, i.e. before any
    train/test split done by the caller, so vocabulary/IDF statistics also
    reflect rows later used for evaluation.
    """
    # Join title and body; a missing value in either becomes an empty string.
    combined = data['titulo'].fillna('') + ' ' + data['texto_noticia'].fillna('')
    data['full_text'] = combined.apply(clean_text)

    stop_words_pt = list(stopwords.words('portuguese'))
    if custom_stopwords is not None:
        # A bare string would otherwise be iterated character by character.
        if isinstance(custom_stopwords, str):
            custom_stopwords = [custom_stopwords]
        known = set(stop_words_pt)
        for word in custom_stopwords:
            word = str(word).lower()
            if word not in known:
                known.add(word)
                stop_words_pt.append(word)

    # Only rows that actually carry a label can be used for supervised training.
    labeled = data['label'].notna()

    vectorizer = TfidfVectorizer(stop_words=stop_words_pt, max_features=1000)
    X = vectorizer.fit_transform(data.loc[labeled, 'full_text'])
    Y = data.loc[labeled, 'label']
    return X, Y, vectorizer
    
def split_data(X, Y, train_size=0.8, random_state=42):
    """Split features and labels into stratified train and test partitions.

    Parameters
    ----------
    X : array-like or sparse matrix
        Feature rows.
    Y : array-like
        Labels aligned with ``X``; also used as the stratification target so
        both partitions keep the label proportions.
    train_size : float, optional
        Fraction of the rows placed in the training partition (default 0.8,
        i.e. an 80/20 split).
    random_state : int, optional
        Seed for the shuffle, so the split is reproducible (default 42).

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X,
        Y,
        train_size=train_size,
        shuffle=True,
        random_state=random_state,
        stratify=Y,
    )
    return x_train, x_test, y_train, y_test
    
def fit_model(x_train, y_train, k=5):
    """Fit a k-nearest-neighbours classifier and score it on its training data.

    Parameters
    ----------
    x_train : array-like or sparse matrix
        Training features.
    y_train : array-like
        Training labels aligned with ``x_train``.
    k : int, optional
        Number of neighbours passed to ``KNeighborsClassifier`` (default 5).

    Returns
    -------
    tuple
        ``(train_accuracy, knn)``: the accuracy of the fitted model on the very
        data it was trained on, and the fitted classifier itself.
    """
    neighbours_model = KNeighborsClassifier(n_neighbors=k)
    neighbours_model.fit(x_train, y_train)

    # Accuracy on the training set only shows how well the data was memorised;
    # it is not an estimate of performance on unseen articles.
    train_accuracy = accuracy_score(y_train, neighbours_model.predict(x_train))
    return train_accuracy, neighbours_model
   

In [7]:
import re # regular expressions to handle with text patterns
import string # to help with cleaning punctuiations
import pandas as pd 

from sklearn.feature_extraction.text import TfidfVectorizer # convert text into numeric values
from sklearn.model_selection import train_test_split # training and test samples
from sklearn.neighbors import KNeighborsClassifier # the KNN from Scikit-Learn 
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score # to evaluate how accurate the predictions are 
from nltk.corpus import stopwords 

from sklearn.linear_model import LogisticRegression


def read_data(file_name):
    """Load a comma-delimited file and return it as a pandas DataFrame.

    Parameters
    ----------
    file_name : str, path object or file-like object
        Source handed directly to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per record in the file.
    """
    # The comma separator is spelled out explicitly, as in the original code.
    return pd.read_csv(file_name, sep=',')

def clean_text(text):
    """Normalize a raw value into lower-case, punctuation-free text.

    Steps, in order:
      1. convert the value to ``str`` and lower-case it;
      2. delete every character listed in ``string.punctuation``;
      3. collapse each run of whitespace into one space and trim both ends.

    Parameters
    ----------
    text : any
        Value to clean. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(text).lower()
    # re.escape is required here: placed raw inside a character class, the
    # punctuation string contains "\]", which the regex engine reads as an
    # escaped "]", so a literal backslash was never removed.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # \s+ matches spaces, tabs and newlines alike.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_text_data(data, custom_stopwords=None):
    """Build a TF-IDF feature matrix and a label vector from the news DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'titulo'``, ``'texto_noticia'`` and
        ``'label'``. A ``'full_text'`` column (cleaned title + body) is
        added to, or overwritten on, this DataFrame as a side effect.
    custom_stopwords : iterable of str, str or None, optional
        Extra words to ignore in addition to the NLTK Portuguese stop words.
        They are lower-cased before use. ``None`` (default) adds nothing.

    Returns
    -------
    X : sparse matrix
        TF-IDF features (at most 1000 terms), one row per labeled article.
    Y : pandas.Series
        The ``'label'`` values of the labeled rows, keeping the original index.
    vectorizer : TfidfVectorizer
        The fitted vectorizer, needed to transform new text the same way.

    Notes
    -----
    Rows whose label is missing are excluded from ``X`` and ``Y``; filling them
    with ``''`` would add a spurious extra class of a different type.
    ``stopwords.words`` needs the NLTK stop-word corpus to be available locally.
    NOTE(review): the vectorizer is fitted on every labeled row, i.e. before any
    train/test split done by the caller, so vocabulary/IDF statistics also
    reflect rows later used for evaluation.
    """
    # Join title and body; a missing value in either becomes an empty string.
    combined = data['titulo'].fillna('') + ' ' + data['texto_noticia'].fillna('')
    data['full_text'] = combined.apply(clean_text)

    stop_words_pt = list(stopwords.words('portuguese'))
    if custom_stopwords is not None:
        # A bare string would otherwise be iterated character by character.
        if isinstance(custom_stopwords, str):
            custom_stopwords = [custom_stopwords]
        known = set(stop_words_pt)
        for word in custom_stopwords:
            word = str(word).lower()
            if word not in known:
                known.add(word)
                stop_words_pt.append(word)

    # Only rows that actually carry a label can be used for supervised training.
    labeled = data['label'].notna()

    vectorizer = TfidfVectorizer(stop_words=stop_words_pt, max_features=1000)
    X = vectorizer.fit_transform(data.loc[labeled, 'full_text'])
    Y = data.loc[labeled, 'label']
    return X, Y, vectorizer
    
def split_data(X, Y, train_size=0.8, random_state=42):
    """Split features and labels into stratified train and test partitions.

    Parameters
    ----------
    X : array-like or sparse matrix
        Feature rows.
    Y : array-like
        Labels aligned with ``X``; also used as the stratification target so
        both partitions keep the label proportions.
    train_size : float, optional
        Fraction of the rows placed in the training partition (default 0.8,
        i.e. an 80/20 split).
    random_state : int, optional
        Seed for the shuffle, so the split is reproducible (default 42).

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X,
        Y,
        train_size=train_size,
        shuffle=True,
        random_state=random_state,
        stratify=Y,
    )
    return x_train, x_test, y_train, y_test
    
def fit_model(x_train, y_train, classifier):
    """Train the given classifier and report its accuracy on the training set.

    Parameters
    ----------
    x_train : array-like or sparse matrix
        Training features.
    y_train : array-like
        Training labels aligned with ``x_train``.
    classifier : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.

    Returns
    -------
    tuple
        ``(train_accuracy, classifier)``: accuracy of the fitted estimator on
        the data it was trained on, and the same estimator object.
    """
    classifier.fit(x_train, y_train)

    # Scoring on the training data shows how well it was memorised, not how
    # the model generalises.
    train_accuracy = accuracy_score(y_train, classifier.predict(x_train))
    return train_accuracy, classifier
   

In [17]:
# Run the flow: load -> vectorize -> split -> fit -> evaluate on validation data.

# Read the labeled dataset (path is relative to the working directory).
data = pd.read_csv('news_labeled_dataset_II.csv')

# Convert the text into TF-IDF features; also adds a 'full_text' column to `data`.
# NOTE(review): the vectorizer is fitted inside preprocess_text_data on the whole
# dataset, i.e. before the splits below, so the validation/test rows influence
# the vocabulary and IDF weights.
X, Y, vectorizer = preprocess_text_data(data)

# First split: 80% train+validation / 20% test. x_test and y_test are not used
# anywhere in this cell.
x_trainval, x_test, y_trainval, y_test = split_data(X, Y)

# Second split of the train+validation part: 70% train / 30% validation
# (an additional split between training and validation), stratified by label.
# Relative to the full dataset this is roughly 56% train / 24% validation.
x_train, x_val, y_train, y_val = train_test_split(
    x_trainval,
    y_trainval,
    train_size = 0.7,
    random_state = 42,
    stratify = y_trainval
)


# Fit a logistic regression (default settings) on the training partition.
lr = LogisticRegression()

train_accuracy, lr_model = fit_model(x_train, y_train, lr)
print(f'Logistic Regression accuracy on train data: {train_accuracy}')

clas_preds = lr_model.predict(x_val) # predicted classes for the validation rows
val_accuracy = accuracy_score(y_val, clas_preds) # share of validation predictions that match the true labels
print(f'Validation accuracy: {val_accuracy}')



# Run this cell to execute the whole flow and print both accuracies.

Logistic Regression accuracy on train data: 1.0
Validation accuracy: 0.8611111111111112


In [None]:
#Find Hyperparameters