In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from textstat.textstat import textstatistics
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
import os
import pickle

In [2]:
# Load the news dataset from the project's data directory (path is relative
# to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="../data/cleaned_news.csv")

In [3]:
def extract_features(text):
    """Compute hand-crafted linguistic and readability features for one document.

    Parameters
    ----------
    text : str
        Text of a single document.

    Returns
    -------
    pandas.Series
        One entry per feature, keyed by feature name, in a fixed order.

    Notes
    -----
    * ``add_info``, ``LIWC_pronouns``, ``TPP`` and ``Positive_Words`` are
      hard-coded to 0 (placeholders).
    * ``CLI`` / ``Coleman_Liau_Index`` carry the same value, as do
      ``Determiners`` / ``DT``.
    * ``Number_of_Words_per_Sentence`` is 0.0 when no sentence is detected,
      instead of the NaN (plus RuntimeWarning) that ``np.mean([])`` yields.
    """
    tokens = word_tokenize(text)
    sentences = sent_tokenize(text)
    # Histogram of part-of-speech tags over all tokens.
    pos_counts = nltk.FreqDist(tag for _, tag in pos_tag(tokens))
    readability = textstatistics()

    # Lower-case once so both keyword counts share the same pass.
    lowered = [token.lower() for token in tokens]

    # Values that feed more than one feature are computed a single time.
    personal_pronouns = pos_counts.get('PRP', 0)
    determiners = pos_counts.get('DT', 0)
    coleman_liau = readability.coleman_liau_index(text)

    if sentences:
        words_per_sentence = np.mean(
            [len(word_tokenize(sentence)) for sentence in sentences]
        )
    else:
        # No sentence found: avoid taking the mean of an empty list.
        words_per_sentence = 0.0

    features = {
        'Pronouns': personal_pronouns + pos_counts.get('PRP$', 0),
        'TO': pos_counts.get('TO', 0),
        'Key_conectors': sum(1 for word in lowered if word in {'and', 'but', 'or', 'so'}),
        'Flesch_Kincaid_Grade_Level': readability.flesch_kincaid_grade(text),
        'Flesch_Reading_Ease': readability.flesch_reading_ease(text),
        'CLI': coleman_liau,
        'add_info': 0,
        'Linsear_write_formula': readability.linsear_write_formula(text),
        'Determiners': determiners,
        'ARI': readability.automated_readability_index(text),
        'Number_of_Words': len(tokens),
        'LIWC_pronouns': 0,
        'Negations': sum(1 for word in lowered if word in {'not', 'no', 'never', 'none'}),
        'NNP': pos_counts.get('NNP', 0),
        'TPP': 0,
        'PRP': personal_pronouns,
        'Positive_Words': 0,
        'Coleman_Liau_Index': coleman_liau,
        'DT': determiners,
        'RB': pos_counts.get('RB', 0),
        'Number_of_Words_per_Sentence': words_per_sentence,
        'CC': pos_counts.get('CC', 0),
        'Number_of_Types': len(set(tokens))
    }
    return pd.Series(features)

In [4]:
# Unigram bag-of-words counts over at most 1000 vocabulary terms, with the
# built-in English stop-word list removed.
text_transformer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    max_features=1000,
)


In [5]:
def _extract_features_batch(texts):
    """Run ``extract_features`` on every document in ``texts``.

    ``texts`` is flattened with ``np.ravel`` before iterating.
    Returns a DataFrame with one row per document and one column per feature.
    """
    return pd.DataFrame([extract_features(text) for text in np.ravel(texts)])


# FunctionTransformer hands its whole input to the wrapped function in a single
# call, whereas extract_features handles one string, so wrap it in a
# per-document loop.
other_transformer = FunctionTransformer(_extract_features_batch)

In [6]:
# Both steps read the 'cleaned_text' column: the first turns it into word
# counts, the second into hand-crafted features. The second step previously
# used None as its column selector, which is not one of ColumnTransformer's
# documented selector forms. Any remaining columns are passed through.
transformer = ColumnTransformer(transformers=[
    ('text', text_transformer, 'cleaned_text'),
    ('other', other_transformer, 'cleaned_text')
], remainder='passthrough')

In [7]:
# extract_features returns a pd.Series, so Series.apply already expands the
# results into a DataFrame with one column per feature; a second
# .apply(pd.Series) pass would only rebuild the same frame.
features_df = df['cleaned_text'].apply(extract_features)

In [8]:
# Text column as an array, to be fed to the count vectoriser.
X_count = df.loc[:, 'cleaned_text'].values
# Hand-crafted feature table as a plain 2-D array (column names dropped).
X_custom_features = features_df.values

In [9]:
# Fit a bag-of-words vocabulary (at most 1000 terms, English stop words
# removed) on the text and produce the document-term count matrix in one step.
count_vectorizer = CountVectorizer(
    max_features=1000,
    stop_words='english',
)
X_count_vectorized = count_vectorizer.fit_transform(X_count)


In [10]:
# Place the word-count columns (converted with .toarray()) side by side with
# the hand-crafted feature columns: one row per document.
X_combined = np.hstack([
    X_count_vectorized.toarray(),
    X_custom_features,
])

In [11]:
# Sanity check: dimensions of the combined feature matrix.
print(tuple(X_combined.shape))

(44898, 1023)


In [12]:
# Persist the fitted count vectoriser so it can be reloaded elsewhere.
classifier_location = "../src/fnClassification/Models/"
# Create the target directory if it does not exist yet.
os.makedirs(classifier_location, exist_ok=True)
model_filename = os.path.join(classifier_location, 'cvec_1.pkl')

with open(model_filename, mode='wb') as pickle_handle:
    pickle.dump(count_vectorizer, pickle_handle)

print(f"CVEC model saved to: {model_filename}")

CVEC model saved to: ../src/fnClassification/Models/cvec_1.pkl
