Extract Features

In [1]:
import nltk
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import textstat
from collections import Counter
from nltk.probability import FreqDist

# Fetch the NLTK data packages used by this notebook, in a fixed order.
for resource_name in (
    'vader_lexicon',
    'maxent_ne_chunker',
    'words',
    'punkt',
    'averaged_perceptron_tagger',
    'stopwords',
):
    nltk.download(resource_name)

# Per-kernel cache for resources that do not depend on the input text.
_FEATURE_RESOURCES = {}


def _get_feature_resources():
    """Return the shared (sentiment analyzer, English stop-word set) pair, built on first use."""
    if not _FEATURE_RESOURCES:
        # Build both before storing so a failure never leaves a half-filled cache.
        analyzer = SentimentIntensityAnalyzer()
        english_stop_words = frozenset(stopwords.words('english'))
        _FEATURE_RESOURCES.update(sia=analyzer, stop_words=english_stop_words)
    return _FEATURE_RESOURCES['sia'], _FEATURE_RESOURCES['stop_words']


def extract_features(text):
    """Turn a raw text string into a fixed-length list of 24 numeric features.

    The sentiment analyzer and the stop-word set are created once and reused
    across calls, because rebuilding them for every document is pure overhead
    when the function is mapped over a whole dataset.

    Args:
        text: The document to analyse, as a single string.

    Returns:
        A list whose positions are, in order:
            0-1    TextBlob polarity and subjectivity
            2      NLTK sentiment 'compound' score
            3-7    number of word tokens, sentences, unique word tokens,
                   characters, and non-stop-word tokens
            8      lexical diversity (unique word tokens / word tokens, 0 if no tokens)
            9-11   NLTK sentiment 'pos', 'neu', 'neg' scores
            12-14  counts of tags starting with 'NN', 'VB', 'JJ'
            15-16  textstat Flesch reading ease and SMOG index
            17     average sentence length (word tokens / sentences, 0 if no sentences)
            18     textstat Linsear Write formula
            19-23  counts of the 5 most frequent non-stop-word tokens,
                   zero-padded when fewer than 5 distinct tokens exist
    """
    top_n = 5  # number of most-common-token counts appended to the vector

    # Basic NLP processing
    blob = TextBlob(text)
    sia, stop_words = _get_feature_resources()
    sentiment = sia.polarity_scores(text)

    # Tokenization
    word_tokens = word_tokenize(text)
    sentence_tokens = sent_tokenize(text)
    non_stop_words = [word for word in word_tokens if word.lower() not in stop_words]

    # Basic text features
    num_words = len(word_tokens)
    num_sentences = len(sentence_tokens)
    num_unique_words = len(set(word_tokens))
    num_chars = len(text)
    num_non_stop_words = len(non_stop_words)

    # Lexical diversity (guarded against empty input)
    lexical_diversity = num_unique_words / num_words if num_words > 0 else 0

    # Readability scores
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    smog_index = textstat.smog_index(text)

    # Part-of-speech counts, grouped by tag prefix
    pos_tags = blob.tags
    noun_count = sum(1 for _, tag in pos_tags if tag.startswith('NN'))
    verb_count = sum(1 for _, tag in pos_tags if tag.startswith('VB'))
    adj_count = sum(1 for _, tag in pos_tags if tag.startswith('JJ'))

    # Syntactic complexity
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0
    sentence_complexity = textstat.linsear_write_formula(text)

    # Counts of the most frequent non-stop-word tokens, padded with zeros so the
    # feature vector always has the same length.
    top_counts = [count for _, count in FreqDist(non_stop_words).most_common(top_n)]
    common_word_counts = top_counts + [0] * (top_n - len(top_counts))

    return [
        blob.sentiment.polarity,
        blob.sentiment.subjectivity,
        sentiment['compound'],
        num_words,
        num_sentences,
        num_unique_words,
        num_chars,
        num_non_stop_words,
        lexical_diversity,
        sentiment['pos'],
        sentiment['neu'],
        sentiment['neg'],
        noun_count,
        verb_count,
        adj_count,
        flesch_reading_ease,
        smog_index,
        avg_sentence_length,
        sentence_complexity
    ] + common_word_counts

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\26936\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\26936\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\26936\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\26936\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\26936\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\26936\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-

Extract Features 2

In [None]:
# May fail if the installed torch version is incompatible; in that case, use the simpler 'extract_features' function instead.
import spacy
from transformers import pipeline
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.util import ngrams
from textblob import TextBlob
import textstat
from collections import Counter
from nltk.probability import FreqDist

# Load spaCy's English NLP model
nlp = spacy.load('en_core_web_sm')

# Setup transformer pipeline for sentiment analysis.
# The checkpoint is named explicitly (it is the documented default for this
# task) so results do not silently change if the library's default changes.
# Might fail on some torch versions!
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

def extract_combined_features(text):
    """Build a fixed-length, fully numeric feature list from spaCy, NLTK, TextBlob,
    textstat and a transformer sentiment pipeline.

    Args:
        text: The document to analyse, as a single string.

    Returns:
        A list of 46 numbers, in order:
            0-1    TextBlob polarity and subjectivity
            2      NLTK sentiment 'compound' score
            3-7    NLTK word tokens, sentences, unique word tokens,
                   characters, non-stop-word tokens
            8      unique non-stop-word tokens / word tokens (0 if no tokens)
            9-11   NLTK sentiment 'pos', 'neu', 'neg' scores
            12-15  textstat Flesch reading ease, SMOG, Coleman-Liau,
                   automated readability index
            16     sentences / word tokens (0 if no tokens)
            17-23  spaCy token, sentence, noun, verb, adjective, entity
                   and root-verb counts
            24     1.0 if the transformer label is 'POSITIVE', else 0.0
            25     transformer confidence score for that label
            26-28  NLTK named-entity chunk counts for PERSON, ORGANIZATION, LOCATION
            29-45  spaCy entity counts for the 17 listed entity types
    """
    # spaCy processing
    doc = nlp(text)

    # Transformer sentiment analysis. truncation=True keeps inputs longer than
    # the model's maximum sequence length from raising inside the pipeline.
    transformer_sentiment = sentiment_analysis(text, truncation=True)[0]

    # TextBlob processing
    blob = TextBlob(text)

    # NLTK processing
    word_tokens = word_tokenize(text)
    sentence_tokens = sent_tokenize(text)
    pos_tags = pos_tag(word_tokens)
    named_entities = ne_chunk(pos_tags)
    stop_words = set(stopwords.words('english'))
    non_stop_words = [word for word in word_tokens if word.lower() not in stop_words]

    # Entity chunks are the labelled subtrees of the chunk tree; plain
    # (word, tag) leaves are tuples and carry no label.
    nltk_entity_counts = Counter(
        node.label() for node in named_entities if hasattr(node, 'label')
    )

    # NLTK Sentiment Analysis
    sia = SentimentIntensityAnalyzer()
    nltk_sentiment = sia.polarity_scores(text)

    # TextBlob Sentiment
    tb_polarity = blob.sentiment.polarity
    tb_subjectivity = blob.sentiment.subjectivity

    # Basic spaCy metrics
    num_tokens = len(doc)
    num_sentences = len(list(doc.sents))
    num_nouns = sum(1 for token in doc if token.pos_ == 'NOUN')
    num_verbs = sum(1 for token in doc if token.pos_ == 'VERB')
    num_adjectives = sum(1 for token in doc if token.pos_ == 'ADJ')
    num_entities = len(doc.ents)
    entity_types = Counter(ent.label_ for ent in doc.ents)

    # Dependency parsing metrics
    root_verbs = sum(1 for token in doc if token.dep_ == 'ROOT' and token.pos_ == 'VERB')

    # Transformer sentiment scores. The label is encoded as a number so the
    # whole vector stays numeric; a string here would turn np.array(features)
    # into a string array that estimators and scalers cannot consume.
    transformer_sentiment_label = 1.0 if transformer_sentiment['label'] == 'POSITIVE' else 0.0
    transformer_sentiment_score = transformer_sentiment['score']

    # Readability Scores
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    smog_index = textstat.smog_index(text)
    coleman_liau_index = textstat.coleman_liau_index(text)
    automated_readability_index = textstat.automated_readability_index(text)

    # Collecting features
    features = [
        tb_polarity,
        tb_subjectivity,
        nltk_sentiment['compound'],
        len(word_tokens),
        len(sentence_tokens),
        len(set(word_tokens)),
        len(text),
        len(non_stop_words),
        len(set(non_stop_words)) / len(word_tokens) if word_tokens else 0,
        nltk_sentiment['pos'],
        nltk_sentiment['neu'],
        nltk_sentiment['neg'],
        flesch_reading_ease,
        smog_index,
        coleman_liau_index,
        automated_readability_index,
        # NOTE(review): this is sentences per word, the inverse of the
        # avg_sentence_length used in extract_features - confirm it is intended.
        len(sentence_tokens) / len(word_tokens) if word_tokens else 0,
        num_tokens,
        num_sentences,
        num_nouns,
        num_verbs,
        num_adjectives,
        num_entities,
        root_verbs,
        transformer_sentiment_label,
        transformer_sentiment_score
    ]

    # Adding NLTK named-entity counts
    for entity_label in ['PERSON', 'ORGANIZATION', 'LOCATION']:
        features.append(nltk_entity_counts.get(entity_label, 0))

    # Adding spaCy entity type counts
    for entity_type in ['PERSON', 'NORP', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']:
        features.append(entity_types.get(entity_type, 0))

    return features

RFE

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Input, concatenate, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Load data. Forward slashes are used so the relative paths resolve on Windows
# as well as on POSIX systems.
train_data = pd.read_csv('../raw_data/raw_data/fulltrain.csv', header=None, names=['label', 'text'])
test_data = pd.read_csv('../raw_data/raw_data/balancedtest.csv', header=None, names=['label', 'text'])

X_texts = train_data['text'].values
y = train_data['label'].values

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
n_label_classes = len(label_encoder.classes_)
y_categorical = to_categorical(y_encoded, num_classes=n_label_classes)

# Set a reasonable max length for padding
max_length = 1000  # Adjust based on your dataset distribution

# Hold out a validation set before anything is fitted on the texts
X_train_texts, X_val_texts, y_train, y_val = train_test_split(
    X_texts, y, test_size=0.2, random_state=42
)

# Tokenize text: the vocabulary is learned from the training split only, so the
# validation texts do not influence which words are kept.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train_texts)

# Encode labels for both training and validation sets
y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# num_classes is passed explicitly so both one-hot matrices have the same width
# even if the highest-numbered class is missing from one of the splits.
y_train_categorical = to_categorical(y_train_encoded, num_classes=n_label_classes)
y_val_categorical = to_categorical(y_val_encoded, num_classes=n_label_classes)

# Process both training and validation text data
X_train_seq = tokenizer.texts_to_sequences(X_train_texts)
X_val_seq = tokenizer.texts_to_sequences(X_val_texts)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, truncating='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_length, truncating='post')

# Simulate LIWC-like features
# Might fail if the installed torch version is incompatible; in that case,
# call the simpler 'extract_features' here instead.
additional_features_train = np.array([extract_combined_features(text) for text in X_train_texts])
additional_features_val = np.array([extract_combined_features(text) for text in X_val_texts])

# Feature Selection with RFE
selector = RFE(RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=5, step=1)


LSTM (Single model)

In [None]:
selector.fit(additional_features_train, y_train_encoded)  # Fit RFE on the training set

# Keep only the features RFE selected
additional_features_train_selected = selector.transform(additional_features_train)
additional_features_val_selected = selector.transform(additional_features_val)

# Scale to [0, 1]; the scaler is fitted on the training split only
scaler = MinMaxScaler()
additional_features_train_scaled = scaler.fit_transform(additional_features_train_selected)
additional_features_val_scaled = scaler.transform(additional_features_val_selected)

# Derive layer sizes from the objects that define them instead of repeating
# literals, so changing the selector or the tokenizer cannot desynchronise the model.
n_additional_features = additional_features_train_scaled.shape[1]
vocab_size = tokenizer.num_words
num_classes = len(label_encoder.classes_)

# Define model architecture with Functional API to handle multiple inputs
text_input = Input(shape=(max_length,), dtype='int32', name='text_input')
additional_input = Input(shape=(n_additional_features,), name='additional_input')

# Text branch using LSTM. The sequence length is already fixed by text_input,
# so the Embedding layer does not need the deprecated input_length argument.
embedded_text = Embedding(input_dim=vocab_size, output_dim=100)(text_input)
lstm_text = LSTM(64)(embedded_text)  # Using LSTM to process text

# Combine branches
combined = concatenate([lstm_text, additional_input])

# Output layer: one softmax unit per encoded label
predictions = Dense(units=num_classes, activation='softmax')(combined)

# Create model
model = Model(inputs=[text_input, additional_input], outputs=predictions)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Class weights: every class starts at 1.0, then the second class is up-weighted
class_weights = {i: 1.0 for i in range(num_classes)}
class_weights[1] = 2.2  # Set the weight for the second class

# Train with the class weights applied
model.fit(
    [X_train_pad, additional_features_train_scaled], y_train_categorical,
    epochs=2,
    validation_data=([X_val_pad, additional_features_val_scaled], y_val_categorical),
    batch_size=32,
    class_weight=class_weights
)

# Model summary to see the architecture
model.summary()