In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
import tensorflow as tf 
import re 
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import seaborn as sns 
from sklearn.preprocessing import MinMaxScaler
plt.style.use('ggplot')
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from keras.models import Model
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding, LSTM, Bidirectional, Dropout, Concatenate
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
from keras.callbacks import EarlyStopping

In [2]:
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import pickle

In [3]:
# Load the dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv("truthseeker_sentiments.csv")

In [4]:
# Target labels for the classifier (the recorded output of the next cell shows
# float64 values of 1.0 / 0.0).
y = df['BinaryNumTarget']

In [5]:
# Remove the label column from df in place. pop() returns the removed Series,
# which is what the cell displays. Re-running this cell raises KeyError because
# the column no longer exists.
df.pop('BinaryNumTarget')

0         1.0
1         1.0
2         1.0
3         1.0
4         1.0
         ... 
134178    0.0
134179    0.0
134180    0.0
134181    0.0
134182    0.0
Name: BinaryNumTarget, Length: 134183, dtype: float64

In [6]:
# Model inputs: the raw text plus 14 emotion/sentiment feature columns.
# .copy() makes X independent of df, so the later assignment to X['text']
# does not write back into df.
X = df[['text', 'anger', 'disgust', 'fear', 'joy', 'neutral',
       'sadness', 'surprise', 'positive_word_count', 'negative_word_count',
       'vader_scores', 'textblob_scores', 'flair_scores', 'affin_score',
       'pattern_score']].copy()

In [7]:
# word_tokenize (used by clean_text) needs the Punkt tokenizer data in addition
# to the stopword list; fetch it as well so the notebook runs on a fresh
# environment. quiet=True keeps the recorded output of this cell unchanged.
# NOTE(review): newer NLTK releases name this resource 'punkt_tab' - confirm
# against the installed version.
nltk.download('punkt', quiet=True)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Clean the text column
def clean_text(text):
    """Normalise one raw document into a space-separated string of content words.

    Steps: lowercase, replace every non-letter character with a space,
    tokenize with NLTK's ``word_tokenize``, drop English stopwords, and join
    the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and NaN (how pandas represents a missing cell) are
        treated as empty text; any other non-string value is converted with
        ``str()`` first.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing survives the filtering.
    """
    # Missing cells have no text to clean; without this guard .lower() raises
    # AttributeError on a float NaN and aborts the whole .apply().
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenize the text
    tokens = word_tokenize(text)

    # The stopword list is identical for every call, so build the set once and
    # keep it on the function instead of re-reading it for each document.
    stop_words = getattr(clean_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        clean_text._stop_words = stop_words

    # Remove stopwords and join the tokens back into a single string
    return ' '.join(token for token in tokens if token not in stop_words)

In [9]:
# Apply the clean_text function to every row of the 'text' column, overwriting
# the raw text in X (df itself is untouched because X is a copy).
X['text'] = X['text'].apply(clean_text)

In [10]:
# Keras text tokenizer with default settings.
# NOTE(review): the notebook-level name `tokenizer` is rebound further down to a
# Hugging Face tokenizer, so this object is only reachable by this name until then.
tokenizer = Tokenizer()

In [11]:
# Build the word index from the cleaned text.
# NOTE(review): this is fitted on all rows, before the train/test split below,
# so the vocabulary also contains words that only occur in the test rows.
tokenizer.fit_on_texts(X['text'])

In [12]:
# Convert each cleaned document into a list of integer word ids.
X_sequences = tokenizer.texts_to_sequences(X['text'])

In [13]:
# Length (in tokens) of the longest document; used as the padded width below.
max_sequence_length = max(map(len, X_sequences))

In [14]:
# Pad every id sequence to max_sequence_length so the result is a rectangular
# 2-D array (pad_sequences pads at the front with zeros by default).
X_padded = tf.keras.preprocessing.sequence.pad_sequences(X_sequences, maxlen = max_sequence_length)

In [15]:
# The 14 non-text feature columns, in the order the model's second input
# receives them.
X_additional = X[['anger', 'disgust', 'fear', 'joy', 'neutral',
       'sadness', 'surprise', 'positive_word_count', 'negative_word_count',
       'vader_scores', 'textblob_scores', 'flair_scores', 'affin_score',
       'pattern_score']]

In [16]:
# Convert additional columns to a numpy array. This rebinds X_additional from a
# DataFrame to an ndarray so it can be concatenated with X_padded.
X_additional = np.array(X_additional)

In [17]:
# Combine text data and additional columns side by side so one split keeps the
# rows aligned. Layout: the first max_sequence_length columns are the padded
# word ids, the remaining columns are the additional features; the training and
# prediction cells slice the matrix apart again at that boundary.
X_combined = np.concatenate((X_padded, X_additional), axis=1)

In [18]:
# Hold out a quarter of the rows as the test set; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.25,
    random_state=42,
)

In [19]:
# Define the model inputs.
# The model is built with the Keras functional API because it takes two inputs:
#   input_text       - the tokenized, padded word-id sequence (one id per position)
#   input_additional - the additional feature columns (one value per column)

input_text = Input(shape=(max_sequence_length,))
input_additional = Input(shape=(X_additional.shape[1],))

In [20]:
# Create the Embedding layer and Bi-LSTM layer.
# The embedding maps each word id to a 100-dimensional vector; its vocabulary
# size is the tokenizer's word count + 1. The bidirectional LSTM uses 64 units
# per direction, with an L2 penalty of 0.01 on both the input kernel and the
# recurrent kernel.
# NOTE(review): `input_length` is reported as deprecated by recent Keras
# releases - confirm against the installed version.
embedding = Embedding(len(tokenizer.word_index) + 1, 100, input_length=max_sequence_length)(input_text)
lstm = Bidirectional(LSTM(64, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01)))(embedding)

# Concatenate LSTM output with the additional columns input so the classifier
# sees the text encoding and the additional features together
concatenated = Concatenate()([lstm, input_additional])

# Apply Dropout layer (rate 0.5) to the concatenated output
dropout = Dropout(0.5)(concatenated)

# Create the output layer: a single sigmoid unit for binary classification
output = Dense(1, activation='sigmoid')(dropout)

# Create the model with both input layers and the output layer; callers must
# pass the inputs in this order: [text ids, additional features]
model = Model(inputs=[input_text, input_additional], outputs=output)

In [21]:
# Compile the model: binary cross-entropy matches the single sigmoid output,
# Adam is used with its default settings, and accuracy is reported each epoch.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [22]:
# Define early stopping callback: stop once val_loss has not improved for 3
# consecutive epochs, and roll the weights back to the best epoch seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
# Train the model with early stopping.
# Early stopping and restore_best_weights pick the final weights by val_loss,
# so the validation data must not be the test set that the metrics below are
# computed on; otherwise those metrics are optimistically biased. Instead, hold
# out the last 10% of the training rows (already shuffled by train_test_split)
# for validation and keep X_test / y_test untouched until evaluation.
# np.asarray makes the labels a plain array that Keras can slice for the split.
model.fit([X_train[:, :max_sequence_length], X_train[:, max_sequence_length:]], np.asarray(y_train),
          validation_split=0.1,
          epochs=10, batch_size=64, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

In [None]:
# Predicted probabilities for the test rows; X_test is split back into its
# text-id columns and its additional-feature columns, in the model's input order.
y_pred = model.predict([X_test[:, :max_sequence_length], X_test[:, max_sequence_length:]])

In [None]:
# Turn probabilities into hard 0/1 labels: strictly above 0.5 counts as class 1.
y_pred_labels = np.greater(y_pred, 0.5).astype(int)

In [None]:
# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_labels)

In [None]:
# Of the rows predicted as class 1, the share that truly are class 1.
precision = precision_score(y_true=y_test, y_pred=y_pred_labels)

In [None]:
# Of the rows that truly are class 1, the share the model found.
recall = recall_score(y_true=y_test, y_pred=y_pred_labels)

In [None]:
# Harmonic mean of precision and recall for class 1.
f1 = f1_score(y_true=y_test, y_pred=y_pred_labels)

In [None]:
# Report the four test-set metrics computed in the cells above.
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence
from afinn import Afinn
from pattern.en import sentiment
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from pattern.en import sentiment

In [None]:
def calculate_sentiment_features_text(text, emotion=None):
    """Build a one-row DataFrame of emotion and sentiment features for one text.

    The column names match the feature columns the classifier in this notebook
    is trained on.

    Parameters
    ----------
    text : str
        The document to score.
    emotion : str, optional
        Emotion label for the text (one of "anger", "disgust", "fear", "joy",
        "neutral", "sadness", "surprise"), used to fill the one-hot emotion
        columns. When omitted, a notebook-level variable named ``emotion`` is
        used if it exists; if no label is available, all seven emotion columns
        are 0.

    Returns
    -------
    pandas.DataFrame
        A single row with the seven one-hot emotion columns, the
        positive/negative word counts, and the VADER, TextBlob, flair, Afinn
        and Pattern scores.
    """
    if emotion is None:
        # The previous body read a notebook-level `emotion` (and raised
        # NameError when it was missing); keep honouring it when present.
        emotion = globals().get('emotion')

    # Initialize the VADER SentimentIntensityAnalyzer
    sid = SentimentIntensityAnalyzer()

    # Loading the flair model is expensive, so load it once and keep it on the
    # function for later calls.
    flair_classifier = getattr(calculate_sentiment_features_text, '_flair_classifier', None)
    if flair_classifier is None:
        flair_classifier = TextClassifier.load('en-sentiment')
        calculate_sentiment_features_text._flair_classifier = flair_classifier

    afinn = Afinn()

    # Score every whitespace-separated word once, then count how many scores
    # are positive and how many are negative (words scoring exactly 0 count
    # for neither).
    word_polarities = [sid.polarity_scores(word)['compound'] for word in text.split()]
    positive_count = sum(1 for polarity in word_polarities if polarity > 0)
    negative_count = sum(1 for polarity in word_polarities if polarity < 0)

    # Whole-text scores using VADER and TextBlob
    sentiment_score_vader = sid.polarity_scores(text)['compound']
    sentiment_score_textblob = TextBlob(text).sentiment.polarity

    # Flair method: predict() annotates the sentence in place; the score of the
    # first label is used.
    # NOTE(review): this is the score of whichever label was predicted - confirm
    # it matches how 'flair_scores' was computed for the training data.
    flair_sentence = Sentence(text)
    flair_classifier.predict(flair_sentence)
    flair_score = flair_sentence.labels[0].score

    # Afinn
    affin_score = afinn.score(text)

    # Pattern: sentiment() returns a pair; the first element is used.
    pattern_score = sentiment(text)[0]

    # One-hot encode the emotion, then append the numeric features. Insertion
    # order defines the DataFrame column order.
    emotion_labels = ("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise")
    data = {label: [1 if emotion == label else 0] for label in emotion_labels}
    data.update({
        'positive_word_count': [positive_count],
        'negative_word_count': [negative_count],
        'vader_scores': [sentiment_score_vader],
        'textblob_scores': [sentiment_score_textblob],
        'flair_scores': [flair_score],
        'affin_score': [affin_score],
        'pattern_score': [pattern_score],
    })

    return pd.DataFrame(data)

In [None]:
def classify_emotions_text(text, batch_size=8, tokenizer=None, model=None):
    """Predict one emotion label per input text, processing the texts in batches.

    Parameters
    ----------
    text : str or sequence of str
        The text(s) to classify. A single string is treated as one document.
    batch_size : int, default 8
        Number of texts sent through the model per forward pass.
    tokenizer : callable, optional
        Called as ``tokenizer(batch, padding=True, truncation=True,
        return_tensors="pt")`` and must return a mapping of model inputs.
        Defaults to the notebook-level ``tokenizer``.
    model : callable, optional
        Called as ``model(**inputs)`` and must return an object with a
        ``logits`` tensor of shape (batch, classes). Defaults to the
        notebook-level ``model``.

    Returns
    -------
    list of str
        One emotion name per input text, in input order.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    # globals() is needed because the parameters shadow those names here.
    if tokenizer is None:
        tokenizer = globals()['tokenizer']
    if model is None:
        model = globals()['model']

    # A bare string would otherwise be sliced character-wise into bogus
    # "batches"; treat it as a single document instead.
    statements = [text] if isinstance(text, str) else list(text)

    # Class index -> emotion name. Loop-invariant, so it is built once.
    # Assumes the model's class indices follow this order - TODO confirm
    # against the model's own label mapping.
    label_to_emotion = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}

    predicted_emotions = []
    for start in range(0, len(statements), batch_size):
        batch_statements = statements[start:start + batch_size]
        inputs = tokenizer(batch_statements, padding=True, truncation=True, return_tensors="pt")

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)

        # The highest-scoring class per row is the predicted emotion.
        predicted_labels = torch.argmax(outputs.logits, dim=1)
        predicted_emotions.extend(label_to_emotion[label.item()] for label in predicted_labels)

    return predicted_emotions

In [None]:
# Load the pretrained Hugging Face emotion classifier and its tokenizer; these
# are the objects classify_emotions_text picks up by name.
# NOTE(review): these assignments rebind the notebook-level names `tokenizer`
# and `model`, which earlier cells bound to the Keras Tokenizer and the trained
# Keras model. After this cell those Keras objects are no longer reachable by
# these names, and any later cell using `model` gets the emotion classifier.
model_name = "j-hartmann/emotion-english-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
text_input = "NEW YORK (Reuters) - A federal appeals court in Virginia on Thursday rejected a bid by President Donald Trumpâ€™s administration to prevent the U.S. military from accepting transgender recruits starting Jan. 1. The administration had urged the appeals court to suspend an order by a federal judge in Baltimore for the armed forces to begin accepting transgender recruits on that date. The administration has said the Jan. 1 start date was causing the armed forces to scramble to revise their policies at the risk of harming military readiness.  In a brief two-paragraph order, the three-judge panel of the Richmond-based 4th U.S. Circuit Court of Appeals said it was denying the administrationâ€™s request while the appeal proceeds. All three judges are Democratic appointees.  The courtâ€™s action could prompt the administration to ask the conservative-majority U.S. Supreme Court to intervene. â€œWe disagree with the courtâ€™s ruling and are currently evaluating the next steps,â€ U.S. Justice Department spokeswoman Lauren Ehrsam said in a statement. Several transgender service members, backed by the American Civil Liberties Union, filed suit in Maryland after Trump said in July he would ban transgender people from the military, a move that would reverse a policy of the Republican presidentâ€™s Democratic predecessor Barack Obama to accept them. Trump cited concern over military focus and medical costs. So far, three federal judges around the country have issued injunctions blocking Trumpâ€™s ban. His administration has appealed all three rulings.  Joshua Block, an ACLU attorney who represents the plaintiffs in the Maryland case, said he was happy the appeals court saw through the governmentâ€™s â€œsmokescreenâ€ to further delay enlistment.  Thursdayâ€™s action was in response to the administrationâ€™s appeal of a Nov. 21 ruling by U.S. 
District Judge Marvin Garbis, who said that the transgender prohibition likely violates the plaintiffsâ€™ constitutional right to equal protection under the law. The Garbis ruling followed a similar one on Oct. 30 by another federal judge in Washington, D.C. A third judge in Seattle also ruled against the administration on Dec. 11. In an August memorandum, Trump gave the military until March 2018 to revert to a policy prohibiting openly transgender people from joining the military and authorizing their discharge. The memo also halted the use of government funds for sex-reassignment surgery for active-duty military personnel. The Obama administration had set a deadline of July 1 of this year to begin accepting transgender recruits. But Trumpâ€™s defense secretary, James Mattis, postponed that date to Jan. 1, which the presidentâ€™s ban then put off indefinitely. The Trump administration said in legal papers that the armed forces are not prepared to train thousands of personnel on the medical standards needed to process transgender applicants and might have to accept â€œsome individuals who are not medically fit for service.â€ The Pentagon on Dec. 8 issued guidelines to recruitment personnel in order to enlist transgender applicants by Jan. 1. The memo outlined medical requirements and specified how the applicantsâ€™ sex would be identified and even which undergarments they would wear. The banâ€™s challengers said the memo contradicted the claim that the military was not ready.  The Justice Department disagreed, telling the court on Wednesday that â€œall this memorandum shows is that the military is scrambling to comply with the injunction.â€ The lawsuitâ€™s lead plaintiff Brock Stone, 34, has served in the U.S. Navy for 11 years, including a nine-month deployment to Afghanistan, and wants to remain for at least 20 years, according to court papers.  "

In [None]:
# Wrap the sample text in a one-row DataFrame.
# NOTE: this rebinds `df`, replacing the dataset frame loaded at the top of the
# notebook.
df = pd.DataFrame({'text_column': [text_input]})

In [None]:
# Classify the emotion of each text in the frame. The helper defined in this
# notebook is classify_emotions_text; it returns one label per input text.
predicted_emotions = classify_emotions_text(df['text_column'].tolist())

In [None]:
# calculate_sentiment_features_text reads a notebook-level `emotion` when it
# fills the one-hot emotion columns, so bind it to the label predicted for this
# single text before calling the helper.
emotion = predicted_emotions[0]
# Append the feature columns to the right of the text column (axis=1).
df = pd.concat([df, calculate_sentiment_features_text(df['text_column'].iloc[0])], axis=1)

In [None]:
# Display the one-row frame: the sample text followed by its feature columns.
df

In [None]:
# NOTE(review): `model` was rebound to the Hugging Face emotion classifier in
# the cell that loads "j-hartmann/emotion-english-roberta-large", so this
# pickles that model, not the Keras Bi-LSTM trained earlier - confirm which
# model is meant to be saved.
# The with-block closes the file even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Read the model back from disk. pickle.load can execute arbitrary code from
# the file, so only load pickles produced by this notebook or another trusted
# source. The with-block closes the file handle after loading.
with open('model.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)