In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Task 1.2
# Read the raw dataset from disk and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
DATASET_PATH = r"C:\Users\User\Downloads\68.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

In [None]:
#Task 1.3.iii)
# This step runs before the other tasks so that the text preprocessing
# further down can function properly.

# Count the missing values in every column
missing_values = df.isnull().sum()
print(missing_values)

# Replace missing text with empty strings in both text columns
for text_column in ('headline', 'short_description'):
    df[text_column] = df[text_column].fillna('')

In [None]:
#task 1.3.i)
# Text preprocessing

# Shared resources used by preprocess(): the lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess(text):
    """Normalise one text value into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, keep only alphanumeric and whitespace
    characters, tokenize with ``word_tokenize``, drop tokens found in
    ``stop_words`` and lemmatize the remaining ones.

    Args:
        text: The raw text value; a missing value (``pd.isna``) is accepted.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when ``text``
        is missing.
    """
    if pd.isna(text):
        return ''
    lowered = text.lower()
    # Punctuation is removed by keeping only letters, digits and whitespace
    kept_chars = [ch for ch in lowered if ch.isalnum() or ch.isspace()]
    words = word_tokenize(''.join(kept_chars))
    lemmas = []
    for word in words:
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)


# Clean both text columns in place
for text_column in ('headline', 'short_description'):
    df[text_column] = df[text_column].apply(preprocess)

In [None]:
# Function to get common words
def get_common_words(texts, n=10):
    """Return the ``n`` most frequent whitespace-separated words in ``texts``.

    Args:
        texts: An iterable of strings (for example a pandas Series of headlines).
        n: How many of the top words to return. Defaults to 10.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least frequent.
        An empty input yields an empty list.
    """
    word_counts = Counter()
    # Count text by text so one huge joined string never has to be built
    for text in texts:
        word_counts.update(text.split())
    return word_counts.most_common(n)

# Apply the function to the headlines of each category
is_science = df['category'] == 'SCIENCE'
is_queer_voices = df['category'] == 'QUEER VOICES'
science_headlines = df[is_science]['headline']
queer_headlines = df[is_queer_voices]['headline']

print("Most common words in SCIENCE headlines:")
science_top_words = get_common_words(science_headlines)
print(science_top_words)

print("Most common words in QUEER VOICES headlines:")
queer_top_words = get_common_words(queer_headlines)
print(queer_top_words)


In [None]:
# Task 1.3.ii)
# Analyze other features: token counts of the headline and of the description,
# each plotted as a histogram split by category.
length_features = (
    ('headline', 'headline_length', 'Headline Length Distribution by Category'),
    ('short_description', 'description_length', 'Description Length Distribution by Category'),
)
for source_column, length_column, plot_title in length_features:
    df[length_column] = df[source_column].apply(lambda x: len(word_tokenize(x)))
    sns.histplot(df, x=length_column, hue='category', kde=True)
    plt.title(plot_title)
    plt.show()

In [None]:
#task 1.3.iii) again, to find incorrect data types

# Check for blank values
null_counts = df.isnull().sum()
print(null_counts)

# Check for incorrect data types
column_types = df.dtypes
print(column_types)

# Outliers can be visualized using the histograms created earlier

Observations from the Dataset:

The SCIENCE category had a more even spread of common words, while in the QUEER VOICES category the word "gay" was by far the most common, followed by several other, less prevalent top-ten words. This was true for both the headline and the short_description columns. The common words are very different between SCIENCE and QUEER VOICES, so I believe the two categories should be easy enough to classify correctly. There are no missing values in the important columns (headline and short_description), so that should be fine. The columns have the right data types.

In [None]:
# This step was needed because some columns contained NaN cells.
# First drop rows missing either text column, then drop rows that have a NaN
# in any remaining column (dropna() without a subset checks every column).
df = (
    df
    .dropna(subset=['headline', 'short_description'])
    .dropna()
)

In [None]:

# Task 2.4.i
# Split data into train (70%), validation (15%), and test (15%) sets:
# first hold out 30% of the rows, then cut that hold-out in half.
train_data, holdout_data = train_test_split(df, test_size=0.3, random_state=42)
valid_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=42)



I chose this split because it is common practice. The 70 percent of training data is enough to fit the models, the 15 percent validation set is large enough to validate well, and the 15 percent test set is enough data that the model hasn't seen to assess whether it performs well.

In [None]:
# Task 2.4.ii
# Save splits to csv files.
# index=False leaves the unnecessary row index out of the files, keeping them cleaner.
split_files = {'train.csv': train_data, 'valid.csv': valid_data, 'test.csv': test_data}
for file_name, split_frame in split_files.items():
    split_frame.to_csv(file_name, index=False)

In [None]:
# Task 2.5 

# Preprocessing steps: lowercasing, removing punctuation, lemmatization
def preprocess_data(df):
    """Return a new DataFrame with cleaned versions of the two text columns.

    The input frame is left untouched. Writing new columns directly into a
    frame produced by ``train_test_split`` can raise pandas'
    SettingWithCopyWarning, so the columns are added with ``assign``, which
    returns a new object.

    Args:
        df: DataFrame with 'headline' and 'short_description' columns.

    Returns:
        A new DataFrame with all original columns plus 'cleaned_headline'
        and 'cleaned_short_description', each produced by ``preprocess``.
    """
    return df.assign(
        cleaned_headline=df['headline'].apply(preprocess),
        cleaned_short_description=df['short_description'].apply(preprocess),
    )

# Clean the text columns of all three splits
train_data, valid_data, test_data = [
    preprocess_data(split_frame) for split_frame in (train_data, valid_data, test_data)
]

# Combine the cleaned headline and the cleaned short description into one
# 'text' column so that both features can be used in the models.
for split_frame in (train_data, valid_data, test_data):
    split_frame['text'] = split_frame['cleaned_headline'] + ' ' + split_frame['cleaned_short_description']

I chose to apply the same preprocessing to all 3 data sets to ensure it is consistent across them. I made everything lowercase, removed punctuation, and reduced all words to their root form with lemmatization. I did this so that only the words and their meanings would be analyzed. That way the data is better suited for the models to categorize correctly. Additionally, I dropped NaN data as mentioned above.

In [None]:
#task 2.6
# Logistic Regression text classifier.
# A Pipeline chains the steps so they always run in sequence: TfidfVectorizer
# first converts the text into TF-IDF weighted numeric features (giving words
# a mathematical importance), then the classifier is fit on those features.
log_reg_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(random_state=42)),
]
log_reg_pipeline = Pipeline(log_reg_steps)

# Train the logistic regression model on the training split
log_reg_pipeline.fit(train_data['text'], train_data['category'])


In [None]:
# Evaluate on the validation set. The same check is repeated for the other models below.
val_preds = log_reg_pipeline.predict(valid_data['text'])
log_reg_val_accuracy = accuracy_score(valid_data['category'], val_preds)
print(f"Logistic Regression Validation Accuracy: {log_reg_val_accuracy}")


In [None]:
#task 2.6
# Random Forest text classifier.
# Same approach as the logistic regression: a Pipeline that vectorizes the
# text with TF-IDF and then fits the classifier.
rf_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42)),
]
rf_pipeline = Pipeline(rf_steps)

# Train the random forest model on the training split
rf_pipeline.fit(train_data['text'], train_data['category'])


In [None]:
# Evaluate the random forest on the validation set.
val_preds_rf = rf_pipeline.predict(valid_data['text'])
rf_val_accuracy = accuracy_score(valid_data['category'], val_preds_rf)
print(f"Random Forest Validation Accuracy: {rf_val_accuracy}")


I chose to use two classifiers: Logistic Regression and Random Forest. I chose them because they are both models that lend themselves well to text data and categorical data. They are distinct from each other, and I thought they would be good choices for this project. The random forest is supposed to be a bit more accurate and complex as well, so I wanted to see if that was true.

In [None]:
#task 2.7
# Deep learning model: a long short-term memory (LSTM) network trained for categorization.

# Rebuild the combined text feature for every split
for split_frame in (train_data, valid_data, test_data):
    split_frame['text'] = split_frame['cleaned_headline'] + ' ' + split_frame['cleaned_short_description']

# Features as plain strings, labels as the raw category names
X_train, X_valid, X_test = [
    split_frame['text'].astype(str) for split_frame in (train_data, valid_data, test_data)
]
y_train, y_valid, y_test = [
    split_frame['category'] for split_frame in (train_data, valid_data, test_data)
]

# The LSTM needs integer sequences, so fit a tokenizer on the training texts only.
# Only the 5000 most frequent words are kept; unseen words map to '<UNK>'.
tokenizer = Tokenizer(num_words=5000, lower=True, oov_token='<UNK>')
tokenizer.fit_on_texts(X_train)

# Convert texts to sequences and pad them to a fixed length of 100, because
# the model requires every input to have the same length.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_valid_seq = tokenizer.texts_to_sequences(X_valid)
X_valid_pad = pad_sequences(X_valid_seq, maxlen=100)

# Encode the category names as numeric labels (encoder fitted on the training labels)
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_valid_enc = label_encoder.transform(y_valid)

# Build the LSTM model layer by layer
model = Sequential([
    # Embedding layer: maps each word index to a 128-dimensional vector
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    # Spatial dropout to help against overfitting
    SpatialDropout1D(0.2),
    # The recurrent layer that gives the model its memory
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    # Single sigmoid unit: binary classification with a probability between 0 and 1
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the LSTM: 4 epochs to start with, batch size 32, monitored on the validation split
history = model.fit(
    X_train_pad,
    y_train_enc,
    epochs=4,
    batch_size=32,
    validation_data=(X_valid_pad, y_valid_enc),
)

# Save the trained model for further steps
model.save('lstm_model.h5')

Task 3.8
I chose accuracy as my primary metric. I made this decision because of the 50-50 split of the dataset. I think it will be the best and simplest way to represent whether the model is accurate. I will also use the F1 score to check model performance in later steps.

In [None]:
#task 3.9
# Compare training and validation accuracy for each model.

# Logistic Regression (val_preds was computed when the pipeline was validated)
train_preds = log_reg_pipeline.predict(train_data['text'])
log_reg_train_acc = accuracy_score(train_data['category'], train_preds)
log_reg_valid_acc = accuracy_score(valid_data['category'], val_preds)
print(f"Logistic Regression Train Accuracy: {log_reg_train_acc}")
print(f"Logistic Regression Validation Accuracy: {log_reg_valid_acc}")

# Random Forest (val_preds_rf was computed when the pipeline was validated)
train_preds_rf = rf_pipeline.predict(train_data['text'])
rf_train_acc = accuracy_score(train_data['category'], train_preds_rf)
rf_valid_acc = accuracy_score(valid_data['category'], val_preds_rf)
print(f"Random Forest Train Accuracy: {rf_train_acc}")
print(f"Random Forest Validation Accuracy: {rf_valid_acc}")

# LSTM: reload the saved model and evaluate it on the validation set
model = load_model('lstm_model.h5')
lstm_val_metrics = model.evaluate(X_valid_pad, y_valid_enc)
val_loss, val_accuracy = lstm_val_metrics
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

Each model performed well!
Logistic regression was the lowest, with 91 percent accuracy on validation.
Random forest was in the middle, with 94 percent accuracy on validation.
The LSTM model was 95 percent accurate on validation.
The logistic regression model lost 4 percentage points between the training accuracy and the validation accuracy.
The random forest lost 6 percentage points between the training and validation accuracy. The training accuracy for the random forest was 100 percent.

In [None]:
#task 3.10
# Predict on the validation set with all three models
val_preds_log_reg = log_reg_pipeline.predict(valid_data['text'])
val_preds_rf = rf_pipeline.predict(valid_data['text'])
y_pred_probs = model.predict(X_valid_pad)
y_pred_lstm = (y_pred_probs > 0.5).astype("int32").flatten()

# The LSTM predicts encoded class ids (0/1) while 'actual' holds category
# names. Decode the ids back to names first; comparing ints to strings would
# flag every single row as an LSTM error.
y_pred_lstm_labels = label_encoder.inverse_transform(y_pred_lstm)

# Create a DataFrame for comparison (all prediction columns hold category names)
comparison_df = valid_data.copy()
comparison_df['log_reg_preds'] = val_preds_log_reg
comparison_df['rf_preds'] = val_preds_rf
comparison_df['lstm_preds'] = y_pred_lstm_labels
comparison_df['actual'] = valid_data['category']

# Boolean masks marking the rows each model got wrong
log_reg_wrong = comparison_df['actual'] != comparison_df['log_reg_preds']
rf_wrong = comparison_df['actual'] != comparison_df['rf_preds']
lstm_wrong = comparison_df['actual'] != comparison_df['lstm_preds']

# Find rows where predictions differ from the actual labels
log_reg_errors = comparison_df[log_reg_wrong]
rf_errors = comparison_df[rf_wrong]
lstm_errors = comparison_df[lstm_wrong]

# Display error examples
print("Logistic Regression Errors:")
print(log_reg_errors.head())
print('-----')

print("Random Forest Errors:")
print(rf_errors.head())
print('-----')

print("LSTM Errors:")
print(lstm_errors.head())

# Additional insight: rows that every one of the three models misclassified
common_errors = comparison_df[log_reg_wrong & rf_wrong & lstm_wrong]

print("Common Errors Across All Models:")
print(common_errors.head())


Task 3.10: The error analysis showed that the logistic regression and the random forest struggled more than the LSTM model. The logistic regression and the random forest classified some of the same SCIENCE headlines as QUEER VOICES. While the LSTM model also had some errors, they were fewer than those of the other two and different in the sense that they were more random. This analysis reinforces the idea that deep learning models are superior to traditional ML models.

In [None]:
#task 3.11
# Adjusting Logistic Regression (changing regularization parameter C from 1 to 0.5)
log_reg_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(C=0.5, random_state=42)),
]
log_reg_pipeline = Pipeline(log_reg_steps)
log_reg_pipeline.fit(train_data['text'], train_data['category'])

val_preds = log_reg_pipeline.predict(valid_data['text'])
adjusted_log_reg_acc = accuracy_score(valid_data['category'], val_preds)
print(f"Adjusted Logistic Regression Validation Accuracy: {adjusted_log_reg_acc}")

# Adjusting Random Forest (increasing the number of trees to 200)
rf_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42)),
]
rf_pipeline = Pipeline(rf_steps)
rf_pipeline.fit(train_data['text'], train_data['category'])

val_preds_rf = rf_pipeline.predict(valid_data['text'])
adjusted_rf_acc = accuracy_score(valid_data['category'], val_preds_rf)
print(f"Adjusted Random Forest Validation Accuracy: {adjusted_rf_acc}")

# Adjusting the LSTM model: a second LSTM layer is stacked on the first
# (which therefore returns full sequences) and the epochs go from 4 to 10.
model_improved = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    SpatialDropout1D(0.2),
    LSTM(100, return_sequences=True),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model_improved.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the improved model
history_improved = model_improved.fit(
    X_train_pad,
    y_train_enc,
    epochs=10,
    batch_size=32,
    validation_data=(X_valid_pad, y_valid_enc),
)

# Evaluate the improved model on the validation set
improved_val_metrics = model_improved.evaluate(X_valid_pad, y_valid_enc)
val_loss_improved, val_accuracy_improved = improved_val_metrics
print(f"Improved Validation Loss: {val_loss_improved}")
print(f"Improved Validation Accuracy: {val_accuracy_improved}")

# Save the improved model for further steps
model_improved.save('improved_lstm_model.h5')

Task 3.11
Changes:
I changed the regularization parameter C from 1 to 0.5 for the logistic regression to see if that would improve it.

I added more trees to the random forest, going from 100 to 200, hoping to improve the accuracy.

I added an additional layer to the LSTM as well as more epochs (from 4 to 10), hoping to improve accuracy.

Results:
The logistic regression model became worse, with 87 percent accuracy compared to the original 91 percent.

The random forest model stayed about the same; it was originally 93.67 and moved to 93.73.

The LSTM deep learning model came out about the same as well, going from the original 95.27 to the updated 95.37.

The LSTM came out the winner again, with 95 percent accuracy.

In [None]:

# task 3.12

# Load the improved model saved in the tuning step
model = load_model('improved_lstm_model.h5')

# Loss and accuracy on the validation set, as reported by Keras
improved_metrics = model.evaluate(X_valid_pad, y_valid_enc, verbose=0)
val_loss, val_accuracy = improved_metrics
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Predictions on the validation set: threshold the sigmoid output at 0.5
# to turn probabilities into binary class predictions
y_pred_probs = model.predict(X_valid_pad)
y_pred = (y_pred_probs > 0.5).astype("int32")

# Recompute the accuracy with scikit-learn as a cross-check of the Keras value
val_accuracy_manual = accuracy_score(y_valid_enc, y_pred)
print(f"Validation Accuracy (Manual): {val_accuracy_manual}")

# Per-class precision, recall and F1
print("Classification Report:")
validation_report = classification_report(y_valid_enc, y_pred)
print(validation_report)

The results from the evaluation on the validation set were good! The validation loss was fairly low (0.28) and the validation accuracy was high, at 95 percent.
The F1 score for SCIENCE was really high at 97 percent, while the F1 score for QUEER VOICES was not as good but still high at 90 percent.

In [None]:


# Evaluate the improved LSTM on the held-out test set

# Reload the saved test split. Only its 'category' column is used here; the
# text features come from X_test, which was built from the preprocessed test split.
test_data = pd.read_csv('test.csv')
assert len(test_data) == len(X_test), "test.csv and X_test must describe the same rows"

# Reuse the tokenizer that was fitted on the training texts when the LSTM was
# trained. Fitting a new tokenizer on the test texts would assign different
# integer ids to the words, so the model's learned embeddings would no longer
# line up with its inputs.
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to the same fixed length (100) the model was trained with
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

# Load the trained LSTM model
model = load_model('improved_lstm_model.h5')

# Predict on the test set: threshold the sigmoid output at 0.5
y_test_pred_probs = model.predict(X_test_pad)
y_test_pred = (y_test_pred_probs > 0.5).astype("int32")

# Encode the true labels with an encoder fitted on the training labels
label_encoder = LabelEncoder()
label_encoder.fit(train_data['category'])
y_test = test_data['category']
y_test_enc = label_encoder.transform(y_test)

# Flatten the (n, 1) prediction array to shape (n,) to match the encoded labels
y_test_pred_encoded = y_test_pred.flatten()

# Print the classification report
print("Classification Report:")
print(classification_report(y_test_enc, y_test_pred_encoded, target_names=label_encoder.classes_))


    
   

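The results on the test set were significantly lower than on the validation set. The test accuracy was 70 percent, down 25 percentage points from 95. Additionally, the F1 score for SCIENCE was abysmal, only 19 percent. I'm not sure why this model performed so poorly on the test data. A likely cause is a tokenizer mismatch: if the tokenizer is re-fit on the test texts, the word indices no longer match those the model saw during training.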

In [None]:
# Task 3.14
# Reload the saved test split (its 'category' column supplies the test labels)
test_data = pd.read_csv('test.csv')

# Fit a fresh tokenizer on the training and validation texts together
combined_texts = pd.concat([X_train, X_valid], axis=0)
tokenizer = Tokenizer(num_words=5000, lower=True, oov_token='<UNK>')
tokenizer.fit_on_texts(combined_texts)

# Prepare the test inputs with that tokenizer
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

# Encode the test labels with an encoder fitted on the training labels
label_encoder = LabelEncoder()
label_encoder.fit(train_data['category'])
y_test = test_data['category']
y_test_enc = label_encoder.transform(y_test)

# Combined training + validation features and labels used for retraining
X_combined = pd.concat([X_train, X_valid])
y_combined = pd.concat([train_data['category'], valid_data['category']], axis=0)

# Tokenize, pad and encode the combined data
X_combined_seq = tokenizer.texts_to_sequences(X_combined)
X_combined_pad = pad_sequences(X_combined_seq, maxlen=100)
y_combined_enc = label_encoder.transform(y_combined)

# Define and compile the LSTM model (single LSTM layer)
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Retrain on the combined data; 10% of it is held back by Keras for monitoring
model.fit(
    X_combined_pad,
    y_combined_enc,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
)

# Save the retrained model, then load it back from disk for the evaluation
model.save('retrained_lstm_model.h5')
model = load_model('retrained_lstm_model.h5')

# Predict on the test set: threshold the sigmoid output at 0.5
y_test_pred_probs = model.predict(X_test_pad)
y_test_pred = (y_test_pred_probs > 0.5).astype("int32")

# Print the classification report
print("Classification Report:")
test_report = classification_report(y_test_enc, y_test_pred, target_names=label_encoder.classes_)
print(test_report)



Retraining with the combined train and validation sets made a huge impact on the performance of the model. It went back to being 95 percent accurate and dramatically improved the F1 score on SCIENCE. Apparently there are some major faults with the model, but with this retraining the model is again very accurate.