In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle
#import numpy as np
#import seaborn as sns
#import matplotlib.pyplot as plt
#import re
#import string 
#import openai

In [2]:
# Remove pandas' display limits so DataFrames render with every row and column
# (None means "no limit" for both options)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [3]:
# Paths of the four article CSV parts; the files differ only by their part number (1-4)
file_part1, file_part2, file_part3, file_part4 = (
    f"Resources/df_merged_articles_clean_part{part_number}.csv"
    for part_number in range(1, 5)
)

In [4]:
# Load each article CSV part into its own DataFrame (parts are read in order 1-4)
(
    df_merged_articles_clean_part1,
    df_merged_articles_clean_part2,
    df_merged_articles_clean_part3,
    df_merged_articles_clean_part4,
) = map(pd.read_csv, (file_part1, file_part2, file_part3, file_part4))
print("Input files have been read in")

Input files have been read in


In [5]:
# Stack the four article parts into one DataFrame.
# ignore_index=True discards each part's own index and renumbers rows from 0.
article_parts = [
    df_merged_articles_clean_part1,
    df_merged_articles_clean_part2,
    df_merged_articles_clean_part3,
    df_merged_articles_clean_part4,
]
df_merged_articles_clean = pd.concat(article_parts, ignore_index=True)

In [6]:
# Preview the first 5 rows of the merged articles dataframe
df_merged_articles_clean.head(n=5)

Unnamed: 0,title,text,cleaned_text,subject,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,donald trump wish american happy new year leav...,News,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,house intelligence committee chairman devin nu...,News,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",friday revealed former milwaukee sheriff david...,News,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",christmas day donald trump announced would bac...,News,0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,pope francis used annual christmas day message...,News,0


In [7]:
# Display the (rows, columns) shape of the df_merged_articles_clean dataframe
df_merged_articles_clean.shape

(44898, 5)

In [8]:
# Display column dtypes and non-null counts for the df_merged_articles_clean dataframe
# (used to spot columns with missing values)
df_merged_articles_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         44898 non-null  object
 1   text          44898 non-null  object
 2   cleaned_text  44182 non-null  object
 3   subject       44898 non-null  object
 4   class         44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [9]:
# Inspect the rows whose cleaned_text is null to decide whether they can be dropped
is_missing_cleaned_text = df_merged_articles_clean['cleaned_text'].isna()
df_null_cleaned_text_rows = df_merged_articles_clean[is_missing_cleaned_text]
print(df_null_cleaned_text_rows.head())

                                                   title  \
9358   https://100percentfedup.com/served-roy-moore-v...   
10923  TAKE OUR POLL: Who Do You Think President Trum...   
11041  Joe Scarborough BERATES Mika Brzezinski Over “...   
11190  WATCH TUCKER CARLSON Scorch Sanctuary City May...   
11225  MAYOR OF SANCTUARY CITY: Trump Trying To Make ...   

                                                    text cleaned_text  \
9358   https://100percentfedup.com/served-roy-moore-v...          NaN   
10923                                                             NaN   
11041                                                             NaN   
11190                                                             NaN   
11225                                                             NaN   

        subject  class  
9358   politics      0  
10923  politics      0  
11041  politics      0  
11190  politics      0  
11225  politics      0  


In [10]:
# In the rows with a null cleaned_text, the article text is either a URL or blank.
# Dropping them has a small impact on the overall row count (44898 -> 44182 rows).
has_cleaned_text = df_merged_articles_clean['cleaned_text'].notna()
df_merged_articles_revised = df_merged_articles_clean[has_cleaned_text]
print(df_merged_articles_revised.head(), df_merged_articles_revised.shape, sep="\n")


                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text  \
0  Donald Trump just couldn t wish all Americans ...   
1  House Intelligence Committee Chairman Devin Nu...   
2  On Friday, it was revealed that former Milwauk...   
3  On Christmas day, Donald Trump announced that ...   
4  Pope Francis used his annual Christmas Day mes...   

                                        cleaned_text subject  class  
0  donald trump wish american happy new year leav...    News      0  
1  house intelligence committee chairman devin nu...    News      0  
2  friday revealed former milwaukee sheriff david...    News      0  
3  christmas day donald trump announced would 

In [11]:
# Features (cleaned article text) and target (class label) for modeling
X, y = df_merged_articles_revised['cleaned_text'], df_merged_articles_revised['class']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)


In [12]:
# Display shape and first rows of each train/test split.
# Each section is introduced by a header naming the split it shows.
print('---------- X_train info ----------')
print(X_train.shape)
print(X_train.head())
print('\n---------- y_train info ----------')
print(y_train.shape)
print(y_train.head())
# This section shows X_test, so the header must say X_test (not y_train)
print('\n---------- X_test info ----------')
print(X_test.shape)
print(X_test.head())
print('\n---------- y_test info ----------')
print(y_test.shape)
print(y_test.head())

---------- X_train info ----------
(30927,)
5669     donald trump slogan make america great fan che...
6671     suffering debilitating loss five northeastern ...
3655     donald trump throwing twitter tantrum early mo...
32530    washington reuters republican presidential can...
26492    washington reuters le three month president do...
Name: cleaned_text, dtype: object

---------- y_train info ----------
(30927,)
5669     0
6671     0
3655     0
32530    1
26492    1
Name: class, dtype: int64

---------- y_train info ----------
(13255,)
30774    washington reuters bitter election campaign fi...
4012     donald trump petty thinskinned wannabe dictato...
7168     walmart become perfect symbol corporate greed ...
25307    following statement posted verified twitter ac...
7441     atheist group found truly genius way troll chr...
Name: cleaned_text, dtype: object

---------- y_test info ----------
(13255,)
30774    1
4012     0
7168     0
25307    1
7441     0
Name: class, dtype: int64


In [13]:
# Build TF-IDF features for the article text.
# The vectorizer is fitted on the training text only; the test text is transformed
# with that same fitted vocabulary, so both matrices share the same columns.
tfid_vectorizer = TfidfVectorizer(stop_words="english")
X_train_tfidf = tfid_vectorizer.fit_transform(X_train)
X_test_tfidf = tfid_vectorizer.transform(X_test)

# Report the dimensions of each TF-IDF matrix (rows = documents, columns = tokens)
for matrix_name, tfidf_matrix in (("X_train_tfidf", X_train_tfidf), ("X_test_tfidf", X_test_tfidf)):
    n_documents, n_tokens = tfidf_matrix.shape
    print(f'-------------------- {matrix_name} matrix information  --------------------')
    print(f"Matrix shape: {tfidf_matrix.shape}")
    print(f"Total number of documents: {n_documents}")
    print(f"Total number of unique words (tokens): {n_tokens}")


-------------------- X_train_tfidf matrix information  --------------------
Matrix shape: (30927, 165713)
Total number of documents: 30927
Total number of unique words (tokens): 165713
-------------------- X_test_tfidf matrix information  --------------------
Matrix shape: (13255, 165713)
Total number of documents: 13255
Total number of unique words (tokens): 165713


In [14]:
# Initialize the Passive Aggressive Classifier.
# random_state is fixed because the estimator shuffles the training data between
# epochs; without a seed, the fitted model and the accuracies below can differ
# from run to run. The seed matches the one used for the train/test split.
pac_model = PassiveAggressiveClassifier(max_iter=50, random_state=42)

# Train the model on the TF-IDF features of the training split
pac_model.fit(X_train_tfidf, y_train)

# Make prediction with test data
y_pred = pac_model.predict(X_test_tfidf)

# Evaluate the model on the held-out test split
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy}")

# Check prediction with training data (compare against test accuracy to gauge overfitting)
y_pred_train = pac_model.predict(X_train_tfidf)
train_accuracy = accuracy_score(y_train, y_pred_train)
print(f"Train Accuracy: {train_accuracy}")

Test Accuracy: 0.9944926442851754
Train Accuracy: 1.0


In [15]:
# Per-class precision, recall and F1 for the test-set predictions
pac_report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:", pac_report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      6853
           1       0.99      1.00      0.99      6402

    accuracy                           0.99     13255
   macro avg       0.99      0.99      0.99     13255
weighted avg       0.99      0.99      0.99     13255



In [16]:
# Generate a confusion matrix
# scikit-learn layout: rows are the true class, columns are the predicted class,
# with labels in sorted order (0, 1). Treating class 1 as the positive class:
# [True Negatives    False Positives]
# [False Negatives   True Positives]

pac_conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(pac_conf_matrix)

Confusion Matrix:
[[6811   42]
 [  31 6371]]


In [17]:
# Save models using pickle.
# Each file is opened in a `with` block so the handle is closed (and the data
# flushed to disk) as soon as the dump finishes, instead of being left open.
# NOTE(review): the file names are kept exactly as they were, on the assumption
# that other code loads them by these names - confirm before renaming.

# Save the trained PassiveAggressiveClassifier model
with open('Resources/pa_classfier.pkl', 'wb') as model_file:
    pickle.dump(pac_model, model_file)

# Save the fitted TF-IDF vectorizer
with open('Resources/tfid_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfid_vectorizer, vectorizer_file)

print("pac_model and tfid_vectorizer saved")

pac_model and tfid_vectorizer saved


In [18]:
# Persist the train/test splits as CSV files for use in other notebooks, if needed.
# index=False omits the original row index from every file.
splits_to_save = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(f'Resources/{split_name}.csv', index=False)
print("X_train, X_test, y_train, y_test files saved")

X_train, X_test, y_train, y_test files saved
