In [1]:
# Import Python libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer # let's remove this @ujjawal
from nltk.stem import WordNetLemmatizer # instead we will use lemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# Disable SSL certificate verification
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

### Data Preprocessing
#### Data combining from different datasets

In [3]:
# Following is the list of topics for which the medium data is available
topic_list = [
    "health",
    "technology",
    "politics",
    "sports",
    "business",
    "entertainment",
    "environment",
    "lifestyle",
    "programming",
    "education",
]

# Create a empty dataframe
df = pd.DataFrame(columns=["title", "topic"])

# Read the data from the csv files and store it in a dataframe
for topic in topic_list:
    csv_name = f"../data/{topic}_news_data.csv"
    print(csv_name)
    new_df = pd.read_csv(f"{csv_name}")
    df = pd.concat([df, new_df], ignore_index=True)


print(f"\nShape of the dataframe: {df.shape}")

../data/health_news_data.csv
../data/technology_news_data.csv
../data/politics_news_data.csv
../data/sports_news_data.csv
../data/business_news_data.csv
../data/entertainment_news_data.csv
../data/environment_news_data.csv
../data/lifestyle_news_data.csv
../data/programming_news_data.csv
../data/education_news_data.csv

Shape of the dataframe: (4448, 3)


### Dataframe Exploration
#### Initial understanding of the structure, content, and data types within a dataframe.

In [4]:
# Using head() to display the first 5 rows of the dataset
df.head()

Unnamed: 0.1,title,topic,Unnamed: 0
0,I Asked Leading Covid Scientists — Off the Rec...,health,0.0
1,Autopsy Findings of Vaccinated People (With Co...,health,1.0
2,Latest Autopsy Study on mRNA Vaccine Recipient...,health,2.0
3,From Infection to Recovery: How Long It Lasts,health,3.0
4,A Tough Covid Challenge: Reinforcing Our Wall ...,health,4.0


In [5]:
# Using tail() to display the last 5 rows of the dataset
df

Unnamed: 0.1,title,topic,Unnamed: 0
0,I Asked Leading Covid Scientists — Off the Rec...,health,0.0
1,Autopsy Findings of Vaccinated People (With Co...,health,1.0
2,Latest Autopsy Study on mRNA Vaccine Recipient...,health,2.0
3,From Infection to Recovery: How Long It Lasts,health,3.0
4,A Tough Covid Challenge: Reinforcing Our Wall ...,health,4.0
...,...,...,...
4443,“Balancing Act: How Students Can Deal with Stu...,education,404.0
4444,SCOTUS Likely To Negate Student Loan Forgivene...,education,405.0
4445,"I, The Ghost of a Woman Who Died in an 18th-Ce...",education,406.0
4446,The Latest Student Loan News: What Borrowers N...,education,407.0


In [6]:
# Using info() to display the information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4448 entries, 0 to 4447
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       4448 non-null   object 
 1   topic       4448 non-null   object 
 2   Unnamed: 0  4448 non-null   float64
dtypes: float64(1), object(2)
memory usage: 104.4+ KB


In [7]:
# Using dtypes to display the data types of the columns
df.dtypes

title          object
topic          object
Unnamed: 0    float64
dtype: object

In [8]:
# Using columns to display the columns of the dataset
df.columns

Index(['title', 'topic', 'Unnamed: 0'], dtype='object')

In [9]:
# Remove the rows with no values ("Unnamed: 0")
# Specify the columns to remove
columns_to_remove = ["Unnamed: 0"]

# Remove the specified columns
df = df.drop(columns=columns_to_remove)

### Handling Duplicates
#### Identify and remove duplicate rows if they exist in the dataframe.

In [10]:
# Check Shape of the dataset before handling duplicates
print("Shape of the dataset before handling duplicates: ", df.shape)

# Drop duplicate rows if there is any duplicate row
df = df.drop_duplicates()

# Check Shape of the dataset after handling duplicates
print("Shape of the dataset after handling duplicates: ", df.shape)

Shape of the dataset before handling duplicates:  (4448, 2)
Shape of the dataset after handling duplicates:  (4245, 2)


### Handling Missing Values
#### Identify missing values for each column

In [11]:
# Check for missing values
df.isnull().sum()

title    0
topic    0
dtype: int64

### Check If Imbalanced Classes
#### We try to categorize title data with topic so let's see if there is imbalance data or not

In [12]:
# Count the number of rows according to unique topics
df["topic"].value_counts()

topic
health           546
business         527
technology       510
sports           478
programming      455
education        408
environment      401
politics         378
entertainment    273
lifestyle        269
Name: count, dtype: int64

### Text Processing

In [13]:
# Perform text preprocessing
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shanover\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Create a function to perform text preprocessing

import re
import string

lemmatizer = WordNetLemmatizer()

def remove_punctuations(text):
    # Remove punctuations and special characters using regex and string module
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"[\–\—]", "", text)  # Remove special characters like dashes
    return text

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # remove punctuation and special chars
    text = remove_punctuations(text)

    # Tokenization
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    #stemmed_tokens = [stemmer.stem(token) for token in tokens]
    
    lemmatized_tokens = []
    
    # Apply lemma
    for token, pos_tag in nltk.pos_tag(tokens):
        # Map POS tag to WordNet tag
        wn_tag = nltk.corpus.wordnet.NOUN
        if pos_tag.startswith('J'):
            wn_tag = nltk.corpus.wordnet.ADJ
        elif pos_tag.startswith('V'):
            wn_tag = nltk.corpus.wordnet.VERB
        elif pos_tag.startswith('R'):
            wn_tag = nltk.corpus.wordnet.ADV
        # Lemmatize token
        lemmatized_token = lemmatizer.lemmatize(token, pos=wn_tag)
        lemmatized_tokens.append(lemmatized_token)
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

In [15]:
# Apply text preprocessing to the 'title' column
df["preprocessed_title"] = df["title"].apply(preprocess_text)
df.head()

Unnamed: 0,title,topic,preprocessed_title
0,I Asked Leading Covid Scientists — Off the Rec...,health,ask lead covid scientist record virus ’ origin...
1,Autopsy Findings of Vaccinated People (With Co...,health,autopsy finding vaccinate people covid vaccine...
2,Latest Autopsy Study on mRNA Vaccine Recipient...,health,late autopsy study mrna vaccine recipient germ...
3,From Infection to Recovery: How Long It Lasts,health,infection recovery long last
4,A Tough Covid Challenge: Reinforcing Our Wall ...,health,tough covid challenge reinforce wall immunity


### Dataset Preparation for Training

In [16]:
# Separate the preprocessed text (features) and the corresponding topics (labels)
X = df["preprocessed_title"]
y = df["topic"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Build Machine Learning Model

In [17]:
# Convert the preprocessed text into numerical representations using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [18]:
# Create an instance of the SVM model
model = SVC()

In [19]:
# Train the model
model.fit(X_train_tfidf, y_train)

### Model Evaluation

In [20]:
# Make predictions on the testing data
y_pred = model.predict(X_test_tfidf)

In [21]:
# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5535924617196702


In [22]:
## Saving the model
import joblib
import os

# Assuming you have trained and obtained your model

# Specify the directory path for saving models
models_dir = '..\models'

def saveModel(models_dir, filename, model):

    # Create the models directory if it doesn't exist
    os.makedirs(models_dir, exist_ok=True)

    # Specify the file path for saving the model
    model_path = os.path.join(models_dir, filename + '.joblib')

    # Save the model
    joblib.dump(model, model_path)

    print(f"Model saved successfully at {model_path}")
    

# Saving SVC model:
saveModel('../models','svc_title_to_category_55', model)

Model saved successfully at ../models\svc_title_to_category_55.joblib


## Ensemble Voting

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Initialize individual models
svc = SVC()
nb = MultinomialNB()
rf = RandomForestClassifier()

# Define the voting classifier with the individual models
voting_classifier = VotingClassifier(estimators=[('svc', svc), ('nb', nb), ('rf', rf)])

# Train the voting classifier
voting_classifier.fit(X_train_tfidf, y_train)

# Predict on the test set using the ensemble model
ensemble_predictions = voting_classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, ensemble_predictions)
print("Ensemble Accuracy:", accuracy)


Ensemble Accuracy: 0.5594817432273262


In [24]:
# Save ensemble model
saveModel('../models','ensemble_title2category_56', voting_classifier)

Model saved successfully at ../models\ensemble_title2category_56.joblib


#### A bit more better accuracy achieved using Ensemble voting: 56.18%

## Using Sequential NN

In [25]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

# Assume you have your TF-IDF encoded features in X and corresponding labels in y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit vectorizer and transform training data
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()

# Transform test data using the fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test).toarray()

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Define the neural network model
model = Sequential()
model.add(Dense(128, input_shape=(X_train_tfidf.shape[1],), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_tfidf, y_train_encoded, epochs=10, batch_size=32, verbose=1)

# Predict on the test set
predictions = model.predict(X_test_tfidf)
predicted_classes = np.argmax(predictions, axis=1)

# Decode predictions
y_pred = label_encoder.inverse_transform(predicted_classes)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Neural Network Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Accuracy: 0.5394581861012956


## Remark:

#### After using NN, the accuracy didn't improved. There seems to be an issue in the pred_y, test_y

In [26]:
series_test = y_test.value_counts()

In [27]:
# Convert the NumPy array to a Pandas Series
series = pd.Series(y_pred)

# Get the unique value counts as a Pandas Series
series_pred = series.value_counts()
value_counts

NameError: name 'value_counts' is not defined

### checking the distribution between test_y, pred_y

In [28]:
# Align the labels of the two Series
series1, series2 = series_test.align(series_pred, fill_value=0)

# Compare the two aligned Series
comparison = series1.compare(series2)

# Print the comparison result
print(comparison)

               self  other
business        107    117
education        82     73
entertainment    60     65
environment      57     73
health          130    120
lifestyle        43     39
politics         83     87
programming      78     93
sports          102     86
technology      107     96


### Running SVC Model on new news-titles

In [None]:
# Preprocess new titles
new_titles = [
    "Cancer drug found after using AI",
    "Is this the end of coding? AI Rules over restaurant jobs",
]
new_titles_preprocessed = [preprocess_text(title) for title in new_titles]

# Convert the preprocessed titles into numerical representations
new_titles_tfidf = vectorizer.transform(new_titles_preprocessed)

# Predict the topics of the new titles
new_titles_predictions = model.predict(new_titles_tfidf)
print("New Titles Predictions:", new_titles_predictions)

### Running Ensemble Voting Classifier on new news-titles

In [None]:
new_titles = [
    "Critiano scores a hattrick",
    "Is this the end of coding? AI Rules over restaurant jobs",
]
new_titles_preprocessed = [preprocess_text(title) for title in new_titles]

# Convert the preprocessed titles into numerical representations
new_titles_tfidf = vectorizer.transform(new_titles_preprocessed)

# Predict the topics of the new titles
new_titles_predictions = voting_classifier.predict(new_titles_tfidf)
print("New Titles Predictions:", new_titles_predictions)

In [None]:
Topic modeling,
CLustering
LDA, NMF