In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import nltk
import re
from textblob import TextBlob

# Fetch the NLTK resources requested by this notebook (skipped by NLTK when
# they are already up to date, as the recorded output shows).
for resource in ('wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Read the merged dataset from the working directory and preview the first rows
df = pd.read_csv('Merged_Dataset.csv')
print(df.head())

[nltk_data] Downloading package wordnet to C:\Users\dhany/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dhany/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


                                               title  \
0  u.s. budget fight looms, republican flip fisca...   
1  u.s. military accept transgender recruit monda...   
2  senior u.s. republican senator: 'let mr. muell...   
3  fbi russia probe helped australian diplomat ti...   
4  trump want postal service charge 'much more' a...   

                                                text       subject  \
0  washington (reuters) - head conservative repub...  politicsNews   
1  washington (reuters) - transgender people allo...  politicsNews   
2  washington (reuters) - special counsel investi...  politicsNews   
3  washington (reuters) - trump campaign adviser ...  politicsNews   
4  seattle washington (reuters) - president donal...  politicsNews   

         date  subject_encoded  label  
0  2017-12-31                6      1  
1  2017-12-29                6      1  
2  2017-12-31                6      1  
3  2017-12-30                6      1  
4  2017-12-29                6      1  


In [2]:
# Make sure the tokenizer models and the stop-word list are available locally.
for resource in ('punkt_tab', 'punkt', 'stopwords'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Module-level helpers shared by preprocess_text: one lemmatizer instance and
# the English stop words held in a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Every character outside a-z/A-Z is replaced by a space, the result is
    lowercased and tokenized with ``word_tokenize``, tokens that are English
    stop words or have two characters or fewer are dropped, and the remaining
    tokens are passed through ``lemmatizer.lemmatize`` (called without a
    part-of-speech argument).

    Parameters
    ----------
    text : object
        Raw document. Non-string values are converted with ``str``. Missing
        scalars (``None``, ``NaN``, ``pd.NA``) are treated as empty text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``''`` for missing input.
    """
    # A missing cell would otherwise be stringified to "nan" / "None" and
    # survive the filters below as a spurious token, so short-circuit it.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    letters_only = re.sub(r'[^a-zA-Z]', ' ', str(text))  # Remove punctuation/numbers
    tokens = word_tokenize(letters_only.lower())  # Tokenize
    kept = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    ]
    return ' '.join(kept)

# Build the cleaned-text column from the article bodies, then preview the frame
df['clean_text'] = df['text'].map(preprocess_text)
print(df.head())


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dhany/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\dhany/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhany/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                               title  \
0  u.s. budget fight looms, republican flip fisca...   
1  u.s. military accept transgender recruit monda...   
2  senior u.s. republican senator: 'let mr. muell...   
3  fbi russia probe helped australian diplomat ti...   
4  trump want postal service charge 'much more' a...   

                                                text       subject  \
0  washington (reuters) - head conservative repub...  politicsNews   
1  washington (reuters) - transgender people allo...  politicsNews   
2  washington (reuters) - special counsel investi...  politicsNews   
3  washington (reuters) - trump campaign adviser ...  politicsNews   
4  seattle washington (reuters) - president donal...  politicsNews   

         date  subject_encoded  label  \
0  2017-12-31                6      1   
1  2017-12-29                6      1   
2  2017-12-31                6      1   
3  2017-12-30                6      1   
4  2017-12-29                6      1

In [3]:
# Fit TF-IDF over unigrams through trigrams of the cleaned text,
# capping the vocabulary at 1000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])

# Densify the matrix into a DataFrame (one column per n-gram) so it can be
# concatenated with the other feature columns later on.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print(df.head())

                                               title  \
0  u.s. budget fight looms, republican flip fisca...   
1  u.s. military accept transgender recruit monda...   
2  senior u.s. republican senator: 'let mr. muell...   
3  fbi russia probe helped australian diplomat ti...   
4  trump want postal service charge 'much more' a...   

                                                text       subject  \
0  washington (reuters) - head conservative repub...  politicsNews   
1  washington (reuters) - transgender people allo...  politicsNews   
2  washington (reuters) - special counsel investi...  politicsNews   
3  washington (reuters) - trump campaign adviser ...  politicsNews   
4  seattle washington (reuters) - president donal...  politicsNews   

         date  subject_encoded  label  \
0  2017-12-31                6      1   
1  2017-12-29                6      1   
2  2017-12-31                6      1   
3  2017-12-30                6      1   
4  2017-12-29                6      1

In [4]:
# Parse the date column (month-first for ambiguous values) and derive one
# calendar feature column per datetime component.
df['date'] = pd.to_datetime(df['date'], dayfirst=False)
for part in ('day', 'month', 'year', 'weekday'):
    df[part] = getattr(df['date'].dt, part)
print(df.head())

                                               title  \
0  u.s. budget fight looms, republican flip fisca...   
1  u.s. military accept transgender recruit monda...   
2  senior u.s. republican senator: 'let mr. muell...   
3  fbi russia probe helped australian diplomat ti...   
4  trump want postal service charge 'much more' a...   

                                                text       subject       date  \
0  washington (reuters) - head conservative repub...  politicsNews 2017-12-31   
1  washington (reuters) - transgender people allo...  politicsNews 2017-12-29   
2  washington (reuters) - special counsel investi...  politicsNews 2017-12-31   
3  washington (reuters) - trump campaign adviser ...  politicsNews 2017-12-30   
4  seattle washington (reuters) - president donal...  politicsNews 2017-12-29   

   subject_encoded  label                                         clean_text  \
0                6      1  washington reuters head conservative republica...   
1               

In [5]:
# Sentiment score
def get_sentiment(text):
    """Return the TextBlob polarity of ``text`` after converting it with ``str``."""
    blob = TextBlob(str(text))
    return blob.sentiment.polarity

# Score every cleaned document and store the polarity next to it
df['sentiment'] = df['clean_text'].map(get_sentiment)
print(df.head())

                                               title  \
0  u.s. budget fight looms, republican flip fisca...   
1  u.s. military accept transgender recruit monda...   
2  senior u.s. republican senator: 'let mr. muell...   
3  fbi russia probe helped australian diplomat ti...   
4  trump want postal service charge 'much more' a...   

                                                text       subject       date  \
0  washington (reuters) - head conservative repub...  politicsNews 2017-12-31   
1  washington (reuters) - transgender people allo...  politicsNews 2017-12-29   
2  washington (reuters) - special counsel investi...  politicsNews 2017-12-31   
3  washington (reuters) - trump campaign adviser ...  politicsNews 2017-12-30   
4  seattle washington (reuters) - president donal...  politicsNews 2017-12-29   

   subject_encoded  label                                         clean_text  \
0                6      1  washington reuters head conservative republica...   
1               

In [6]:
 # Combine All Features
# Give the metadata columns a fresh 0..n-1 index, then place them side by side
# with the TF-IDF columns (concat aligns rows on the index).
meta_columns = ['day', 'month', 'year', 'weekday', 'sentiment']
meta_features = df[meta_columns].reset_index(drop=True)
final_features = pd.concat([meta_features, tfidf_df], axis='columns')
print(df.head())

                                               title  \
0  u.s. budget fight looms, republican flip fisca...   
1  u.s. military accept transgender recruit monda...   
2  senior u.s. republican senator: 'let mr. muell...   
3  fbi russia probe helped australian diplomat ti...   
4  trump want postal service charge 'much more' a...   

                                                text       subject       date  \
0  washington (reuters) - head conservative repub...  politicsNews 2017-12-31   
1  washington (reuters) - transgender people allo...  politicsNews 2017-12-29   
2  washington (reuters) - special counsel investi...  politicsNews 2017-12-31   
3  washington (reuters) - trump campaign adviser ...  politicsNews 2017-12-30   
4  seattle washington (reuters) - president donal...  politicsNews 2017-12-29   

   subject_encoded  label                                         clean_text  \
0                6      1  washington reuters head conservative republica...   
1               

In [7]:
from gensim.models import Word2Vec

# Train a Word2Vec model on this corpus (a pre-trained one could be loaded instead).
# Each cleaned document is split on whitespace into its token list first.
tokenized = df['clean_text'].map(str.split)
w2v_model = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)


In [8]:
def get_average_vector(tokens, model, vector_size):
    """Average the embeddings of the tokens that ``model.wv`` contains.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    model : object
        Object whose ``wv`` attribute supports ``in`` and ``[]`` lookups by token.
    vector_size : int
        Length of the zero vector returned when no token is found.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the found vectors, or ``np.zeros(vector_size)``
        when none of the tokens is in ``model.wv``.
    """
    known = [tok for tok in tokens if tok in model.wv]
    if not known:
        return np.zeros(vector_size)
    return np.mean([model.wv[tok] for tok in known], axis=0)

# Average the word vectors of each document into one fixed-length vector.
# The fallback size is read from the trained model instead of repeating the
# literal used at training time, so the zero vector for documents with no
# known tokens always matches the real embedding dimensionality.
df['w2v_vector'] = tokenized.apply(
    lambda x: get_average_vector(x, w2v_model, w2v_model.wv.vector_size)
)
print(df.head())

                                               title  \
0  u.s. budget fight looms, republican flip fisca...   
1  u.s. military accept transgender recruit monda...   
2  senior u.s. republican senator: 'let mr. muell...   
3  fbi russia probe helped australian diplomat ti...   
4  trump want postal service charge 'much more' a...   

                                                text       subject       date  \
0  washington (reuters) - head conservative repub...  politicsNews 2017-12-31   
1  washington (reuters) - transgender people allo...  politicsNews 2017-12-29   
2  washington (reuters) - special counsel investi...  politicsNews 2017-12-31   
3  washington (reuters) - trump campaign adviser ...  politicsNews 2017-12-30   
4  seattle washington (reuters) - president donal...  politicsNews 2017-12-29   

   subject_encoded  label                                         clean_text  \
0                6      1  washington reuters head conservative republica...   
1               