# Text Analytics Assignment

# Bag of Words


### Importing Libraries

In [49]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

### Loading data and Preprocessing

In [50]:
# Load the CSV file and preprocess the headlines
data = pd.read_csv("news.csv")

# Set up the Porter stemmer
stemmer = PorterStemmer()

# Preprocess the headlines
preprocessed_headlines = []
for headline in data['headline']:
    # Tokenize the headline
    tokens = word_tokenize(headline.lower())

    # Remove stop words and stem the tokens
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # Add the preprocessed headline to the list
    preprocessed_headlines.append(" ".join(tokens))



### Training Model and Results

In [59]:
# Create the bag-of-words representation
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(preprocessed_headlines)

In [60]:
def preprocess_bow(given_headline):
    tokens = word_tokenize(given_headline.lower())
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    preprocessed_given_headline = " ".join(tokens)
    given_headline_bow = vectorizer.transform([preprocessed_given_headline])
    return given_headline_bow

In [61]:
input1 = preprocess_bow("Ariana Grande Drops Wise Feminist Truths On")

cosine_similarities = cosine_similarity(input1, bow_matrix)[0]
most_similar_headline_indices = cosine_similarities.argsort()[:-6:-1]

print("Top 5 similar headlines (bag-of-words with stemming):")
for index in most_similar_headline_indices:
    print(data.iloc[index]['headline'])

Top 5 similar headlines (bag-of-words with stemming):
Ariana Grande Drops Wise Feminist Truths On Twitter
Britney Is Thoroughly Unimpressed With Ariana Grande's Impression
Ariana Grande Heads To Baltimore For NBC's 'Hairspray Live'
Ariana Grande Takes A Tumble At The Billboard Music Awards
Ariana Grande Issues 'Donut Fiasco' Apology Video That Doesn't Explain Donut-Licking


# Word2Vec

### Importing Libraries loading Data

In [25]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

# Load the CSV file and preprocess the headlines
data = pd.read_csv("news.csv")



### Preprocessing

In [26]:
# Preprocess the headlines
preprocessed_headlines = []
for headline in data['headline']:
    # Tokenize the headline
    tokens = word_tokenize(headline.lower())

    # Remove stop words
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Add the preprocessed headline to the list
    preprocessed_headlines.append(tokens)


### Model Training

In [27]:
# Train the Word2Vec model
model = Word2Vec(preprocessed_headlines, vector_size=100, window=5, min_count=1, workers=4)

# Find the top 5 most similar headlines to a given headline using the Word2Vec approach and Cosine Similarity
given_headline = "Ariana Grande Drops Wise Feminist"
given_headline_tokens = word_tokenize(given_headline.lower())
given_headline_tokens = [token for token in given_headline_tokens if token not in stop_words]
given_headline_embedding = model.wv[given_headline_tokens]

similar_headlines = []
for i, headline in enumerate(preprocessed_headlines):
    headline_embedding = model.wv[headline]
    similarity = cosine_similarity(given_headline_embedding, headline_embedding)
    similar_headlines.append((i, similarity[0][0]))

# Sort the similar headlines by similarity score and take the top 5
similar_headlines.sort(key=lambda x: x[1], reverse=True)
top_5_headlines = similar_headlines[1:6]

print("Top 5 similar headlines (Word2Vec with cosine similarity):")
for headline in top_5_headlines:
    print(data.iloc[headline[0]]['headline'])

Top 5 similar headlines (Word2Vec with cosine similarity):
Ariana Grande Takes A Tumble At The Billboard Music Awards
Ariana Grande Issues 'Donut Fiasco' Apology Video That Doesn't Explain Donut-Licking
Ariana Grande Heads To Baltimore For NBC's 'Hairspray Live'
Princess Diana Challenged Postpartum Depression Stigma Over 20 Years Ago
Creative Community Taking Women's March to Sundance


# GloVe

### Importing Libraries

In [30]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


### Loading Data and preprpcessing

In [31]:
# Load the CSV file and preprocess the headlines
data = pd.read_csv("news.csv")

# Preprocess the headlines
preprocessed_headlines = []
for headline in data['headline']:
    # Tokenize the headline
    tokens = word_tokenize(headline.lower())

    # Remove stop words
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Add the preprocessed headline to the list
    preprocessed_headlines.append(tokens)

# Load the pre-trained GloVe word embeddings
glove_path = "glove.6B.50d.txt"
glove_embeddings = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        embedding = np.array([float(val) for val in values[1:]])
        glove_embeddings[word] = embedding

# Compute the embeddings for each headline
embedding_size = len(glove_embeddings['the'])
headline_embeddings = []
for headline in preprocessed_headlines:
    headline_embedding = np.zeros(embedding_size)
    num_tokens = 0
    for token in headline:
        if token in glove_embeddings:
            headline_embedding += glove_embeddings[token]
            num_tokens += 1
    if num_tokens > 0:
        headline_embedding /= num_tokens
    headline_embeddings.append(headline_embedding)



### Results

In [32]:
# Find the top 5 most similar headlines to a given headline using GloVe embeddings and Cosine Similarity
given_headline = "Ariana Grande Drops Wise Feminist Truths On"
given_headline_tokens = word_tokenize(given_headline.lower())
given_headline_tokens = [token for token in given_headline_tokens if token not in stop_words]
given_headline_embedding = np.zeros(embedding_size)
num_tokens = 0
for token in given_headline_tokens:
    if token in glove_embeddings:
        given_headline_embedding += glove_embeddings[token]
        num_tokens += 1
if num_tokens > 0:
    given_headline_embedding /= num_tokens

similar_headlines = []
for i, headline_embedding in enumerate(headline_embeddings):
    if np.any(headline_embedding):
        similarity = cosine_similarity([given_headline_embedding], [headline_embedding])
        similar_headlines.append((i, similarity[0][0]))

# Sort the similar headlines by similarity score and take the top 5
similar_headlines.sort(key=lambda x: x[1], reverse=True)
top_5_headlines = similar_headlines[1:6]

print("Top 5 similar headlines (GloVe with cosine similarity):")
for headline in top_5_headlines:
    print(data.iloc[headline[0]]['headline'])
    #print(headline[1])

Top 5 similar headlines (GloVe with cosine similarity):
Is My Hairstyle a Feminist Statement?
Britney Is Thoroughly Unimpressed With Ariana Grande's Impression
Ariana Grande Issues 'Donut Fiasco' Apology Video That Doesn't Explain Donut-Licking
This 'La La Land'-esque YouTube Love Story Has Us All Choked Up
The Writing Life: Elections and Intuition


# SVD & LSA

### Importing Libararies

In [41]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Loading Data and Preprocessing

In [44]:
# Load data
df = pd.read_csv('news.csv')

# Preprocess data
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

df['headline_clean'] = df['headline'].apply(preprocess)



### Training the model

In [45]:
# Create document-term matrix
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(df['headline_clean'])

# Apply SVD
svd = TruncatedSVD(n_components=300)
lsa = make_pipeline(svd, Normalizer(copy=False))
dtm_lsa = lsa.fit_transform(dtm)

# Find similar headlines
def find_similar_headlines(query, num_results=5):
    query_dtm = vectorizer.transform([preprocess(query)])
    query_lsa = lsa.transform(query_dtm)
    cosine_similarities = np.dot(query_lsa, dtm_lsa.T)
    similar_indices = cosine_similarities.argsort()[0][-num_results-1:-1][::-1]
    return df.iloc[similar_indices]['headline'].values


### Results

In [46]:
# Test function
input_headline = "Ariana Grande Drops Wise Feminist"
similar_headlines = find_similar_headlines(input_headline)
print("Similar headlines:\n", similar_headlines)


Similar headlines:
 ['Ariana Grande Drops Wise Feminist Truths On Twitter'
 'Is My Hairstyle a Feminist Statement?'
 "Ariana Grande Heads To Baltimore For NBC's 'Hairspray Live'"
 'Ariana Grande Takes A Tumble At The Billboard Music Awards'
 "Ariana Grande Issues 'Donut Fiasco' Apology Video That Doesn't Explain Donut-Licking"]


# Streamlit