# Gensim Word2Vec

## Load Packages and Corpus

In [None]:
#import libraries

from nltk.tokenize import sent_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer
import nltk
import glob
from pathlib import Path
import os
import gensim
import pandas as pd
from gensim.models import Word2Vec
from tqdm import tqdm
import multiprocessing
import re
import string

In [None]:
# Show the directory the notebook is currently running from.
path = os.getcwd()
print(path)

# Change working directory. os.chdir() returns None, so its result must not
# be assigned to `path`; refresh `path` from os.getcwd() after the move.
os.chdir("INSERT PATH")
path = os.getcwd()

# Load the corpus dataframe. Later cells read its 'Text' and 'Title' columns.
df = pd.read_csv('INSERT NAME.csv')

df

## Preprocessing

In [None]:
from nltk import WordNetLemmatizer

wnl = WordNetLemmatizer()


def _lemmatize_words(text):
    """Lemmatize each whitespace-separated word of ``text`` and re-join with single spaces."""
    return ' '.join(wnl.lemmatize(word) for word in text.split())


# The text is lemmatized as it stands: stopwords may have been removed
# beforehand or left in, this step works either way.
df['Text'] = df['Text'].apply(_lemmatize_words)
df

In [None]:
# Word tokenizer shared by the preprocessing function, plus the raw
# document texts and their titles pulled out of the dataframe as lists.

tokenizer = TreebankWordTokenizer()
files_S, titles_S = (df[column].tolist() for column in ('Text', 'Title'))

In [None]:
#tokenizer function

def make_sentences_S(list_text):
    """Turn documents into lower-cased, word-tokenized sentences.

    Each document is lower-cased, split into sentences with
    ``sent_tokenize`` and every sentence is split into word tokens with the
    module-level ``tokenizer``.

    Args:
        list_text: Iterable of document strings.

    Returns:
        A flat list with one entry per sentence (across all documents), each
        entry being the list of token strings for that sentence.
    """
    all_txt_S = []
    for txt in tqdm(list_text, desc="Preprocessing"):
        # No lemmatization happens here: the previous per-character
        # lemmatizer pass (it iterated over a str) produced a list that was
        # never used, so it has been dropped along with the unused counter.
        all_txt_S.extend(
            tokenizer.tokenize(sent) for sent in sent_tokenize(txt.lower())
        )
    return all_txt_S

In [None]:
#preprocessing
# Convert every document in files_S into tokenized sentences; the result is
# the training input passed to Word2Vec below.
sentences_S = make_sentences_S(files_S)

## Run Word2Vec

In [None]:
# Train a skip-gram model (sg=1) with 300-dimensional vectors on the
# tokenized sentences. min_count=2 drops words that occur fewer than twice;
# training uses 5 worker threads.
Subset_model = Word2Vec(
    sentences=sentences_S,
    vector_size=300,
    min_count=2,
    sg=1,
    workers=5,
)

In [None]:
# Persist the trained model under the relative name "Subset_model_real"
# (resolved against the current working directory).
Subset_model.save("Subset_model_real")

## Get Similarities + Analogies

In [None]:
# Five words most similar to "river" in the trained model, with their scores.
Subset_model.wv.most_similar("river", topn=5)

In [None]:
# Five words most similar to "toxic" in the trained model, with their scores.
Subset_model.wv.most_similar("toxic", topn=5)

In [None]:
# Five words most similar to "water" in the trained model, with their scores.
Subset_model.wv.most_similar("water", topn=5)

In [None]:
# Five words most similar to "polluted" in the trained model, with their scores.
Subset_model.wv.most_similar("polluted", topn=5)

In [None]:
# Five words most similar to "catastrophe" in the trained model, with their scores.
Subset_model.wv.most_similar("catastrophe", topn=5)

In [None]:
# Five words most similar to "foam" in the trained model, with their scores.
Subset_model.wv.most_similar("foam", topn=5)

In [None]:
# Five words most similar to "forest" in the trained model, with their scores.
Subset_model.wv.most_similar("forest", topn=5)

In [None]:
# Similarity score between the word vectors for 'water' and 'toxic'.
Subset_model.wv.similarity('water','toxic')

In [None]:
# Similarity score between the word vectors for 'water' and 'sewer'.
Subset_model.wv.similarity('water','sewer')

In [None]:
# Similarity score between the word vectors for 'water' and 'oxygen'.
Subset_model.wv.similarity('water','oxygen')

In [None]:
# Similarity score between the word vectors for 'water' and 'eastward'.
Subset_model.wv.similarity('water','eastward')

In [None]:
# Similarity score between the word vectors for 'river' and 'sewer'.
Subset_model.wv.similarity('river','sewer')

In [None]:
# Similarity score between the word vectors for 'creek' and 'sewer'.
Subset_model.wv.similarity('creek','sewer')

In [None]:
# Similarity score between the word vectors for 'creek' and 'toxic'.
Subset_model.wv.similarity('creek', 'toxic')

In [None]:
# Similarity score between the word vectors for 'stream' and 'toxic'.
Subset_model.wv.similarity('stream', 'toxic')

In [None]:
# Similarity score between the word vectors for 'river' and 'toxic'.
Subset_model.wv.similarity('river', 'toxic')

In [None]:
# Similarity score between the word vectors for 'air' and 'toxic'.
Subset_model.wv.similarity('air', 'toxic')

In [None]:
# Similarity score between the word vectors for 'polluted' and 'toxic'.
Subset_model.wv.similarity('polluted', 'toxic')

In [None]:
# Similarity score between the word vectors for 'forest' and 'toxic'.
Subset_model.wv.similarity('forest', 'toxic')

In [None]:
# Similarity score between 'river' and 'sewer'.
# NOTE(review): this repeats the identical river/sewer query made a few cells earlier.
Subset_model.wv.similarity('river','sewer')

In [None]:
# Similarity score between the word vectors for 'ocean' and 'sewer'.
Subset_model.wv.similarity('ocean','sewer')

In [None]:
# Analogy query: 'river' + 'canal' - 'ocean'; print the top-ranked word and its score.
result = Subset_model.wv.most_similar(positive=['river', 'canal'], negative=['ocean'])
best_word, best_score = result[0]
print(f"{best_word}: {best_score:.4f}")

In [None]:
# Analogy query: 'river' + 'foam' - 'air'; print the top-ranked word and its score.
result = Subset_model.wv.most_similar(positive=['river', 'foam'], negative=['air'])
best_word, best_score = result[0]
print(f"{best_word}: {best_score:.4f}")

In [None]:
# Analogy query: 'river' + 'sewage' - 'ocean'; print the top-ranked word and its score.
result = Subset_model.wv.most_similar(positive=['river', 'sewage'], negative=['ocean'])
best_word, best_score = result[0]
print(f"{best_word}: {best_score:.4f}")

In [None]:
# Analogy query: 'water' + 'toxic' - 'air'; print the top-ranked word and its score.
result = Subset_model.wv.most_similar(positive=['water', 'toxic'], negative=['air'])
best_word, best_score = result[0]
print(f"{best_word}: {best_score:.4f}")

In [None]:
# Analogy query: 'river' + 'flood' - 'air'; print the top-ranked word and its score.
result = Subset_model.wv.most_similar(positive=['river', 'flood'], negative=['air'])
best_word, best_score = result[0]
print(f"{best_word}: {best_score:.4f}")

In [None]:
# Analogy query: 'creek' + 'sewer' - 'ocean'; print the top-ranked word and its score.
result = Subset_model.wv.most_similar(positive=['creek', 'sewer'], negative=['ocean'])
best_word, best_score = result[0]
print(f"{best_word}: {best_score:.4f}")

In [None]:
# Analogy query: 'river' + 'polluted' - 'ocean'; print the top-ranked word and its score.
result = Subset_model.wv.most_similar(positive=['river', 'polluted'], negative=['ocean'])
best_word, best_score = result[0]
print(f"{best_word}: {best_score:.4f}")

In [None]:
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt

In [None]:
#Setup for plot

base_words = ['toxic', 'pollution', 'river', 'water']


def _top5_neighbors(word):
    """Return the five words most similar to ``word``, without their scores."""
    ranked = Subset_model.wv.most_similar(positive=word, topn=5)
    return [neighbor for neighbor, _score in ranked]


# One neighbour list per base word; the plotting cell colours points by
# membership in these lists.
toxic_similar_S = _top5_neighbors('toxic')
pollution_similar_S = _top5_neighbors('pollution')
river_similar_S = _top5_neighbors('river')
water_similar_S = _top5_neighbors('water')


In [None]:
# Flatten the base words and their four neighbour lists into one 1-D array
# of words (duplicates are kept) and show it.
word_groups = [
    base_words,
    toxic_similar_S,
    pollution_similar_S,
    river_similar_S,
    water_similar_S,
]
all_words_S = np.hstack(word_groups)
print(all_words_S)

In [None]:
# For each base word, collect its 15 most similar words and export them as a
# table with one column per base word.
base_words = ['toxic', 'pollution', 'river', 'water', 'sewer', 'sewage', 'cleanse', 'clean', 'dirty', 'foam', 'infect', 'ocean', 'sea']

# One list of (word, score) pairs per base word, in base_words order.
similarities = [
    Subset_model.wv.most_similar(word, topn=15) for word in base_words
]

# Rows of the raw frame are base words; transpose so each base word becomes
# a column and each row is a rank position.
similar_w_df = pd.DataFrame(similarities).T
similar_w_df.columns = base_words
similar_w_df.to_csv('subset_each_word_similarities.csv')

In [None]:
# Score every ordered pair of base words (including each word with itself)
# and collect the results in a long-format table.
base_words = ['toxic', 'pollution', 'river', 'water', 'sewer', 'sewage', 'cleanse', 'clean', 'dirty', 'foam', 'infect', 'ocean', 'sea']

similarities = [
    (word, other_word, Subset_model.wv.similarity(word, other_word))
    for word in base_words
    for other_word in base_words
]

# Create a DataFrame
# NOTE(review): this rebinds `df`, which previously held the corpus dataframe.
df = pd.DataFrame(similarities, columns=['Word 1', 'Word 2', 'Similarity'])
df

In [None]:
# Export the table currently held in `df` to CSV.
# NOTE(review): this placeholder is the same as the input CSV placeholder read
# at the top of the notebook; choose a different file name here so the source
# corpus file is not overwritten.
df.to_csv('INSERT NAME.csv')

## Plot Similar Words

In [None]:
#plot with color coding
# Project the selected word vectors to 2-D with t-SNE and draw one labelled,
# colour-coded point per entry of all_words_S.

labels = list(all_words_S)
tokens = Subset_model.wv[labels]
tokens = tokens.astype(float)

# NOTE: no random_state is set, so the layout differs from run to run.
tsne_model = TSNE(init='pca', learning_rate='auto', perplexity=15)
new_values = tsne_model.fit_transform(tokens)

# First / second column of the projection are the x / y plot coordinates.
x = [value[0] for value in new_values]
y = [value[1] for value in new_values]

# Iterate by position instead of looking words up with labels.index(): the
# lookup was a linear scan per word and always resolved a repeated word to
# its first occurrence, so later duplicate points were never drawn.
for i, word in enumerate(labels):
    plt.annotate(word,
                 xy=(x[i], y[i]),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')

    # Groups are tested in this fixed order, so a word that belongs to
    # several groups takes the first matching colour.
    if word in toxic_similar_S:
        color = 'gold'
    elif word == 'toxic':
        color = 'goldenrod'
    elif word in pollution_similar_S:
        color = 'indianred'
    elif word == 'pollution':
        color = 'brown'
    elif word in river_similar_S:
        color = 'silver'
    elif word == 'river':
        color = 'grey'
    elif word in water_similar_S:
        color = 'lightblue'
    elif word == 'water':
        color = 'royalblue'
    else:
        # Words outside every group are labelled but get no marker.
        color = None

    if color is not None:
        plt.scatter(x[i], y[i], color=color)

# The title only needs to be set once, after all points are drawn.
plt.title("Word2Vec Embeddings from Water Novel Subset", fontweight='bold' )