In [1]:
# !pip install -r requirement.txt

In [1]:
import gensim.downloader as gen
from gensim.models import KeyedVectors


In [None]:
# Load and save pre-trained Word2Vec model
word2vec_model = gen.load("word2vec-google-news-300")
word2vec_model.save("/home/mukesh/pikachu/classification_nlp/emb_model/word2vec_model.bin")



# Load and save the pre-trained GloVe model
glove_model = gen.load("glove-wiki-gigaword-300")
glove_model.save("/home/mukesh/pikachu/classification_nlp/emb_model/glove_model.bin")


# Load and save the pre-trained FastText model
fasttext_model = gen.load("fasttext-wiki-news-subwords-300")
fasttext_model.save("/home/mukesh/pikachu/classification_nlp/emb_model/fasttext_model.bin")


In [None]:
# Read data
df_pos = open("data/Train.pos", "r", encoding="latin-1").read()
df_neg = open("data/Train.neg", "r", encoding="latin-1").read()

In [None]:
# Create lists for positive and negative sentences
df_pos_list = [i for i in df_pos.split("\n") if len(i) >= 2]
df_neg_list = [i for i in df_neg.split("\n") if len(i) >= 2]


In [None]:
from gensim.models import KeyedVectors
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd

In [None]:
loaded_word2vec_model = KeyedVectors.load("emb_model/word2vec_model.bin")
loaded_glove_model = KeyedVectors.load("emb_model/glove_model.bin")
loaded_fasttext_model = KeyedVectors.load("emb_model/fasttext_model.bin")

In [None]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /home/mukesh/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [None]:
contractions = {
    "it’s": "it is", "it's": "it is", "don't": "do not", "i'm": "i am", "you're": "you are",
    "he's": "he is", "she's": "she is", "we're": "we are", "they're": "they are", "isn't": "is not",
    "aren't": "are not", "wasn't": "was not", "weren't": "were not", "hasn't": "has not",
    "haven't": "have not", "hadn't": "had not", "won't": "will not", "wouldn't": "would not",
    "can't": "cannot", "couldn't": "could not", "shouldn't": "should not", "mustn't": "must not"
}

In [None]:
def expand_contractions(text):
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    return text

In [None]:
def preprocess_text(text):
    text = text.lower()
    text = expand_contractions(text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [re.sub(r'(.)\1{2,}', r'\1', word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/mukesh/nltk_data...


[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Preprocess the text data
df_pos_preprocessed = [preprocess_text(sentence) for sentence in df_pos_list]
df_neg_preprocessed = [preprocess_text(sentence) for sentence in df_neg_list]


In [None]:
# Create a DataFrame for positive and negative data
positive_df = pd.DataFrame({
    'original_text': df_pos_list,
    'processed_text': df_pos_preprocessed,
    'level': 'positive'
})

negative_df = pd.DataFrame({
    'original_text': df_neg_list,
    'processed_text': df_neg_preprocessed,
    'level': 'negative'
})

In [None]:
# Concatenate both DataFrames
final_df = pd.concat([positive_df, negative_df], ignore_index=True)

# Save the DataFrame to a CSV file
final_df.to_csv("processed_data.csv", index=False)



In [None]:
def get_sentence_embedding(sentence, model):
    words = sentence.split()
    word_embeddings = []
    
    for word in words:
        if word in model:
            word_embeddings.append(model[word])
        else:
            word_embeddings.append(np.zeros(model.vector_size))

    # Calculate the mean of the embeddings; if no embeddings, return a zero vector
    if len(word_embeddings) == 0:
        return np.zeros(model.vector_size)
    
    return np.mean(word_embeddings, axis=0)

In [None]:
# Generate embeddings for positive and negative sentences
w2v_pos_embeddings = [get_sentence_embedding(sentence, loaded_word2vec_model) for sentence in df_pos_preprocessed]
w2v_neg_embeddings = [get_sentence_embedding(sentence, loaded_word2vec_model) for sentence in df_neg_preprocessed]

glove_pos_embeddings = [get_sentence_embedding(sentence, loaded_glove_model) for sentence in df_pos_preprocessed]
glove_neg_embeddings = [get_sentence_embedding(sentence, loaded_glove_model) for sentence in df_neg_preprocessed]

fasttext_pos_embeddings = [get_sentence_embedding(sentence, loaded_fasttext_model) for sentence in df_pos_preprocessed]
fasttext_neg_embeddings = [get_sentence_embedding(sentence, loaded_fasttext_model) for sentence in df_neg_preprocessed]


In [None]:
# # Generate embeddings for all sentences in final_df
final_df['w2v_embedding'] = final_df['processed_text'].apply(lambda x: get_sentence_embedding(x, loaded_word2vec_model))
final_df['glove_embedding'] = final_df['processed_text'].apply(lambda x: get_sentence_embedding(x, loaded_glove_model))

final_df['fasttest_embedding'] = final_df['processed_text'].apply(lambda x: get_sentence_embedding(x, loaded_fasttext_model))


# # Save the DataFrame with embeddings to a CSV file
final_df.to_csv("text_embedding.csv", index=False)


In [None]:
final_df.head()

Unnamed: 0,original_text,processed_text,level,w2v_embedding,glove_embedding,fasttest_embedding
0,the rock is destined to be the 21st century's ...,rock destined century new conan going make spl...,positive,"[0.03640926585477941, 0.06906666475183823, -0....","[0.05452945433995303, -0.07681117634124615, -0...","[-0.0015215861653012004, -0.020201347056118882..."
1,"the gorgeously elaborate continuation of "" the...",gorgeously elaborate continuation lord ring tr...,positive,"[-0.049177689985795456, 0.008171775124289772, ...","[-0.1680255799490789, -0.07400813410905274, 0....","[0.003546609088185836, -0.01697812804989305, 0..."
2,effective but too-tepid biopic,effective tootepid biopic,positive,"[0.08540852864583333, -0.07674153645833333, -0...","[0.10935333867867787, -0.15938666959603628, 0....","[0.003958086551089461, -0.03526200043658415, -..."
3,if you sometimes like to go to the movies to h...,sometimes like go movie fun wasabi good place ...,positive,"[0.010335286, -0.0048828125, -0.010218303, 0.1...","[-0.030760799, 0.069041885, 0.09241887, -0.071...","[-0.017284378, -0.017631331, 0.014068676, 0.03..."
4,"emerges as something rare , an issue movie tha...",emerges something rare issue movie thats hones...,positive,"[0.090576171875, 0.02848229041466346, -0.06612...","[-0.002130763, 0.058085773, 0.05994094, -0.108...","[0.005082369, 0.0033200698, 0.039461907, 0.005..."
