# Libraries

In [None]:
# Import the necessary libraries and packages
import pandas as pd  # For data manipulation with DataFrames
!pip install transformers  # Install the 'transformers' library
import nltk  # Natural Language Toolkit for text processing
import time  # For time-related operations
import timeit  # For timing the execution of code
nltk.download('punkt')  # Download the 'punkt' dataset used by NLTK
import gensim  # Library for topic modeling and document similarity
import os  # Provides functions for interacting with the operating system
import torch  # PyTorch for deep learning
import warnings  # To handle warnings in the code

import matplotlib.pyplot as plt  # For creating plots and visualizations
import numpy as np  # NumPy for numerical operations
from sklearn.decomposition import PCA  # Principal Component Analysis
warnings.filterwarnings('ignore')  # Suppress warnings during execution
from tqdm import tqdm  # For progress bars during loops
from sklearn.feature_extraction.text import TfidfVectorizer  # For TF-IDF vectorization
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel  # For BERT-based NLP tasks

# Initialize a BERT tokenizer
# NOTE(review): `tokenizer` is rebound to the sentence-transformers MiniLM
# tokenizer further down in this cell, before any function reads it, so the
# bert-base-uncased tokenizer loaded here is not the one BertAverage ends up using.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a function for mean pooling
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model output whose element 0 holds the token
            embeddings.
        attention_mask: Mask with one entry per token; positions set to 0 do
            not contribute to the average.

    Returns:
        Tensor with one pooled embedding per sequence.
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding dimension so it can weight each token
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(1)
    # Clamp the token count so a fully masked sequence does not divide by zero
    token_count = mask.sum(1).clamp(min=1e-9)
    return masked_total / token_count

# Load the sentence-transformers MiniLM checkpoint: first its tokenizer, then
# its encoder model. Both are read as module-level globals by BertAverage.
_MINILM_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(_MINILM_CHECKPOINT)
model = AutoModel.from_pretrained(_MINILM_CHECKPOINT)

# Import specific functions and classes from gensim and other libraries
from gensim.models.doc2vec import Doc2Vec, TaggedDocument  # For Doc2Vec model
from nltk.tokenize import word_tokenize  # Tokenization using NLTK
from scipy.spatial.distance import cdist  # For computing distance between data points

# Initialize two empty dictionaries for time measurements
TimeDict1, TimeDict2 = {}, {}

# Label for the compute environment this run is executed on
ComputingCase = 'ColabCPU'



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data

In [None]:
# Define the data file and the directory that contains it
DataFile = r'/content/drive/MyDrive/Data/Course/CleanedData.xlsx'

# Directory of the data file; the cells below join their CSV file names onto
# `Path`, so it has to be defined before they run
Path = os.path.dirname(DataFile)

data = pd.read_excel(DataFile)

# Sample 500 rows with a rating of 1.0 (Bad) and 500 rows with a rating of 5.0 (Good)
# (sample() raises if a class has fewer than 500 rows, and is not seeded,
# so every run draws a different sample)
Data1KBad = data[data['Rating'] == 1.0].sample(n=500)
Data1KGood = data[data['Rating'] == 5.0].sample(n=500)

# Concatenate the two samples into 1K rows with a fresh 0..999 index;
# drop=True because the old index is not needed as a column
Data1K = pd.concat([Data1KBad, Data1KGood], axis=0).reset_index(drop=True)

# Map the 'Rating' values to 'Type' (Bad or Good) and select relevant columns
Data1K['Type'] = Data1K['Rating'].map({1.0: 'Bad', 5.0: 'Good'})
Data1K = Data1K[['CleanedText', 'Type']]


# Feature Extraction Algorithms

##  W2V Avg

In [None]:
def W2VAverage(Data, Feature, VectorSize=100):
    """Represent each text as the mean of its Word2Vec word vectors.

    A Word2Vec model is trained on the whitespace-tokenized ``Feature`` column,
    and every row is embedded as the average vector of its in-vocabulary
    words. Rows with no in-vocabulary word are left out of the result. The
    input ``Data`` is not modified.

    Args:
        Data: DataFrame holding the text column ``Feature`` and a 'Type' column.
        Feature: Name of the text column.
        VectorSize: Dimensionality of the Word2Vec vectors.

    Returns:
        DataFrame with columns 'C1'..'C{VectorSize}' plus 'Type', indexed by
        the index labels of the rows of ``Data`` that were kept.
    """
    texts = Data[Feature].tolist()

    # Train on whitespace tokens, exactly as the text appears in the column
    corpus = [str(text).split() for text in texts]
    w2v_model = gensim.models.Word2Vec(corpus, vector_size=VectorSize)
    word_vectors = w2v_model.wv
    vocabulary = word_vectors.key_to_index

    averaged_rows = []
    kept = np.zeros(len(texts), dtype=bool)
    for position, text in enumerate(texts):
        # Lookup uses the lower-cased text, so only vocabulary entries that
        # are themselves lower case can match
        known_words = [word for word in str(text).lower().split() if word in vocabulary]
        if known_words:
            averaged_rows.append(
                np.mean(word_vectors[known_words], axis=0, dtype=np.float64)
            )
            kept[position] = True

    # reshape keeps the (rows, VectorSize) layout even when no row was kept
    matrix = np.asarray(averaged_rows, dtype=np.float64).reshape(-1, VectorSize)
    columns = [f'C{i}' for i in range(1, VectorSize + 1)]
    ThisData = pd.DataFrame(matrix, index=Data.index[kept], columns=columns)

    # Labels are taken by position so they stay attached to the kept rows
    ThisData['Type'] = Data['Type'].to_numpy()[kept]

    return ThisData


##  BertAverage

In [None]:
def BertAverage(Data, Feature, BatchSize=64):
    """Embed each text with the module-level transformer model and mean pooling.

    Sentences are tokenized with the module-level ``tokenizer`` (truncated to
    128 tokens), passed through the module-level ``model`` without gradient
    tracking, and pooled with ``mean_pooling``. The input ``Data`` is not
    modified.

    Args:
        Data: DataFrame holding the text column ``Feature`` and a 'Type' column.
        Feature: Name of the text column.
        BatchSize: Number of sentences encoded per forward pass, so memory use
            does not grow with the size of ``Data``.

    Returns:
        DataFrame with one row per input row (index 0..n-1), one column per
        embedding dimension, plus a 'Type' column.
    """
    sentences = Data[Feature].tolist()

    pooled_batches = []
    for start in tqdm(range(0, len(sentences), BatchSize)):
        batch = sentences[start:start + BatchSize]

        # Padding is per batch; the attention mask keeps padded positions out of the mean
        encoded_input = tokenizer(batch, padding=True, truncation=True, max_length=128, return_tensors='pt')

        with torch.no_grad():
            model_output = model(**encoded_input)

        pooled_batches.append(mean_pooling(model_output, encoded_input['attention_mask']))

    if pooled_batches:
        # Convert to float64 on the CPU before handing the values to pandas
        DF = pd.DataFrame(torch.cat(pooled_batches).double().cpu().numpy())
    else:
        DF = pd.DataFrame()

    # Assign labels by position: DF has a fresh 0..n-1 index, so aligning on
    # Data's own index labels would mislabel rows whenever that index has gaps
    DF['Type'] = Data['Type'].to_numpy()

    return DF


## Doc2Vec

In [None]:
def ApplyDoc2Vec(Data, Feature, VectorSize=300):
    """Embed each text with a Doc2Vec model trained on the same texts.

    Every non-missing entry of the ``Feature`` column is tokenized with
    ``word_tokenize`` and tagged with its index label; a Doc2Vec model is
    trained on these documents and the learned document vector of each row is
    returned. The input ``Data`` is not modified.

    Args:
        Data: DataFrame holding the text column ``Feature`` and a 'Type' column.
        Feature: Name of the text column.
        VectorSize: Dimensionality of the document vectors.

    Returns:
        DataFrame indexed by the index labels of the rows with non-missing
        text, with one column per vector dimension plus a 'Type' column.
    """
    # Replace double newlines with a space; regex=False makes it a literal replacement
    reviews = Data[Feature].str.replace('\n\n', ' ', regex=False)

    # Rows with missing text cannot be tokenized and are left out
    reviews = reviews[reviews.notna()]

    # Tag every tokenized review with its index label so its vector can be looked up by it
    documents = [
        TaggedDocument(word_tokenize(text), [label])
        for label, text in zip(reviews.index, reviews.values)
    ]

    # Train a Doc2Vec model with the specified vector size and other settings
    doc_model = Doc2Vec(documents, vector_size=VectorSize, window=2, min_count=1, workers=4)

    # `dv` is the gensim 4 name of the document-vector store
    document_dict = {label: doc_model.dv[label] for label in tqdm(reviews.index)}

    # One row per document; 'Type' aligns on the shared index labels
    document_embeddings_df = pd.DataFrame.from_dict(document_dict, orient="index")
    document_embeddings_df['Type'] = Data['Type']

    return document_embeddings_df


# Applying Functions 1K

In [None]:
# Build the averaged Word2Vec features (100 dimensions) for the 1K sample and export them to CSV
# NOTE(review): `Path` must hold the output directory before this cell runs
W2VAverageData = W2VAverage(Data1K, 'CleanedText', VectorSize=100)
W2VAverageData.to_csv(os.path.join(Path, 'W2VAverageData1K.csv'))


Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C92,C93,C94,C95,C96,C97,C98,C99,C100,Type
0,-0.014706,0.340658,0.134642,0.063955,0.107437,-0.486861,0.211813,0.737262,-0.320579,-0.211589,...,0.243889,0.024260,0.064195,0.612908,0.220014,0.176887,-0.293265,0.038743,-0.082845,Bad
1,-0.015264,0.326326,0.132390,0.064643,0.102685,-0.470696,0.202899,0.711801,-0.310265,-0.203703,...,0.234421,0.022444,0.061952,0.588161,0.213504,0.168942,-0.282046,0.038781,-0.079289,Bad
2,-0.013040,0.319478,0.127080,0.064563,0.100209,-0.459530,0.199708,0.695114,-0.303378,-0.200887,...,0.229788,0.021207,0.062410,0.575461,0.206301,0.166324,-0.277362,0.036952,-0.076428,Bad
3,-0.014412,0.347296,0.138749,0.067378,0.108677,-0.499846,0.216719,0.755431,-0.327008,-0.215911,...,0.250434,0.024942,0.067942,0.629912,0.226655,0.178105,-0.300612,0.038868,-0.085738,Bad
4,-0.014646,0.314694,0.122119,0.070703,0.100178,-0.458022,0.194771,0.693883,-0.309056,-0.203761,...,0.232060,0.018927,0.062076,0.570779,0.204737,0.165288,-0.277762,0.039327,-0.075446,Bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.016646,0.357144,0.140280,0.067804,0.115589,-0.505953,0.228015,0.775081,-0.337242,-0.223258,...,0.260756,0.025193,0.064703,0.642173,0.234257,0.191202,-0.307064,0.043862,-0.089959,Good
996,-0.015743,0.335438,0.129667,0.083692,0.108040,-0.485720,0.212388,0.738836,-0.328339,-0.221779,...,0.252835,0.014789,0.063901,0.606522,0.219111,0.177483,-0.292699,0.035809,-0.079301,Good
997,-0.015312,0.321449,0.128512,0.063915,0.101560,-0.463704,0.200877,0.706595,-0.310619,-0.204626,...,0.236046,0.019709,0.061576,0.581230,0.209680,0.166188,-0.282652,0.042563,-0.079812,Good
998,-0.009948,0.331822,0.131098,0.066555,0.102383,-0.478413,0.208269,0.723804,-0.312559,-0.206859,...,0.237958,0.020675,0.066010,0.601055,0.215669,0.168131,-0.287858,0.037143,-0.080295,Good


In [None]:

# Build the mean-pooled transformer embeddings for the 1K sample and export them to CSV
# NOTE(review): `Path` must hold the output directory before this cell runs
BertAverageData = BertAverage(Data1K, 'CleanedText')

BertAverageData.to_csv(os.path.join(Path, 'BertAverageData1K.csv'))


999it [00:01, 765.06it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,Type
0,-0.247017,-0.008449,0.107657,-0.259377,-0.070890,0.022557,0.103397,-0.075634,0.032042,-0.007544,...,-0.058549,0.033008,-0.050552,-0.002913,0.220632,-0.057814,-0.060231,-0.020764,-0.076219,Bad
1,-0.029988,-0.010109,0.257552,-0.085149,0.010593,-0.022310,-0.156227,-0.084031,0.059994,0.039692,...,-0.286835,0.143673,0.028904,0.032941,0.124562,-0.116027,-0.267445,-0.013849,-0.026657,Bad
2,-0.034670,-0.005444,-0.039189,-0.047522,-0.099169,-0.169091,-0.102180,-0.196749,-0.000533,-0.093578,...,-0.193659,-0.071473,-0.066204,-0.185926,0.272739,-0.005388,-0.191420,0.241676,-0.025772,Bad
3,-0.015449,-0.047247,0.129182,-0.080443,-0.072609,-0.065152,0.017761,-0.142713,-0.003891,0.000608,...,-0.179510,0.112446,-0.086104,0.113086,0.170542,-0.137085,-0.052040,-0.151035,-0.034254,Bad
4,-0.095413,-0.180064,0.314022,0.002391,-0.112965,-0.075880,-0.288371,-0.258927,0.098612,-0.220020,...,-0.576275,0.170914,-0.288876,-0.008172,0.350523,0.042718,-0.047200,0.044999,-0.010483,Bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,0.020793,0.221476,0.250533,-0.141496,0.039716,0.250921,-0.186287,0.168591,-0.173220,-0.035386,...,-0.033128,-0.170019,0.235027,0.153493,0.075505,0.056420,0.644378,0.138996,0.052638,Good
995,-0.432128,0.383811,-0.172395,-0.311446,-0.096279,0.064920,0.102175,-0.089879,0.068946,-0.218425,...,-0.431361,0.035483,-0.197614,-0.156438,0.594792,-0.080822,0.570297,-0.236011,0.313380,Good
996,-0.232823,-0.012066,-0.160512,-0.055228,0.054974,-0.029214,0.240235,-0.041870,-0.006769,-0.140899,...,-0.106518,0.034299,-0.004359,-0.109644,0.263062,0.020581,-0.036294,0.053189,0.165252,Good
997,0.118029,-0.067469,0.181741,0.033509,-0.106678,0.011951,-0.038538,-0.080969,-0.084312,0.079091,...,-0.246663,0.017887,-0.049701,-0.177710,0.168179,-0.080370,-0.047998,-0.020462,-0.122158,Good


In [None]:

# Build the Doc2Vec embeddings (300 dimensions) for the 1K sample and export them to CSV
# NOTE(review): `Path` must hold the output directory before this cell runs
Doc2VecData = ApplyDoc2Vec(Data1K, 'CleanedText', VectorSize=300)


Doc2VecData.to_csv(os.path.join(Path, 'Doc2VecData1K.csv'))


999it [00:00, 7990.43it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,Type
0,-0.006873,0.027155,-0.057789,0.095979,-0.031509,-0.049758,-0.114121,0.209580,-0.003984,-0.102518,...,0.098666,0.036480,0.059230,0.132003,0.111079,0.050373,-0.166996,0.099195,-0.058080,Bad
1,-0.009364,0.064231,-0.054720,0.102755,-0.025679,-0.068803,-0.107343,0.230755,-0.003812,-0.108294,...,0.126393,0.045500,0.062547,0.149992,0.118965,0.069606,-0.166033,0.102281,-0.041757,Bad
2,-0.010820,0.031428,-0.038130,0.060378,-0.018885,-0.037308,-0.069391,0.142363,-0.003214,-0.066275,...,0.084283,0.031121,0.045688,0.101673,0.074281,0.043255,-0.103921,0.065057,-0.035022,Bad
3,-0.003603,0.018985,-0.031257,0.048448,-0.015862,-0.025146,-0.054198,0.111139,-0.001018,-0.057562,...,0.058732,0.020762,0.030884,0.067995,0.056008,0.034069,-0.087798,0.049634,-0.031283,Bad
4,-0.005321,0.020982,-0.022075,0.030630,-0.007885,-0.024696,-0.032522,0.068362,0.000231,-0.035252,...,0.048559,0.017435,0.020476,0.053044,0.040071,0.021223,-0.049348,0.030582,-0.018303,Bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.000599,0.008594,-0.013102,0.022012,-0.006517,-0.016873,-0.028896,0.057750,-0.000100,-0.027197,...,0.027830,0.013944,0.013934,0.032617,0.025947,0.019334,-0.041665,0.020057,-0.016851,Good
996,-0.001648,0.004850,-0.004960,0.005539,-0.000983,-0.000383,-0.006306,0.007064,0.000247,-0.002351,...,0.007868,0.001906,0.008161,0.010934,0.007126,0.006375,-0.004475,0.005226,-0.001331,Good
997,-0.006133,0.028965,-0.034966,0.062885,-0.013004,-0.040155,-0.068520,0.141843,-0.001800,-0.064339,...,0.085063,0.034091,0.046572,0.101564,0.070772,0.043521,-0.113201,0.065109,-0.034691,Good
998,-0.013674,0.044633,-0.063397,0.124355,-0.031109,-0.058264,-0.127788,0.267903,-0.007283,-0.125796,...,0.140474,0.053493,0.074687,0.168985,0.145085,0.078580,-0.197198,0.123946,-0.064167,Good
