In [1]:
import os

import fasttext
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.signal
from nltk.tokenize import word_tokenize
from numba import jit, prange
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from vmdpy import VMD

# Switch the kernel's working directory to the sibling ``data`` folder in one
# call: with two separate hops, a missing ``data`` directory would leave the
# kernel stranded in the parent directory.
# NOTE: the path is relative, so re-running this cell moves the kernel again.
os.chdir(os.path.join('..', 'data'))

In [2]:
def format_text(df,col):
    """Return a copy of ``df`` with the text column ``col`` normalised.

    The cleaning steps, applied in order, are:

    1. remove ``@``-prefixed tokens (pattern ``(@\\w*)``),
    2. remove every character that is not an ASCII letter or a space,
    3. strip leading/trailing whitespace,
    4. convert to lowercase.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which only ``col`` has been changed.
    """
    comp_df = df.copy()
    text = comp_df[col]

    # Remove @mentions. ``regex=True`` is required: since pandas 2.0 the
    # default is a literal (non-regex) replacement, which would silently
    # leave the text untouched.
    text = text.str.replace(r'(@\w*)', '', regex=True)

    # Remove everything that is not an ASCII letter or a space
    # (digits, punctuation, accented characters, ...).
    text = text.str.replace(r"[^a-zA-Z ]", "", regex=True)

    # Trim surrounding whitespace, then lowercase.
    comp_df[col] = text.str.strip().str.lower()
    return comp_df

def energy(u):
    """Estimate the power of signal ``u`` from its Welch power spectral density.

    The PSD is computed with ``scipy.signal.welch`` using its default
    settings, then summed over all frequency bins and multiplied by the bin
    spacing (a rectangle-rule integral over the returned frequency range).
    """
    freqs, psd = scipy.signal.welch(u)
    # The frequency grid is taken to be uniform, so the first spacing is used
    # as the width of every bin.
    bin_width = freqs[1] - freqs[0]
    return np.sum(psd) * bin_width

def maxvdm(f):
    """Decompose ``f`` with VMD and return the component with the most energy.

    ``VMD`` is called with fixed settings ``alpha=2, tau=0, K=3, DC=0,
    init=1, tol=1e-9`` (names follow the local variables of the vmdpy call;
    see the vmdpy documentation for their exact meaning). Each entry of the
    first returned array is scored with ``energy`` and the highest-scoring
    entry is returned.
    """
    # Positional order expected by VMD: alpha, tau, K, DC, init, tol.
    vmd_settings = (2, 0, 3, 0, 1, 1e-9)
    modes, _, _ = VMD(f, *vmd_settings)
    mode_energies = [energy(mode) for mode in modes]
    return modes[np.argmax(mode_energies)]


In [3]:
def tune(f,alpha,K):
    """Run VMD on ``f`` with the given ``alpha`` and ``K`` and return the
    highest-energy component.

    The remaining VMD arguments are fixed at ``tau=0, DC=0, init=1,
    tol=1e-9``. Each entry of the first array returned by ``VMD`` is scored
    with ``energy``; the entry with the largest score is returned.
    """
    # Positional order expected by VMD: alpha, tau, K, DC, init, tol.
    modes, _, _ = VMD(f, alpha, 0, K, 0, 1, 1e-9)
    mode_energies = [energy(mode) for mode in modes]
    return modes[np.argmax(mode_energies)]

def tuning(features, labels=None, random_state=42):
    """Grid-search the VMD parameters ``K`` and ``alpha`` (each in 2..6).

    For every (K, alpha) pair each row of ``features`` is replaced by its
    highest-energy VMD mode (via ``tune``), a logistic regression is trained
    on a 75/25 split of the transformed rows, and the F1 score and accuracy
    on the held-out part are recorded.

    Parameters
    ----------
    features : array-like, one row per sample
        Feature rows to decompose.
    labels : array-like, optional
        Target labels aligned with ``features``. Defaults to the
        notebook-level variable ``y`` to stay compatible with existing calls.
    random_state : int, optional
        Seed for the train/test split. The same split is used for every
        configuration so score differences come from the parameters rather
        than from the split.

    Returns
    -------
    (dict, dict)
        ``(f_dict, acc_dict)``: F1 scores and accuracies, both keyed by
        ``'K : <k> Alpha : <alpha>'``.
    """
    # The former @jit(parallel=True, forceobj=True) decorator was dropped:
    # object mode runs this body through the interpreter and does not
    # parallelise ``prange``, so it added compile overhead and no speed-up.
    if labels is None:
        labels = y  # fall back to the notebook-level encoded labels

    f_dict = {}
    acc_dict = {}
    for k in tqdm(range(2, 7)):
        for al in range(2, 7):
            # VMD-transformed representation for this (K, alpha) pair.
            X = np.asarray([tune(row, al, k) for row in features])

            # Split the *transformed* rows. The original code split the raw
            # ``features``, so every configuration was scored on the same
            # untransformed data.
            X_train, X_test, y_train, y_test = train_test_split(
                X, labels, test_size=0.25, random_state=random_state)

            lr = LogisticRegression()
            lr.fit(X_train, y_train)
            y_pred = lr.predict(X_test)

            key = 'K : '+str(k)+' Alpha : '+str(al)
            f_dict[key] = f1_score(y_test, y_pred)
            acc_dict[key] = accuracy_score(y_test, y_pred)

    return f_dict,acc_dict

## Loading the IMDb Dataset

In [4]:
# Load the raw reviews. The path is relative to the working directory set in
# the first cell. Later cells read the 'review' and 'sentiment' columns.
imdb_data = pd.read_csv('IMDB.csv')

A random sample of 7,000 of the 50k rows is taken for the preliminary analysis.

In [5]:
# Draw a fixed-size random subset without replacement. The seed makes the
# subset (and every result derived from it) identical across kernel restarts.
imdb_data_sam = imdb_data.sample(n=7000, replace=False, random_state=42)

# Clean the review text, then separate the inputs (list of strings) from the
# raw sentiment labels.
train = format_text(imdb_data_sam,'review')
X = train['review'].tolist()
Y = train['sentiment']

In [6]:
# Map the sentiment labels in ``Y`` to integer codes; ``le`` is kept so the
# mapping stays available after this cell.
le = LabelEncoder()
y = le.fit_transform(Y)

### TF-IDF Vectoriser

In [7]:
# TF-IDF over unigrams and bigrams, ignoring English stop words and terms
# that occur in fewer than 5 reviews; rows are L2-normalised.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=5,
    sublinear_tf=True,
    norm='l2',
    encoding='latin-1',
)
# Densify the sparse matrix: later cells index it row by row as arrays.
features = tfidf.fit_transform(X).toarray()

#### Fine-Tuning Parameters for VMD Mode extraction

In [8]:
# ``tuning`` returns (F1 scores, accuracies) in that order; unpack them into
# the matching names (they were previously swapped).
f, a = tuning(features)

### Extracting VMD Modes

In [None]:
def vm(features):
    """Replace every row of ``features`` with its highest-energy VMD mode.

    Parameters
    ----------
    features : numpy.ndarray, 2-D
        One signal per row.

    Returns
    -------
    numpy.ndarray
        Array with one row per input row holding the output of ``maxvdm``.
        Its width is taken from the returned mode rather than from the input.
    """
    # The former @jit(parallel=True, forceobj=True) decorator was dropped:
    # object mode runs this loop through the interpreter and does not
    # parallelise ``prange``, so it provided no speed-up.
    n_rows = len(features)
    X = None
    for i in tqdm(range(n_rows)):
        mode = maxvdm(features[i])
        if X is None:
            # Size the output from the first mode instead of assuming it is
            # as long as the input row.
            # NOTE(review): vmdpy's VMD appears to drop the last sample of
            # odd-length signals, which would make a pre-sized
            # (n_rows, features.shape[1]) buffer fail on assignment --
            # confirm against the installed vmdpy version.
            X = np.zeros((n_rows, len(mode)))
        X[i] = mode
    if X is None:
        # No rows at all: keep the original empty-result shape.
        X = np.zeros((0, features.shape[1]))
    return X

# This runs one VMD per row and is slow (the recorded run below shows several
# seconds per row); consider caching ``X_vmd`` to disk once computed.
X_vmd = vm(features)

 59%|█████▉    | 4119/7000 [4:17:54<3:00:29,  3.76s/it]

In [None]:
# Hold out 25% of the VMD features for evaluation. The split is seeded so the
# reported metrics are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_vmd, y, test_size=0.25, random_state=42)

### Evaluating Model

In [None]:
# Fit a default logistic regression on the training split (``fit`` returns
# the estimator itself) and predict labels for the held-out rows.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Text summary of the classification metrics for the held-out predictions.
print(classification_report(y_test, y_pred))

## FastText Pre-Trained Model

In [None]:
# Load the fastText model file from the working directory and record the
# dimensionality of its word vectors.
ft = fasttext.load_model('cc.en.300.bin')
size = ft.get_dimension()

# Tokenise every cleaned review into a list of word tokens.
corpus = [word_tokenize(review) for review in X]

In [None]:
# One row per review: the (un-normalised) sum of the fastText vectors of its
# tokens. A review with no tokens keeps an all-zero row.
X_fast = np.zeros((len(corpus), size))
for row, tokens in enumerate(corpus):
    doc_vector = np.zeros(size)
    for token in tokens:
        doc_vector += ft.get_word_vector(token)
    X_fast[row] = doc_vector

In [None]:
# Evaluate the fastText document vectors with the same protocol as the VMD
# features: 75/25 split, default logistic regression, classification report.
# The split is seeded so the metrics are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_fast, y, test_size=0.25, random_state=42)
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))

## t-SNE Plots for Semantic Relation-Analysis

In [None]:
# Project both representations to 2-D with one point per vocabulary term, so
# that row ``i`` of each embedding can be labelled with term ``i``.

# VMD: transpose so that each column of X_vmd (a TF-IDF term) becomes a point.
VMD_embedded = TSNE(n_components=2, random_state=42).fit_transform(X_vmd.T)

# fastText: embed the vocabulary terms themselves. ``X_fast`` has one row per
# *review*, so projecting it would put documents on the plot and labelling
# them with vocabulary words would be meaningless.
# NOTE(review): bigram terms are passed to get_word_vector as one string --
# confirm this is the desired treatment for multi-word terms.
if hasattr(tfidf, 'get_feature_names_out'):
    vocab_terms = tfidf.get_feature_names_out()
else:
    vocab_terms = tfidf.get_feature_names()
term_vectors = np.array([ft.get_word_vector(term) for term in vocab_terms])
fast = TSNE(n_components=2, random_state=42).fit_transform(term_vectors)

In [None]:
# Vocabulary terms in column order of the TF-IDF matrix.
# ``get_feature_names()`` was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on versions that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    words = list(tfidf.get_feature_names_out())
else:
    words = tfidf.get_feature_names()

Randomly select words from the vocabulary to compare their semantic neighbourhoods in the VMD and fastText embeddings

In [None]:
# Pick 10 distinct indices in [0, 300) to plot. A seeded generator keeps the
# selection reproducible, and ``replace=False`` avoids plotting the same
# point twice.
a = np.random.default_rng(42).choice(300, size=10, replace=False)

In [None]:
# Scatter the selected points of the VMD t-SNE embedding, each annotated with
# the vocabulary term at the same index.
fig, ax = plt.subplots(figsize=(11, 11))
for i in a:
    ax.scatter(VMD_embedded[i][0], VMD_embedded[i][1], c='red')
    ax.annotate(words[i], VMD_embedded[i], xytext=None, xycoords='data', textcoords='data', arrowprops=None)
# The title does not depend on the loop, so it is set once.
ax.set_title("TF-IDF_VMD")
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
plt.show()

In [None]:
# Scatter the selected points of the fastText t-SNE embedding, each annotated
# with the vocabulary term at the same index.
fig, ax = plt.subplots(figsize=(11, 11))
for i in a:
    ax.scatter(fast[i][0], fast[i][1], c='blue')
    ax.annotate(words[i], fast[i], xytext=None, xycoords='data', textcoords='data', arrowprops=None)
# The title does not depend on the loop, so it is set once.
ax.set_title("FastText")
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
plt.show()