In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import os
import scipy.signal
from numba import jit, prange

from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed
from sklearn.metrics import classification_report
from tqdm import tqdm
from vmdpy import VMD
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

os.chdir('../')
os.chdir('data/')

In [2]:
def format_text(df,col):
      comp_df = df.copy()
      # remove all the punctuation
      comp_df[col] = comp_df[col].str.replace(r'(@\w*)','')

      #Remove URL
      comp_df[col] = comp_df[col].str.replace(r"http\S+", "")

      #Remove all non-character
      comp_df[col] = comp_df[col].str.replace(r"[^a-zA-Z ]","")

      # Remove extra space
      comp_df[col] = comp_df[col].str.replace(r'( +)'," ")
      comp_df[col] = comp_df[col].str.strip()

      # Change to lowercase
      comp_df[col] = comp_df[col].str.lower()
      comp_df[col] = comp_df[col].str.replace('httpurl', '')
      return comp_df

def energy(u):
# Estimate PSD `S_xx_welch` at discrete frequencies `f_welch`
    f_welch, S_xx_welch = scipy.signal.welch(u)
    # Integrate PSD over spectral bandwidth
    # to obtain signal power `P_welch`
    df_welch = f_welch[1] - f_welch[0]
    return np.sum(S_xx_welch) * df_welch  

def maxvdm(f):
    alpha = 2     
    tau = 0            
    K = 2       
    DC = 0             
    init = 1           
    tol = 1e-10
    u, u_hat, omega = VMD(f, alpha, tau, K, DC, init, tol) 
    energy_array=[]
    for i in u:
        energy_array.append(energy(i))
    ind = np.argmax(energy_array)
    return u[ind]

def extract(features):
  X = []
  for i in features:
    X.append(maxvdm(i))
  return X

In [3]:
imdb_data = pd.read_csv('IMDB.csv')

In [4]:
imdb_data_sam = imdb_data.sample(n=5000, replace=False)
train = format_text(imdb_data_sam,'review')
X = train['review'].tolist()
Y = train['sentiment']

In [5]:
le = LabelEncoder()
le.fit(Y)
y = le.transform(Y)

In [6]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(X).toarray()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.25)

In [8]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

In [9]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       660
           1       0.81      0.86      0.84       590

    accuracy                           0.84      1250
   macro avg       0.84      0.84      0.84      1250
weighted avg       0.84      0.84      0.84      1250



In [None]:
@jit(parallel=True,forceobj=True)
def vm(features):
    X = np.zeros((features.shape[0],features.shape[1]-1))
    for i in tqdm(prange(len(features))):
        X[i]=maxvdm(features[i])
    return X

X_train = vm(X_train)
X_test  = vm(X_test)

  7%|▋         | 264/3750 [03:26<46:10,  1.26it/s]  

In [None]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))