In [2]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import os
import scipy.signal
import asyncio
import numba as nb

from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed
from sklearn.metrics import classification_report
from tqdm import tqdm
from vmdpy import VMD
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

os.chdir('../')
os.chdir('data/')

## Functions needed for execution

format_text() takes the dataframe and the column index which contains the text that needs to be cleaned, in this case the cleaning process was focussed for Tweets which include removal of special charactors, links and numbers.

In [22]:
def format_text(df,col):
      #Remove @ tags
      comp_df = df.copy()

      # remove all the punctuation
      comp_df[col] = comp_df[col].str.replace(r'(@\w*)','')

      #Remove URL
      comp_df[col] = comp_df[col].str.replace(r"http\S+", "")

      #Remove # tag 
      comp_df[col] = comp_df[col].str.replace('#',"")

      #Remove all non-character
      comp_df[col] = comp_df[col].str.replace(r"[^a-zA-Z ]","")

      # Remove extra space
      comp_df[col] = comp_df[col].str.replace(r'( +)'," ")
      comp_df[col] = comp_df[col].str.strip()

      # Change to lowercase
      comp_df[col] = comp_df[col].str.lower()
      comp_df[col] = comp_df[col].str.replace('httpurl', '')
      return comp_df

From the given modes, using the SciPy package the mode containing the maximum energy will be selected.

In [11]:
def energy(u):
# Estimate PSD `S_xx_welch` at discrete frequencies `f_welch`
    f_welch, S_xx_welch = scipy.signal.welch(u)
    # Integrate PSD over spectral bandwidth
    # to obtain signal power `P_welch`
    df_welch = f_welch[1] - f_welch[0]
    return np.sum(S_xx_welch) * df_welch

In [12]:
def maxvdm(f):
    alpha = 3     
    tau = 0            
    K = 2       
    DC = 0             
    init = 1           
    tol = 1e-8
    u, u_hat, omega = VMD(f, alpha, tau, K, DC, init, tol) 
    energy_array=[]
    for i in u:
        energy_array.append(energy(i))
    ind = np.argmax(energy_array)
    return u[ind]


def extract(features):
  X = []
  for i in features:
    X.append(maxvdm(i))
  return X

## Training Data:

Loading and Pre-Processing the Tweets Data

In [25]:
train = pd.read_csv('train.tsv',sep='\t')

In [26]:
os.listdir()

['.config', 'test.tsv', 'train.tsv', 'sample_data']

In [27]:
train = format_text(train,'Text')
X = train['Text'].tolist()
Y_train = train['Label']

Converting String Labels into Numeric Values with LabelEncoder

In [28]:
le = LabelEncoder()
le.fit(Y_train)
Y_train = le.transform(Y_train)

Computing the TF-IDF vectors from the given corpus of training data

In [29]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(X).toarray()

From the functions section, the maxvdm() which extracts K modes and returns the mode with the highest energy is called on every line.

In [None]:
X_data = [maxvdm(i) for i in tqdm(features)]

In [31]:
df = pd.DataFrame(X_data)
df['l'] = Y_train

## Testing Data:

The process of testing is similar to the above-mentioned training method, the same object tfidf and labelencoder must be called.

In [32]:
test = pd.read_csv('/content/test.tsv',sep='\t',header=None)
test = format_text(test,1)
X_test = test[1].tolist()
Y_test = le.transform(test[2])

In [None]:
features_test = tfidf.transform(X_test).toarray()
X_test = [maxvdm(i) for i in tqdm(features_test)]

## Evaluating Model Performance:

In [37]:
lr = LogisticRegression(random_state=0)
lr.fit(X_data,Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
y_pred = lr.predict(X_test)

In [39]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.56      0.64       944
           1       0.68      0.84      0.75      1056

    accuracy                           0.71      2000
   macro avg       0.72      0.70      0.70      2000
weighted avg       0.72      0.71      0.70      2000



## Evaluating Performance on IMdb Data Classification

In [6]:
def format_reviews(df,col):
      #Remove @ tags
      comp_df = df.copy()

      # remove all the punctuation
      comp_df[col] = comp_df[col].str.replace(r'(@\w*)','')

      #Remove URL
      comp_df[col] = comp_df[col].str.replace(r"http\S+", "")

      #Remove all non-character
      comp_df[col] = comp_df[col].str.replace(r"[^a-zA-Z ]","")

      # Remove extra space
      comp_df[col] = comp_df[col].str.replace(r'( +)'," ")
      comp_df[col] = comp_df[col].str.strip()

      # Change to lowercase
      comp_df[col] = comp_df[col].str.lower()
      comp_df[col] = comp_df[col].str.replace('httpurl', '')
      return comp_df

In [3]:
imdb_data = pd.read_csv('IMDB.csv')

In [4]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
train = format_reviews(imdb_data,'review')
X = train['review'].tolist()
Y = train['sentiment']

In [8]:
le = LabelEncoder()
le.fit(Y)
y = le.transform(Y)

In [9]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(X).toarray()

In [None]:
X_data = [maxvdm(i) for i in tqdm(features)]


  0%|          | 0/50000 [00:00<?, ?it/s][A
  0%|          | 1/50000 [00:06<85:01:14,  6.12s/it][A
  0%|          | 2/50000 [00:12<84:24:53,  6.08s/it][A
  0%|          | 3/50000 [00:18<85:52:45,  6.18s/it][A
  0%|          | 4/50000 [00:29<104:38:46,  7.54s/it][A
  0%|          | 5/50000 [00:39<114:44:09,  8.26s/it][A
  0%|          | 6/50000 [00:46<110:38:04,  7.97s/it][A
  0%|          | 7/50000 [00:54<109:20:57,  7.87s/it][A
  0%|          | 8/50000 [01:01<106:52:51,  7.70s/it][A
  0%|          | 9/50000 [01:08<103:40:09,  7.47s/it][A
  0%|          | 10/50000 [01:14<100:22:39,  7.23s/it][A
  0%|          | 11/50000 [01:21<99:14:07,  7.15s/it] [A
  0%|          | 12/50000 [01:29<101:06:00,  7.28s/it][A
  0%|          | 13/50000 [01:37<105:27:59,  7.60s/it][A
  0%|          | 14/50000 [01:46<108:20:59,  7.80s/it][A
  0%|          | 15/50000 [01:53<104:35:33,  7.53s/it][A
  0%|          | 16/50000 [02:00<103:04:52,  7.42s/it][A
  0%|          | 17/50000 [02:08<107:0

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
 X_data, y, test_size=0.33, random_state=22)

In [None]:
lr = LogisticRegression(random_state=0)
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))