In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('/Volumes/Rohith/glove/glove.twitter.27B.200d.txt', sep=" ", quoting=3, header=None, index_col=0)
glove = {key: val.values for key, val in df.T.items()}

In [3]:
train = pd.read_csv('/Users/rohith/Documents/College/Semester 4/Papers/NLP-COVID/WNUT-2020-Task-2-Dataset/train.tsv',sep='\t')

In [4]:
train.head()

Unnamed: 0,Id,Text,Label
0,1241490299215634434,Official death toll from #covid19 in the Unite...,INFORMATIVE
1,1245916400981381130,"Dearest Mr. President @USER 1,169 coronavirus ...",INFORMATIVE
2,1241132432402849793,Latest Updates March 20 ⚠️5274 new cases and 3...,INFORMATIVE
3,1236107253666607104,真把公主不当干部 BREAKING: 21 people on Grand Princess...,INFORMATIVE
4,1239673817552879619,OKLAHOMA CITY — The State Department of Educat...,UNINFORMATIVE


In [5]:
def format_text(df,col):
  #Remove @ tags
  comp_df = df.copy()
    
  # remove all the punctuation
  comp_df[col] = comp_df[col].str.replace(r'(@\w*)','')

  #Remove URL
  comp_df[col] = comp_df[col].str.replace(r"http\S+", "")

  #Remove # tag and the following words
  comp_df[col] = comp_df[col].str.replace(r'#\w+',"")

  #Remove all non-character
  comp_df[col] = comp_df[col].str.replace(r"[^a-zA-Z ]","")

  # Remove extra space
  comp_df[col] = comp_df[col].str.replace(r'( +)'," ")
  comp_df[col] = comp_df[col].str.strip()

  # Change to lowercase
  comp_df[col] = comp_df[col].str.lower()

  return comp_df

In [6]:
train_m = format_text(train,'Text')

In [7]:
train_m.head(7)

Unnamed: 0,Id,Text,Label
0,1241490299215634434,official death toll from in the united kingdom...,INFORMATIVE
1,1245916400981381130,dearest mr president coronavirus deaths in the...,INFORMATIVE
2,1241132432402849793,latest updates march new cases and new deaths ...,INFORMATIVE
3,1236107253666607104,breaking people on grand princess cruise ship ...,INFORMATIVE
4,1239673817552879619,oklahoma city the state department of educatio...,UNINFORMATIVE
5,1240790181860409344,democrats somehow managed to fight ebola witho...,UNINFORMATIVE
6,1249147011003187200,as number of deaths surpassed worldwide ny sur...,INFORMATIVE


In [8]:
train_m['Text']=train_m['Text'].str.replace('httpurl', '')

In [9]:
X_X = train_m['Text'].tolist()
Y_train = train_m['Label'].tolist()

In [10]:
le = LabelEncoder()
le.fit(Y_train)
Y_train = le.transform(Y_train)

In [11]:
vocab = []
for i in range(len(X_X)):
    vocab.append(word_tokenize(X_X[i]))

In [12]:
size = 200

In [13]:
X_train = np.zeros((len(vocab),size)) #Initializing the X matrix with zeros
for i in range(len(vocab)):
    emb = np.zeros((1,size))
    for w in vocab[i]:
        if w in glove.keys():
            emb = emb +  glove[w]
    X_train[i] = emb

In [18]:
test = pd.read_csv('/Users/rohith/Documents/College/Semester 4/Papers/NLP-COVID/WNUT-2020-Task-2-Dataset/test.tsv',sep='\t',header=None)

In [19]:
test_m = format_text(test,1)

In [20]:
test_m[1]=test_m[1].str.replace('httpurl', '')

In [21]:
X_X_1 = test_m[1].tolist()
Y_test = le.transform(test_m[2].tolist())

In [22]:
vocab = []
for i in range(len(X_X_1)):
    vocab.append(word_tokenize(X_X_1[i]))

In [23]:
X_test = np.zeros((len(vocab),size)) #Initializing the X matrix with zeros
for i in range(len(vocab)):
    emb = np.zeros((1,size))
    for w in vocab[i]:
        if w in glove.keys():
            emb = emb +  glove[w]
    X_test[i] = emb

In [25]:
models = [
    
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    #MultinomialNB(),
    LogisticRegression(random_state=0),
    KNeighborsClassifier(n_neighbors=3)
]

In [26]:
for i in models:
    #model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
    name = i.__class__.__name__
    #Split Data 

    #Train Algorithm
    i.fit(X_train, Y_train)

    # Make Predictions
    #y_pred_proba = i.predict_proba(X_test)
    y_pred = i.predict(X_test)
    print(name)
    print(classification_report(Y_test, y_pred))
    print(accuracy_score(Y_test, y_pred))
    print("------------------------------------------------------------")

RandomForestClassifier
              precision    recall  f1-score   support

           0       0.70      0.54      0.61       944
           1       0.66      0.80      0.72      1056

    accuracy                           0.68      2000
   macro avg       0.68      0.67      0.67      2000
weighted avg       0.68      0.68      0.67      2000

0.6765
------------------------------------------------------------
LogisticRegression
              precision    recall  f1-score   support

           0       0.71      0.70      0.71       944
           1       0.74      0.75      0.74      1056

    accuracy                           0.73      2000
   macro avg       0.73      0.72      0.72      2000
weighted avg       0.73      0.73      0.73      2000

0.726
------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNeighborsClassifier
              precision    recall  f1-score   support

           0       0.60      0.76      0.67       944
           1       0.72      0.55      0.63      1056

    accuracy                           0.65      2000
   macro avg       0.66      0.66      0.65      2000
weighted avg       0.66      0.65      0.65      2000

0.6505
------------------------------------------------------------


## VMD

In [32]:
from vmdpy import VMD
def maxvdm(f):
    alpha = 1       
    tau = 0            
    K = 4         
    DC = 0             
    init = 1           
    tol = 1e-7  
    u, u_hat, omega = VMD(f, alpha, tau, K, DC, init, tol) 
    energy_array=[]
    for i in u:
        energy_array.append(energy(i))
    ind = np.argmax(energy_array)
    return u[ind]

In [28]:
import scipy.signal
def energy(u):
# Estimate PSD `S_xx_welch` at discrete frequencies `f_welch`
    f_welch, S_xx_welch = scipy.signal.welch(u)

    # Integrate PSD over spectral bandwidth
    # to obtain signal power `P_welch`
    df_welch = f_welch[1] - f_welch[0]
    return np.sum(S_xx_welch) * df_welch

In [33]:
X_train_vmd = []
for i in X_train:
    X_train_vmd.append(maxvdm(i))



In [34]:
X_test_vmd = []
for i in X_test:
    X_test_vmd.append(maxvdm(i))

In [35]:
for i in models:
    #model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
    name = i.__class__.__name__
    #Split Data 

    #Train Algorithm
    i.fit(X_train_vmd, Y_train)

    # Make Predictions
    #y_pred_proba = i.predict_proba(X_test)
    y_pred = i.predict(X_test_vmd)
    print(name)
    print(classification_report(Y_test, y_pred))
    print(accuracy_score(Y_test, y_pred))
    print("------------------------------------------------------------")

RandomForestClassifier
              precision    recall  f1-score   support

           0       0.66      0.44      0.53       944
           1       0.61      0.79      0.69      1056

    accuracy                           0.63      2000
   macro avg       0.63      0.62      0.61      2000
weighted avg       0.63      0.63      0.61      2000

0.627
------------------------------------------------------------
LogisticRegression
              precision    recall  f1-score   support

           0       0.71      0.67      0.69       944
           1       0.72      0.75      0.74      1056

    accuracy                           0.72      2000
   macro avg       0.72      0.71      0.71      2000
weighted avg       0.72      0.72      0.72      2000

0.716
------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNeighborsClassifier
              precision    recall  f1-score   support

           0       0.58      0.67      0.62       944
           1       0.66      0.57      0.61      1056

    accuracy                           0.62      2000
   macro avg       0.62      0.62      0.62      2000
weighted avg       0.62      0.62      0.62      2000

0.6185
------------------------------------------------------------
