In [30]:
import numpy as np
import pandas as pd
import sklearn
import os
from sklearn import preprocessing 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [2]:
# Switch to the directory that holds the preprocessed LIAR-PLUS CSV files.
# Set the LIAR_PLUS_DIR environment variable to point somewhere else; when it
# is unset the original hard-coded location is used.
os.chdir(os.environ.get("LIAR_PLUS_DIR", "/home/embibe/Personal/ML/NUS/LIAR-PLUS-master"))

Reading all the preprocessed files.

In [3]:
# Load the three preprocessed splits; file names are relative to the current
# working directory.
train, val, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_preprocessed.csv", "val_preprocessed.csv", "test_preprocessed.csv")
)

In [4]:
# (rows, columns) of the train / validation / test frames.
tuple(split.shape for split in (train, val, test))

((10240, 207), (1284, 207), (1267, 207))

Helper that fits the TF-IDF vectorizers on the train set and then uses the fitted vocabulary to transform the validation and test data into TF-IDF feature matrices.

In [5]:
def tf_idf(df,flag):
    """Build the combined TF-IDF feature matrix for one data split.

    Relies on the module-level vectorizers ``tfidf_statement`` and
    ``tfidf_justification``, which must exist before this function is called.

    Parameters
    ----------
    df : DataFrame with a ``statement`` and a ``justification`` column.
    flag : str
        ``'train'`` fits both vectorizers on ``df`` and transforms it; any
        other value only transforms ``df`` with the already-fitted vectorizers.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array: the statement features followed column-wise by the
        justification features, one row per row of ``df``.
    """
    statement_text = df['statement'].values
    # Cast to unicode so missing justifications become text instead of
    # non-string objects.
    justification_text = df['justification'].values.astype('U')

    if (flag=='train'):
        statement_vecs = tfidf_statement.fit_transform(statement_text)
        justification_vecs = tfidf_justification.fit_transform(justification_text)
    else:
        statement_vecs = tfidf_statement.transform(statement_text)
        justification_vecs = tfidf_justification.transform(justification_text)

    # Concatenate the dense arrays directly; going through Python lists first
    # only costs time and memory and yields the same array.
    return np.concatenate([statement_vecs.toarray(), justification_vecs.toarray()], axis=1)

Extracting the features and labels from the loaded data.  

In [6]:
# Separate every split into model inputs (all columns except the two label
# columns) and the multiclass / binary targets.
label_columns = ['label_multiclass', 'label_binary']

x_train, x_val, x_test = (
    frame.drop(label_columns, axis=1) for frame in (train, val, test)
)
y_train_multiclass, y_val_multiclass, y_test_multiclass = (
    frame['label_multiclass'] for frame in (train, val, test)
)
y_train_binary, y_val_binary, y_test_binary = (
    frame['label_binary'] for frame in (train, val, test)
)

In [7]:
# (rows, columns) of the feature frames once the label columns are removed.
tuple(features.shape for features in (x_train, x_val, x_test))

((10240, 205), (1284, 205), (1267, 205))

In [8]:
# Distinct label values present in the training targets.
tuple(labels.unique() for labels in (y_train_multiclass, y_train_binary))

(array([1, 2, 3, 5, 0, 4]), array([0, 1]))

# TF-IDF

In [9]:
# One vectorizer per text column, both configured identically: lowercase the
# text, use 1- to 3-grams, and bound the document frequency with
# min_df=0.1 / max_df=0.9.
tfidf_settings = dict(lowercase=True, ngram_range=(1, 3), max_df=0.9, min_df=0.1)
tfidf_statement = TfidfVectorizer(**tfidf_settings)
tfidf_justification = TfidfVectorizer(**tfidf_settings)

# The train split must go first: it fits the vectorizers that the validation
# and test splits are then transformed with.
train_tfidf = pd.DataFrame(tf_idf(x_train, 'train'))
val_tfidf = pd.DataFrame(tf_idf(x_val, 'val'))
test_tfidf = pd.DataFrame(tf_idf(x_test, 'test'))

The original metadata is concatenated with the newly generated tfidf features.

In [10]:
# Put the TF-IDF columns side by side with the original columns of each split.
train_features, val_features, test_features = (
    pd.concat([tfidf_part, original_part], axis=1)
    for tfidf_part, original_part in (
        (train_tfidf, x_train),
        (val_tfidf, x_val),
        (test_tfidf, x_test),
    )
)

Since the statement and justification have been converted to TF-IDF vectors, the original raw-text columns are removed.

In [11]:
# The raw text is already encoded by the TF-IDF columns, so drop it.
# Rebinding the result (rather than inplace=True) together with
# errors='ignore' means re-running this cell does not raise a KeyError.
text_columns = ['statement', 'justification']
train_features = train_features.drop(text_columns, axis=1, errors='ignore')
val_features = val_features.drop(text_columns, axis=1, errors='ignore')
test_features = test_features.drop(text_columns, axis=1, errors='ignore')

In [12]:
# (rows, columns) of the final model inputs for each split.
tuple(features.shape for features in (train_features, val_features, test_features))

((10240, 290), (1284, 290), (1267, 290))

# RANDOM FOREST

In [51]:
# Random forest for the six-way label.
# oob_score takes a boolean: the string 'TRUE' is only accepted by accident
# (non-empty strings are truthy) and is rejected by newer scikit-learn.
# max_features="sqrt" is what the legacy "auto" value meant for classifiers;
# "auto" itself is no longer available in recent releases.
model_multiclass = RandomForestClassifier(n_estimators=200, oob_score=True, n_jobs=-1, random_state=50, max_features="sqrt", min_samples_leaf=1)
model_multiclass.fit(train_features, y_train_multiclass)
y_pred_multiclass = model_multiclass.predict(test_features)

**Achieved about 40% test accuracy by training on the train set only. The validation set has not been used yet; using it to tune the hyperparameters should improve accuracy further. Grid search was skipped due to time and resource constraints; had it been run, the best parameters it selects would likely improve accuracy further.**

In [52]:
# accuracy_score expects (y_true, y_pred) in that order.
print("accuracy of Random Forest:",accuracy_score(y_test_multiclass,y_pred_multiclass))

('accuracy of Random Forest:', 0.40331491712707185)


In [53]:
# confusion_matrix is already imported in the notebook's import cell.
# Rows are the true classes, columns the predicted classes.
conf_mat = confusion_matrix(y_test_multiclass, y_pred_multiclass)
print(conf_mat)

[[ 64  39  56  42   6   5]
 [ 25 127  39  39  13   6]
 [ 26  50 116  62   2   9]
 [ 17  27  71 117   3   6]
 [  3  20  16   6  47   0]
 [ 15  29  58  62   4  40]]


In [54]:
# Per-class precision / recall / F1 for the six-way random forest.
report_multiclass = classification_report(
    y_true=y_test_multiclass,
    y_pred=y_pred_multiclass,
)
print(report_multiclass)

              precision    recall  f1-score   support

           0       0.43      0.30      0.35       212
           1       0.43      0.51      0.47       249
           2       0.33      0.44      0.37       265
           3       0.36      0.49      0.41       241
           4       0.63      0.51      0.56        92
           5       0.61      0.19      0.29       208

   micro avg       0.40      0.40      0.40      1267
   macro avg       0.46      0.41      0.41      1267
weighted avg       0.44      0.40      0.40      1267



In [50]:
# Random forest for the binary label.
# oob_score takes a boolean: the string 'TRUE' is only accepted by accident
# (non-empty strings are truthy) and is rejected by newer scikit-learn.
# max_features="sqrt" is what the legacy "auto" value meant for classifiers;
# "auto" itself is no longer available in recent releases.
model_binary = RandomForestClassifier(n_estimators=400, oob_score=True, n_jobs=-1, random_state=50, max_features="sqrt", min_samples_leaf=1)
model_binary.fit(train_features, y_train_binary)
y_pred_binary = model_binary.predict(test_features)

**Achieved about 73% test accuracy (0.728) by training on the train set only. The validation set has not been used yet; using it to tune the hyperparameters should improve accuracy further.**

In [43]:
# accuracy_score expects (y_true, y_pred) in that order.
print("accuracy of Random Forest:",accuracy_score(y_test_binary,y_pred_binary))

('accuracy of Random Forest:', 0.7277032359905288)


In [44]:
# confusion_matrix is already imported in the notebook's import cell.
# Rows are the true classes, columns the predicted classes.
conf_mat = confusion_matrix(y_test_binary, y_pred_binary)
print(conf_mat)

[[315 238]
 [107 607]]


In [45]:
# Per-class precision / recall / F1 for the binary random forest.
report_binary = classification_report(
    y_true=y_test_binary,
    y_pred=y_pred_binary,
)
print(report_binary)

              precision    recall  f1-score   support

           0       0.75      0.57      0.65       553
           1       0.72      0.85      0.78       714

   micro avg       0.73      0.73      0.73      1267
   macro avg       0.73      0.71      0.71      1267
weighted avg       0.73      0.73      0.72      1267



Before training the SVM, all features are standardized so that they are on a comparable scale and no feature dominates simply because of its magnitude.

In [19]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transformation to every split.
scaler = StandardScaler().fit(train_features)
x_train_scaled, x_val_scaled, x_test_scaled = (
    scaler.transform(features)
    for features in (train_features, val_features, test_features)
)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.
  """


# SVM MODEL

Parameter grid for hyperparameter tuning. **The SVM cells below have not been run, due to resource constraints.**

In [20]:
# Grid-search space for SVC: an RBF kernel over two gamma values and a linear
# kernel, each swept over the same set of C values.
c_values = [1, 10, 100, 1000]
params_grid = [
    dict(kernel=['rbf'], gamma=[1e-3, 1e-4], C=list(c_values)),
    dict(kernel=['linear'], C=list(c_values)),
]

In [22]:
# Shell escape: write the output of `pip freeze` to requirements_tfidf.txt in
# the current working directory.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which may
# not be the kernel's environment — confirm before relying on this file.
!pip freeze > requirements_tfidf.txt

An attempt to add validation data in SVM for improving results.

In [None]:
# NOTE(review): this cell is entirely commented out and is kept only as a
# sketch of using the validation split as a predefined CV fold. It would not
# run as written:
#   - pd.concat is given two positional arguments instead of a single list;
#     x_train_scaled / x_val_scaled come from StandardScaler and are presumably
#     numpy arrays, so np.concatenate would be needed for X — TODO confirm.
#   - test_fold has four placeholder entries; it needs one entry per row of the
#     combined data (-1 for rows that are always in training, 0 for the
#     validation rows).
#   - the "PredefinedSplit(test_fold=array(...))" line looks like pasted output
#     rather than code.
# from sklearn.model_selection import PredefinedSplit
# X = pd.concat(x_train_scaled,x_val_scaled)
# y = pd.concat(y_train_multiclass,y_val_multiclass)
# test_fold = [0, 1, -1, 1]
# ps = PredefinedSplit(test_fold)
# ps.get_n_splits()
# print(ps)       

# PredefinedSplit(test_fold=array([ 0,  1, -1,  1]))
# for train_index, test_index in ps.split():
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

In [None]:
# 5-fold cross-validated grid search over params_grid for the six-way label.
svm_model_multiclass = GridSearchCV(estimator=SVC(), param_grid=params_grid, cv=5)
svm_model_multiclass.fit(x_train_scaled, y_train_multiclass)

In [None]:
print('Best score for training data:', svm_model_multiclass.best_score_,"\n") 

# View the best parameters for the model found using grid search
print('Best C:',svm_model_multiclass.best_estimator_.C,"\n") 
print('Best Kernel:',svm_model_multiclass.best_estimator_.kernel,"\n")
print('Best Gamma:',svm_model_multiclass.best_estimator_.gamma,"\n")

final_model_multiclass = svm_model_multiclass.best_estimator_
# The model is trained on the integer labels in y_train_multiclass, so its
# predictions are already in label space. No label encoder is defined in this
# notebook, so there is nothing to inverse-transform.
y_pred_multiclass = final_model_multiclass.predict(x_test_scaled)

In [None]:
# Evaluate against the multiclass targets defined earlier in the notebook
# (y_train_multiclass / y_test_multiclass).
print(confusion_matrix(y_test_multiclass,y_pred_multiclass))
print("\n")
print(classification_report(y_test_multiclass,y_pred_multiclass))

print("Training set score for SVM: %f" % final_model_multiclass.score(x_train_scaled , y_train_multiclass))
print("Testing  set score for SVM: %f" % final_model_multiclass.score(x_test_scaled  , y_test_multiclass ))

In [None]:
# 5-fold cross-validated grid search over params_grid for the binary label.
# The binary training target in this notebook is named y_train_binary.
svm_model_binary = GridSearchCV(SVC(), params_grid, cv=5)
svm_model_binary.fit(x_train_scaled, y_train_binary)

In [None]:
print('Best score for training data:', svm_model_binary.best_score_,"\n") 

# View the best parameters for the model found using grid search
print('Best C:',svm_model_binary.best_estimator_.C,"\n") 
print('Best Kernel:',svm_model_binary.best_estimator_.kernel,"\n")
print('Best Gamma:',svm_model_binary.best_estimator_.gamma,"\n")

final_model_binary = svm_model_binary.best_estimator_
# Predictions are already 0/1 labels. No label encoder is defined in this
# notebook, so there is nothing to inverse-transform.
y_pred_binary = final_model_binary.predict(x_test_scaled)

In [None]:
# Evaluate against the binary targets defined earlier in the notebook
# (y_train_binary / y_test_binary).
print(confusion_matrix(y_test_binary,y_pred_binary))
print("\n")
print(classification_report(y_test_binary,y_pred_binary))

print("Training set score for SVM: %f" % final_model_binary.score(x_train_scaled ,y_train_binary))
print("Testing  set score for SVM: %f" % final_model_binary.score(x_test_scaled  , y_test_binary ))