In [None]:
import pandas as pd
import numpy as np
import re

from textblob import TextBlob

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
path_to_clean = "C:/xampp/htdocs/Data-Mining/FP/clean/"
path_to_fitur = "C:/xampp/htdocs/Data-Mining/FP/fitur/"
lr = LogisticRegression()
rf = RandomForestClassifier(n_estimators=4000, max_depth=30)
nb = GaussianNB()
svm = SVC(C=100, kernel='linear', gamma=0.5)

classifiers = [lr,rf,svm,nb]
classifiers_name = ["Logistic Regression","Random Forest","SVM","Naive Bayes"]

In [3]:
filename = "SS-Twitter"

path_to_baseline = path_to_clean + filename +"_BASELINE.csv"
path_to_experiment = path_to_clean + filename +"_EXPERIMENT.csv"

In [4]:
base_ss = pd.read_csv(path_to_baseline)
expe_ss = pd.read_csv(path_to_experiment)
base_ss.head()
# expe_ss.head()

Unnamed: 0,mean pos,mean neg,Tweet,Clean,class
0,3,2,?RT @justinbiebcr: The bigger the better....if...,rt @justinbiebcr bigger better ... know mean ;),4
1,3,1,"Listening to the ""New Age"" station on @Slacker...",listening new age station @slackeradio,4
2,1,1,I favorited a YouTube video -- Drake and Josh ...,favorited youtube video drake josh storm rock,2
3,4,2,i didnt mean knee high I ment in lengt it goes...,did not mean knee high ment lengt goes knes cu...,4
4,2,1,I wana see the vid Kyan,wanna see video kyan,4


In [5]:
clean = base_ss['Clean'].to_list()
true = base_ss['class'].values

no_url = expe_ss['NO URL'].to_list()
no_stopword = expe_ss['NO STOPWORD'].to_list()
no_number = expe_ss['NO NUMBER'].to_list()
no_repeat = expe_ss['NO REPEAT'].to_list()
no_acronym = expe_ss['NO ACRONYM'].to_list()
no_negation = expe_ss['NO NEGATION'].to_list()
our = expe_ss['OUR'].to_list()

prep = [no_url,no_stopword,no_number,no_repeat,no_acronym,no_negation,our]
name = ["a","b","c","d","e","f"]

In [6]:
def get_pred(clean):
    pred = list()
    result = [TextBlob(re.sub("[.]", "", str(i))).sentiment.polarity for i in clean]
    for n in result:
        if n < 0:
            pred.append(0)
        elif n > 0:
            pred.append(4)
        else:
            pred.append(2)
    return result,pred

def matrix(true,pred):
    matrix = np.array(confusion_matrix(true, pred, labels=[4,2,0]))
    confusion = pd.DataFrame(matrix, index=['positive','Neutral', 'negative'],
                             columns=['predicted_positive','predicted_neutral','predicted_negative'])
    print ("Accuracy Score: {0:.2f}%".format(accuracy_score(true, pred)*100))
    print ("-"*80)
    print ("Confusion Matrix\n")
    print (confusion)
    print ("-"*80)
    print ("Classification Report\n")
    print (classification_report(true, pred))
    
def split(result,true):
    x = np.array(result)
    y = np.array(true)
    SEED = 10

    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.3,random_state=SEED)

    x_train = x_train.reshape(-1, 1)
    y_train = y_train.reshape(-1, 1)
    x_test = x_test.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)
    
    print ("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive, {2:.2f}% neutral"
       .format(len(x_train),
        (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
        (len(x_train[y_train == 4]) / (len(x_train)*1.))*100,
        (len(x_train[y_train == 2]) / (len(x_train)*1.))*100))

    print ("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive, {2:.2f}% neutral"
       .format(len(x_test),
        (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
        (len(x_test[y_test == 4]) / (len(x_test)*1.))*100,
        (len(x_test[y_test == 2]) / (len(x_test)*1.))*100))
    
    return x_train, x_test, y_train, y_test

def clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test):
#     for classifier,classifier_name in zip(classifiers,classifiers_name):
    classifier.fit(x_train,y_train)
    predicted_values = list(classifier.predict(x_test))

    print ("-"*80)
    print ("For Method : ",classifier_name)
    print ("Accuracy Score: {0:.2f}%".format(accuracy_score(predicted_values, y_test)*100))
    print ("Classification Report\n")
    print (classification_report(predicted_values, y_test))
    print ("-"*80)

In [7]:
print("\t\t\t\t\t"+filename+" BASELINE")
result,pred = get_pred(clean)
matrix(true,pred)
x_train, x_test, y_train, y_test = split(result,true)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					SS-Twitter BASELINE
Accuracy Score: 55.17%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 943                309                  81
Neutral                  730               1005                 205
negative                 262                304                 379
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.57      0.40      0.47       945
           2       0.62      0.52      0.56      1940
           4       0.49      0.71      0.58      1333

   micro avg       0.55      0.55      0.55      4218
   macro avg       0.56      0.54      0.54      4218
weighted avg       0.57      0.55      0.55      4218

Train set has total 2952 entries with 22.97% negative, 31.33% positive, 31.33% neutral
Test set 

  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 55.61%
Classification Report

              precision    recall  f1-score   support

           0       0.32      0.54      0.40       158
           2       0.69      0.57      0.62       720
           4       0.51      0.54      0.53       388

   micro avg       0.56      0.56      0.56      1266
   macro avg       0.51      0.55      0.52      1266
weighted avg       0.59      0.56      0.57      1266

--------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 54.42%
Classification Report

              precision    recall  f1-score   support

           0       0.16      0.63      0.26        70
           2       0.78      0.53      0.63       863
           4       0.46      0.56      0.50       333

   micro avg       0.54      0.54      0.54      1266
   macro avg       0.47      0.57      0.46      1266
weighted avg       0.66      0.54      0.58      1266

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  Naive Bayes
Accuracy Score: 53.63%
Classification Report

              precision    recall  f1-score   support

           0       0.21      0.64      0.32        87
           2       0.82      0.52      0.63       930
           4       0.35      0.57      0.43       249

   micro avg       0.5

  y = column_or_1d(y, warn=True)


In [8]:
print("\t\t\t\t\t"+filename+" NO URL")
result,pred = get_pred(no_url)
matrix(true,pred)
x_train, x_test, y_train, y_test = split(result,true)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					SS-Twitter NO URL
Accuracy Score: 55.17%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 943                309                  81
Neutral                  729               1005                 206
negative                 262                304                 379
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.57      0.40      0.47       945
           2       0.62      0.52      0.56      1940
           4       0.49      0.71      0.58      1333

   micro avg       0.55      0.55      0.55      4218
   macro avg       0.56      0.54      0.54      4218
weighted avg       0.57      0.55      0.55      4218

Train set has total 2952 entries with 22.97% negative, 31.33% positive, 31.33% neutral
Test set ha

  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 55.53%
Classification Report

              precision    recall  f1-score   support

           0       0.33      0.55      0.41       161
           2       0.68      0.56      0.62       716
           4       0.52      0.54      0.53       389

   micro avg       0.56      0.56      0.56      1266
   macro avg       0.51      0.55      0.52      1266
weighted avg       0.59      0.56      0.56      1266

--------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 54.42%
Classification Report

              precision    recall  f1-score   support

           0       0.16      0.63      0.26        70
           2       0.78      0.53      0.63       863
           4       0.46      0.56      0.50       333

   micro avg       0.54      0.54      0.54      1266
   macro avg       0.47      0.57      0.46      1266
weighted avg       0.66      0.54      0.58      1266

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  Naive Bayes
Accuracy Score: 53.63%
Classification Report

              precision    recall  f1-score   support

           0       0.21      0.64      0.32        87
           2       0.82      0.52      0.63       930
           4       0.35      0.57      0.43       249

   micro avg       0.5

  y = column_or_1d(y, warn=True)


In [9]:
print("\t\t\t\t\t"+filename+" NO STOPWORD")
result,pred = get_pred(no_stopword)
matrix(true,pred)
x_train, x_test, y_train, y_test = split(result,true)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					SS-Twitter NO STOPWORD
Accuracy Score: 54.17%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 958                284                  91
Neutral                  765                949                 226
negative                 280                287                 378
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.54      0.40      0.46       945
           2       0.62      0.49      0.55      1940
           4       0.48      0.72      0.57      1333

   micro avg       0.54      0.54      0.54      4218
   macro avg       0.55      0.54      0.53      4218
weighted avg       0.56      0.54      0.54      4218

Train set has total 2952 entries with 22.97% negative, 31.33% positive, 31.33% neutral
Test s

  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 53.95%
Classification Report

              precision    recall  f1-score   support

           0       0.31      0.52      0.39       159
           2       0.66      0.56      0.61       701
           4       0.51      0.51      0.51       406

   micro avg       0.54      0.54      0.54      1266
   macro avg       0.49      0.53      0.50      1266
weighted avg       0.57      0.54      0.55      1266

--------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 53.32%
Classification Report

              precision    recall  f1-score   support

           0       0.12      0.65      0.20        49
           2       0.78      0.52      0.63       876
           4       0.45      0.54      0.49       341

   micro avg       0.53      0.53      0.53      1266
   macro avg       0.45      0.57      0.44      1266
weighted avg       0.66      0.53      0.57      1266

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  Naive Bayes
Accuracy Score: 52.69%
Classification Report

              precision    recall  f1-score   support

           0       0.19      0.64      0.29        80
           2       0.81      0.51      0.63       928
           4       0.34      0.54      0.42       258

   micro avg       0.5

  y = column_or_1d(y, warn=True)


In [10]:
print("\t\t\t\t\t"+filename+" NO NUMBER")
result,pred = get_pred(no_number)
matrix(true,pred)
x_train, x_test, y_train, y_test = split(result,true)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					SS-Twitter NO NUMBER
Accuracy Score: 55.29%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 949                302                  82
Neutral                  732               1005                 203
negative                 264                303                 378
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.57      0.40      0.47       945
           2       0.62      0.52      0.57      1940
           4       0.49      0.71      0.58      1333

   micro avg       0.55      0.55      0.55      4218
   macro avg       0.56      0.54      0.54      4218
weighted avg       0.57      0.55      0.55      4218

Train set has total 2952 entries with 22.97% negative, 31.33% positive, 31.33% neutral
Test set

  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 55.92%
Classification Report

              precision    recall  f1-score   support

           0       0.32      0.54      0.40       159
           2       0.69      0.57      0.62       716
           4       0.53      0.55      0.54       391

   micro avg       0.56      0.56      0.56      1266
   macro avg       0.51      0.55      0.52      1266
weighted avg       0.59      0.56      0.57      1266

--------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 54.42%
Classification Report

              precision    recall  f1-score   support

           0       0.13      0.67      0.22        54
           2       0.79      0.53      0.63       877
           4       0.46      0.56      0.51       335

   micro avg       0.54      0.54      0.54      1266
   macro avg       0.46      0.59      0.45      1266
weighted avg       0.67      0.54      0.58      1266

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  Naive Bayes
Accuracy Score: 53.87%
Classification Report

              precision    recall  f1-score   support

           0       0.21      0.64      0.32        87
           2       0.82      0.52      0.64       931
           4       0.35      0.57      0.43       248

   micro avg       0.5

  y = column_or_1d(y, warn=True)


In [11]:
print("\t\t\t\t\t"+filename+" NO ACRONYM")
result,pred = get_pred(no_acronym)
matrix(true,pred)
x_train, x_test, y_train, y_test = split(result,true)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					SS-Twitter NO ACRYNIM
Accuracy Score: 55.26%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 941                324                  68
Neutral                  732               1012                 196
negative                 260                307                 378
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.59      0.40      0.48       945
           2       0.62      0.52      0.56      1940
           4       0.49      0.71      0.58      1333

   micro avg       0.55      0.55      0.55      4218
   macro avg       0.56      0.54      0.54      4218
weighted avg       0.57      0.55      0.55      4218

Train set has total 2952 entries with 22.97% negative, 31.33% positive, 31.33% neutral
Test se

  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 55.13%
Classification Report

              precision    recall  f1-score   support

           0       0.32      0.57      0.41       152
           2       0.70      0.55      0.62       742
           4       0.49      0.54      0.52       372

   micro avg       0.55      0.55      0.55      1266
   macro avg       0.50      0.55      0.51      1266
weighted avg       0.59      0.55      0.56      1266

--------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 53.95%
Classification Report

              precision    recall  f1-score   support

           0       0.13      0.73      0.22        48
           2       0.79      0.52      0.63       888
           4       0.45      0.55      0.50       330

   micro avg       0.54      0.54      0.54      1266
   macro avg       0.46      0.60      0.45      1266
weighted avg       0.67      0.54      0.58      1266

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  Naive Bayes
Accuracy Score: 53.63%
Classification Report

              precision    recall  f1-score   support

           0       0.20      0.68      0.31        78
           2       0.79      0.52      0.63       900
           4       0.39      0.55      0.45       288

   micro avg       0.5

  y = column_or_1d(y, warn=True)


In [12]:
print("\t\t\t\t\t"+filename+" NO NEGATION")
result,pred = get_pred(no_negation)
matrix(true,pred)
x_train, x_test, y_train, y_test = split(result,true)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					SS-Twitter NO NEGATION
Accuracy Score: 55.00%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 943                309                  81
Neutral                  730               1005                 205
negative                 269                304                 372
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.57      0.39      0.46       945
           2       0.62      0.52      0.56      1940
           4       0.49      0.71      0.58      1333

   micro avg       0.55      0.55      0.55      4218
   macro avg       0.56      0.54      0.53      4218
weighted avg       0.57      0.55      0.55      4218

Train set has total 2952 entries with 22.97% negative, 31.33% positive, 31.33% neutral
Test s

  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 55.45%
Classification Report

              precision    recall  f1-score   support

           0       0.32      0.54      0.40       156
           2       0.68      0.57      0.62       711
           4       0.52      0.54      0.53       399

   micro avg       0.55      0.55      0.55      1266
   macro avg       0.51      0.55      0.52      1266
weighted avg       0.59      0.55      0.56      1266

--------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 54.27%
Classification Report

              precision    recall  f1-score   support

           0       0.13      0.67      0.22        54
           2       0.79      0.53      0.63       876
           4       0.46      0.56      0.50       336

   micro avg       0.54      0.54      0.54      1266
   macro avg       0.46      0.58      0.45      1266
weighted avg       0.67      0.54      0.58      1266

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  Naive Bayes
Accuracy Score: 53.63%
Classification Report

              precision    recall  f1-score   support

           0       0.21      0.65      0.31        85
           2       0.82      0.52      0.63       931
           4       0.35      0.56      0.43       250

   micro avg       0.5

  y = column_or_1d(y, warn=True)


In [13]:
print("\t\t\t\t\t"+filename+" OUR")
result,pred = get_pred(our)
matrix(true,pred)
x_train, x_test, y_train, y_test = split(result,true)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					SS-Twitter OUR
Accuracy Score: 55.38%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 947                310                  76
Neutral                  724               1018                 198
negative                 267                307                 371
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.58      0.39      0.47       945
           2       0.62      0.52      0.57      1940
           4       0.49      0.71      0.58      1333

   micro avg       0.55      0.55      0.55      4218
   macro avg       0.56      0.54      0.54      4218
weighted avg       0.57      0.55      0.55      4218

Train set has total 2952 entries with 22.97% negative, 31.33% positive, 31.33% neutral
Test set has t

  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 55.85%
Classification Report

              precision    recall  f1-score   support

           0       0.33      0.57      0.42       155
           2       0.72      0.56      0.63       755
           4       0.48      0.55      0.51       356

   micro avg       0.56      0.56      0.56      1266
   macro avg       0.51      0.56      0.52      1266
weighted avg       0.60      0.56      0.57      1266

--------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 54.11%
Classification Report

              precision    recall  f1-score   support

           0       0.15      0.63      0.24        62
           2       0.82      0.52      0.64       927
           4       0.40      0.58      0.47       277

   micro avg       0.54      0.54      0.54      1266
   macro avg       0.45      0.58      0.45      1266
weighted avg       0.69      0.54      0.58      1266

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  Naive Bayes
Accuracy Score: 54.27%
Classification Report

              precision    recall  f1-score   support

           0       0.21      0.66      0.31        83
           2       0.83      0.52      0.64       937
           4       0.35      0.58      0.44       246

   micro avg       0.5

  y = column_or_1d(y, warn=True)


In [5]:
# %reset_selective <regular_expression>

In [17]:
filename = "STS-Test"

path_to_baseline = path_to_clean + filename +"_BASELINE.csv"
path_to_experiment = path_to_clean + filename +"_EXPERIMENT.csv"

In [28]:
base_sts = pd.read_csv(path_to_baseline)
expe_sts = pd.read_csv(path_to_experiment)
# np.shape(expe_sts)
# base_ss.head()
# expe_ss.head()

In [26]:
clean_sts = base_sts['Clean'].to_list()
true_sts = base_sts['class'].values

no_url = expe_sts['NO URL'].to_list()
no_stopword = expe_sts['NO STOPWORD'].to_list()
no_number = expe_sts['NO NUMBER'].to_list()
no_repeat = expe_sts['NO REPEAT'].to_list()
no_acronym = expe_sts['NO ACRONYM'].to_list()
no_negation = expe_sts['NO NEGATION'].to_list()
our = expe_sts['OUR'].to_list()

prep = [no_url,no_stopword,no_number,no_repeat,no_acronym,no_negation,our]
name = ["a","b","c","d","e","f"]

In [31]:
print("\t\t\t\t\t"+filename+" BASELINE")
result,pred = get_pred(clean_sts)
matrix(true_sts,pred)
x_train, x_test, y_train, y_test = split(result,true_sts)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					STS-Test BASELINE
Accuracy Score: 66.47%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 147                 24                  11
Neutral                   37                 94                   8
negative                  48                 39                  90
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.83      0.51      0.63       177
           2       0.60      0.68      0.64       139
           4       0.63      0.81      0.71       182

   micro avg       0.66      0.66      0.66       498
   macro avg       0.69      0.66      0.66       498
weighted avg       0.69      0.66      0.66       498

Train set has total 348 entries with 33.05% negative, 38.79% positive, 38.79% neutral
Test set has

  y = column_or_1d(y, warn=True)
  'recall', 'true', average, warn_for)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 64.67%
Classification Report

              precision    recall  f1-score   support

           0       0.60      0.84      0.70        44
           2       0.66      0.51      0.57        53
           4       0.70      0.62      0.66        53

   micro avg       0.65      0.65      0.65       150
   macro avg       0.65      0.66      0.64       150
weighted avg       0.66      0.65      0.64       150

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 56.00%
Classification Report

              precision    recall  f1-score   support

           0       0.77      0.55      0.64        87
           2       0.10      0.27      0.14        15
           4       0.68      0.67      0.67        48

   micro avg       0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [32]:
print("\t\t\t\t\t"+filename+" NO URL")
result,pred = get_pred(no_url)
matrix(true_sts,pred)
x_train, x_test, y_train, y_test = split(result,true_sts)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					STS-Test NO URL
Accuracy Score: 66.47%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 147                 24                  11
Neutral                   37                 94                   8
negative                  48                 39                  90
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.83      0.51      0.63       177
           2       0.60      0.68      0.64       139
           4       0.63      0.81      0.71       182

   micro avg       0.66      0.66      0.66       498
   macro avg       0.69      0.66      0.66       498
weighted avg       0.69      0.66      0.66       498

Train set has total 348 entries with 33.05% negative, 38.79% positive, 38.79% neutral
Test set has t

  y = column_or_1d(y, warn=True)
  'recall', 'true', average, warn_for)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 64.67%
Classification Report

              precision    recall  f1-score   support

           0       0.60      0.84      0.70        44
           2       0.66      0.51      0.57        53
           4       0.70      0.62      0.66        53

   micro avg       0.65      0.65      0.65       150
   macro avg       0.65      0.66      0.64       150
weighted avg       0.66      0.65      0.64       150

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 56.00%
Classification Report

              precision    recall  f1-score   support

           0       0.77      0.55      0.64        87
           2       0.10      0.27      0.14        15
           4       0.68      0.67      0.67        48

   micro avg       0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [33]:
print("\t\t\t\t\t"+filename+" NO STOPWORD")
result,pred = get_pred(no_stopword)
matrix(true_sts,pred)
x_train, x_test, y_train, y_test = split(result,true_sts)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					STS-Test NO STOPWORD
Accuracy Score: 66.47%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 147                 22                  13
Neutral                   38                 93                   8
negative                  53                 33                  91
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.81      0.51      0.63       177
           2       0.63      0.67      0.65       139
           4       0.62      0.81      0.70       182

   micro avg       0.66      0.66      0.66       498
   macro avg       0.69      0.66      0.66       498
weighted avg       0.69      0.66      0.66       498

Train set has total 348 entries with 33.05% negative, 38.79% positive, 38.79% neutral
Test set 

  y = column_or_1d(y, warn=True)
  'recall', 'true', average, warn_for)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 60.67%
Classification Report

              precision    recall  f1-score   support

           0       0.52      0.76      0.62        42
           2       0.68      0.53      0.60        53
           4       0.66      0.56      0.61        55

   micro avg       0.61      0.61      0.61       150
   macro avg       0.62      0.62      0.61       150
weighted avg       0.63      0.61      0.61       150

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 55.33%
Classification Report

              precision    recall  f1-score   support

           0       0.77      0.55      0.64        87
           2       0.10      0.29      0.15        14
           4       0.66      0.63      0.65        49

   micro avg       0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [34]:
print("\t\t\t\t\t"+filename+" NO NUMBER")
result,pred = get_pred(no_number)
matrix(true_sts,pred)
x_train, x_test, y_train, y_test = split(result,true_sts)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					STS-Test NO NUMBER
Accuracy Score: 66.47%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 147                 24                  11
Neutral                   37                 94                   8
negative                  48                 39                  90
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.83      0.51      0.63       177
           2       0.60      0.68      0.64       139
           4       0.63      0.81      0.71       182

   micro avg       0.66      0.66      0.66       498
   macro avg       0.69      0.66      0.66       498
weighted avg       0.69      0.66      0.66       498

Train set has total 348 entries with 33.05% negative, 38.79% positive, 38.79% neutral
Test set ha

  y = column_or_1d(y, warn=True)
  'recall', 'true', average, warn_for)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 62.00%
Classification Report

              precision    recall  f1-score   support

           0       0.60      0.79      0.68        47
           2       0.66      0.50      0.57        54
           4       0.62      0.59      0.60        49

   micro avg       0.62      0.62      0.62       150
   macro avg       0.62      0.63      0.62       150
weighted avg       0.63      0.62      0.61       150

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 56.00%
Classification Report

              precision    recall  f1-score   support

           0       0.77      0.55      0.64        87
           2       0.10      0.27      0.14        15
           4       0.68      0.67      0.67        48

   micro avg       0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [35]:
print("\t\t\t\t\t"+filename+" NO ACRONYM")
result,pred = get_pred(no_acronym)
matrix(true_sts,pred)
x_train, x_test, y_train, y_test = split(result,true_sts)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					STS-Test NO ACRYNIM
Accuracy Score: 66.06%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 145                 27                  10
Neutral                   37                 94                   8
negative                  45                 42                  90
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.83      0.51      0.63       177
           2       0.58      0.68      0.62       139
           4       0.64      0.80      0.71       182

   micro avg       0.66      0.66      0.66       498
   macro avg       0.68      0.66      0.65       498
weighted avg       0.69      0.66      0.66       498

Train set has total 348 entries with 33.05% negative, 38.79% positive, 38.79% neutral
Test set h

  y = column_or_1d(y, warn=True)
  'recall', 'true', average, warn_for)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 62.00%
Classification Report

              precision    recall  f1-score   support

           0       0.58      0.77      0.66        47
           2       0.68      0.50      0.58        56
           4       0.62      0.62      0.62        47

   micro avg       0.62      0.62      0.62       150
   macro avg       0.63      0.63      0.62       150
weighted avg       0.63      0.62      0.62       150

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 55.33%
Classification Report

              precision    recall  f1-score   support

           0       0.77      0.55      0.64        88
           2       0.10      0.29      0.15        14
           4       0.66      0.65      0.65        48

   micro avg       0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [36]:
print("\t\t\t\t\t"+filename+" NO NEGATION")
result,pred = get_pred(no_negation)
matrix(true_sts,pred)
x_train, x_test, y_train, y_test = split(result,true_sts)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					STS-Test NO NEGATION
Accuracy Score: 66.27%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 147                 24                  11
Neutral                   37                 94                   8
negative                  49                 39                  89
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.82      0.50      0.62       177
           2       0.60      0.68      0.64       139
           4       0.63      0.81      0.71       182

   micro avg       0.66      0.66      0.66       498
   macro avg       0.68      0.66      0.66       498
weighted avg       0.69      0.66      0.66       498

Train set has total 348 entries with 33.05% negative, 38.79% positive, 38.79% neutral
Test set 

  y = column_or_1d(y, warn=True)
  'recall', 'true', average, warn_for)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 64.67%
Classification Report

              precision    recall  f1-score   support

           0       0.60      0.84      0.70        44
           2       0.66      0.51      0.57        53
           4       0.70      0.62      0.66        53

   micro avg       0.65      0.65      0.65       150
   macro avg       0.65      0.66      0.64       150
weighted avg       0.66      0.65      0.64       150

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 54.67%
Classification Report

              precision    recall  f1-score   support

           0       0.74      0.54      0.63        85
           2       0.10      0.27      0.14        15
           4       0.68      0.64      0.66        50

   micro avg       0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [37]:
print("\t\t\t\t\t"+filename+" OUR")
result,pred = get_pred(our)
matrix(true_sts,pred)
x_train, x_test, y_train, y_test = split(result,true_sts)
for classifier,classifier_name in zip(classifiers,classifiers_name):
    clss(classifiers,classifiers_name,x_train,x_test,y_train,y_test)

					STS-Test OUR
Accuracy Score: 66.47%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_neutral  predicted_negative
positive                 149                 23                  10
Neutral                   37                 94                   8
negative                  49                 40                  88
--------------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.83      0.50      0.62       177
           2       0.60      0.68      0.64       139
           4       0.63      0.82      0.71       182

   micro avg       0.66      0.66      0.66       498
   macro avg       0.69      0.66      0.66       498
weighted avg       0.69      0.66      0.66       498

Train set has total 348 entries with 33.05% negative, 38.79% positive, 38.79% neutral
Test set has tota

  y = column_or_1d(y, warn=True)
  'recall', 'true', average, warn_for)


--------------------------------------------------------------------------------
For Method :  Random Forest
Accuracy Score: 64.67%
Classification Report

              precision    recall  f1-score   support

           0       0.61      0.86      0.72        44
           2       0.68      0.48      0.57        58
           4       0.66      0.65      0.65        48

   micro avg       0.65      0.65      0.65       150
   macro avg       0.65      0.66      0.65       150
weighted avg       0.65      0.65      0.64       150

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
For Method :  SVM
Accuracy Score: 56.67%
Classification Report

              precision    recall  f1-score   support

           0       0.79      0.56      0.66        87
           2       0.10      0.27      0.14        15
           4       0.68      0.67      0.67        48

   micro avg       0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
# def get_ngrams(text, n ):
#     n_grams = ngrams(tkn.tokenize(text), n)
    
#     return [ ' '.join(grams) for grams in n_grams]

# def get_pos(x):
#     for pos in x:
#         tampung = pos[1]
        
#         if tampung.startswith('J'): #adj
#             return 'a'
#         elif tampung.startswith('V'): #verb
#             return 'v'
#         elif tampung.startswith('N'): #noun
#             return 'n'
#         elif tampung.startswith('R'):
#             return 'r'
#         else:
#             return 'n'
        
# def get_synset(text,var):
#     for x in text:
#         synsets = wn.synsets(x, pos=var)
#         if not synsets:
#             continue
#         else:
#             synset = synsets[0]
#             senti = swn.senti_synset(synset.name())
#             print(text,senti)

# #         return synset
# # #             senti = swn.senti_synset(synset.name())
# # #             print(senti)

# # #             sentiment += senti.pos_score() - senti.neg_score()
# # #             tokens_count += 1

In [40]:
# for types, names in zip(prep,name):
#     print("\t\t\t\t\t"+names)
#     result,pred = get_pred(types)
#     matrix(true,pred)
#     x_train, x_test, y_train, y_test = split(result,true)
#     for classifier,classifier_name in zip(classifiers,classifiers_name):
#         cls(classifiers,classifiers_name,x_train,x_test,y_train,y_test)