In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from textblob import TextBlob
from sklearn.pipeline import Pipeline, FeatureUnion
import sklearn.metrics as metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler,MaxAbsScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

<h2> Data Loading

In [40]:
df = pd.read_csv('/home/vlad/Anaconda_notebooks/Propagada_Detection/SLC_task_df.tsv',sep = '\t')
df = df.drop(columns = ['Unnamed: 0', 'article'])
print(f"DataFrame shape:{df.shape}")
df.head()

DataFrame shape:(16297, 2)


Unnamed: 0,text,label
0,US bloggers banned from entering UK\n,non-propaganda
1,Two prominent US bloggers have been banned fro...,non-propaganda
2,Pamela Geller and Robert Spencer co-founded an...,propaganda
3,They were due to speak at an English Defence L...,non-propaganda
4,A government spokesman said individuals whose ...,non-propaganda


<h2> Data tokenization,lemmatization,lowercasing, cleaning fucntion

In [41]:
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English


punctuations = string.punctuation # list of punctuation marks

stop_words = spacy.lang.en.stop_words.STOP_WORDS # list of stop words

parser = English() # english parser

# fucntion for tokenization, lemmatization, lowercasing, striping white space
def tokenizer(sentence):
    tokens = parser(sentence)
    tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens ]
    tokens = [ word for word in tokens if word not in stop_words and word not in punctuations ]

    # return list of tokens
    return tokens

<h2> Custom transformers

In [42]:
from sklearn.base import TransformerMixin, BaseEstimator
class Predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

def clean_text(text):
    return text.strip().lower()

In [12]:
"""
Adapted from code by @michelleful
https://github.com/michelleful/SingaporeRoadnameOrigins

"""

class DataTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, vars):
        self.vars = vars 
            
    def transform(self, data):
        return mydatatransform(data, self.vars)
    
    def fit(self, *_):
        return self

class TextExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name

    def transform(self, df):
        return np.asarray(df[self.column_name]).astype(str)
        
    def fit(self, *_):
        return self
    

class Apply(BaseEstimator, TransformerMixin):
    
    def __init__(self, fn):
        self.fn = np.vectorize(fn)
        
    def transform(self, data):
        return self.fn(data.reshape(data.size, 1))

    def fit(self, *_):
        return self

<h2> Spliting the data 80:20

In [5]:
# separate minority and majority classes
from sklearn.utils import resample
nonPropaganda = df[df.label=='non-propaganda']
propaganda = df[df.label=='propaganda']

# upsample minority
propagandaUpSampled = resample(propaganda,
                          replace=True, # sample with replacement
                          n_samples=len(nonPropaganda), # match number in majority class
                          random_state=0) # reproducible results

# combine majority and upsampled minority
df = pd.concat([propagandaUpSampled,nonPropaganda])

# check new class counts
df.label.value_counts()

non-propaganda    11577
propaganda        11577
Name: label, dtype: int64

<h3> Build in features (Character & word n-grams, tfidf transformer) 

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
vect_chr = CountVectorizer(ngram_range=(1,4), analyzer='char')
vect_words = CountVectorizer(tokenizer = tokenizer, ngram_range=(1,3), analyzer='word')
tfidf = TfidfTransformer()

NameError: name 'tokenizer' is not defined

## Classifiers

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

clf_log_reg = LogisticRegression(class_weight="balanced")
clf_nb = MultinomialNB()
clf_svc = LinearSVC(class_weight="balanced")
clf_rf = RandomForestClassifier()

<h3> Functions for features extraction

In [8]:
nlp = spacy.load("en_core_web_sm")
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words)) # average word length per sentence
def count_nouns(text):
    docs = nlp(str(text))
    nnp_list = [word.tag_ for word in docs if word.pos_ == "NOUN"]
    return len(nnp_list) # number of nouns
def count_adjectives(text):
    docs = nlp(str(text))
    jj_list = [word.tag_ for word in docs if word.pos_ == "ADJ"]
    return len(jj_list) # number of adjectives
def count_verbs(text):
    docs = nlp(str(text))
    verb_list = [word.tag_ for word in docs if word.pos_ == "VERB"]
    return len(verb_list) # number of verbs
def number_of_entities(text):
    docs = nlp(str(text))
    entities = [(i, i.label_, i.label) for i in docs.ents]
    return len(entities) # number of entities

<h3> Baseline
    

In [43]:
from sklearn.dummy import DummyClassifier
X = df.text
y = df.label
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create dummy classifer
baseline = DummyClassifier(strategy='uniform', random_state=1)

# "Train" model
baseline.fit(X_train, y_train)
predicted = baseline.predict(X_test)  
baselineScore = baseline.score(X_test, y_test)  

print("Baseline score:", baselineScore)
print(classification_report(y_test, predicted))

Baseline score: 0.5131288343558282
                precision    recall  f1-score   support

non-propaganda       0.71      0.51      0.60      2846
    propaganda       0.31      0.51      0.39      1229

      accuracy                           0.51      4075
     macro avg       0.51      0.51      0.49      4075
  weighted avg       0.59      0.51      0.53      4075



<h3> Data splitting

In [10]:
train_test_set = df.loc[:]
X = train_test_set[['text']] # Dataframe
y = train_test_set['label'] # Series
X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=0.2,random_state = 42)
print(f"Number of train instaces: {len(X_train)}")
print(f"Number of test instances: {len(X_test)}")

Number of train instaces: 18523
Number of test instances: 4631


<h3> Wordn n-grams vs Character n-grams

In [14]:
import warnings
warnings.filterwarnings('ignore')
###### Word n-grams
logreg = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,4), analyzer='word')),
    ('minmax', MaxAbsScaler()),
    ('clf', LogisticRegression(class_weight = "balanced")),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Word n-grams \n")
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))
###### Character n-grams
logreg = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,6), analyzer='char')),
    ('minmax', MaxAbsScaler()),
    ('clf', LogisticRegression(class_weight = "balanced")),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Character n-grams \n")
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Word n-grams 

Logistic Regression Accuracy: 0.9054199956812783
                precision    recall  f1-score   support

non-propaganda       0.90      0.91      0.91      2322
    propaganda       0.91      0.90      0.90      2309

      accuracy                           0.91      4631
     macro avg       0.91      0.91      0.91      4631
  weighted avg       0.91      0.91      0.91      4631

Character n-grams 

Logistic Regression Accuracy: 0.8846901317210106
                precision    recall  f1-score   support

non-propaganda       0.92      0.84      0.88      2322
    propaganda       0.85      0.93      0.89      2309

      accuracy                           0.88      4631
     macro avg       0.89      0.88      0.88      4631
  weighted avg       0.89      0.88      0.88      4631



<h3>Word n-grams + Character n-grams

In [17]:
pipeline_w_c = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('text_features', FeatureUnion([
        ('word-n-grams', Pipeline([
        ('cleaner', Predictors()),
        ('vectw', CountVectorizer(tokenizer = tokenizer, ngram_range=(1,4), analyzer='word')),
    ])),
     ('char-n-grams', Pipeline([
        ('vectc', CountVectorizer(ngram_range=(1,6), analyzer='char')),
    ]))])),
    ('tfidf',tfidf),
    ('minmax', MaxAbsScaler()),
    ('clf' , LogisticRegression(class_weight = "balanced")),
     ])
    
model = pipeline_w_c.fit(X_train, y_train)  
predicted = model.predict(X_test)    
print("Word + Character n-grams \n")
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Word + Character n-grams 

Logistic Regression Accuracy: 0.8827467069747355
                precision    recall  f1-score   support

non-propaganda       0.93      0.83      0.88      2322
    propaganda       0.85      0.94      0.89      2309

      accuracy                           0.88      4631
     macro avg       0.89      0.88      0.88      4631
  weighted avg       0.89      0.88      0.88      4631



<h3> All features

In [20]:
pipeline = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('text_features', FeatureUnion([
        ('vect_c', vect_chr),
        ('avg_word', Apply(lambda x: avg_word(x))),
        ('nouns', Apply(lambda x: count_nouns(x))),
        ('adjectives', Apply(lambda x: count_adjectives(x))),
        ('verbs', Apply(lambda x: count_verbs(x))),
        ('entities', Apply(lambda x: number_of_entities(x))),
        ('totalCharacters', Apply(lambda s: len(s.split()))),
        ('sentiment', Apply(lambda x: TextBlob(x).sentiment[0])),
    ])),
    ('minmax', MaxAbsScaler()),
    ('clf' , LogisticRegression(class_weight = "balanced")),   
])

model = pipeline.fit(X_train, y_train)  
predicted = model.predict(X_test)         
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Logistic Regression Accuracy: 0.8626646512632261
                precision    recall  f1-score   support

non-propaganda       0.91      0.80      0.85      2322
    propaganda       0.82      0.92      0.87      2309

      accuracy                           0.86      4631
     macro avg       0.87      0.86      0.86      4631
  weighted avg       0.87      0.86      0.86      4631



<h3> Char. n-grams + avg.words + sentiment + NER

In [29]:
pipeline = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('text_features', FeatureUnion([
     ('char-n-grams', Pipeline([
        ('cleaner', Predictors()),
        ('vectc', vect_chr),
        ('tfidf', tfidf),
    ])),
     ('avgWords', Pipeline([
        ('avg_word', Apply(lambda x: avg_word(x))),
        ('minmax', MinMaxScaler()),
    ])),
    ('sent', Pipeline([
        ('sentiment', Apply(lambda x: TextBlob(x).sentiment[0])),
        ('minmax', MinMaxScaler()),
    ])),
    ('ent', Pipeline([
        ('entities', Apply(lambda x: number_of_entities(x))),
        ('minmax', MinMaxScaler()),
    ])),
       
    ])),
    ('minmax', MaxAbsScaler()),
    ('clf' , clf_log_reg),   
])

model = pipeline.fit(X_train, y_train)  # train the classifier
predicted = model.predict(X_test)          # apply the model to the test data
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Logistic Regression Accuracy: 0.8641762038436622
                precision    recall  f1-score   support

non-propaganda       0.92      0.79      0.85      2322
    propaganda       0.82      0.93      0.87      2309

      accuracy                           0.86      4631
     macro avg       0.87      0.86      0.86      4631
  weighted avg       0.87      0.86      0.86      4631



<h3> Char. n-grams + sentiment + NER

In [30]:
pipeline = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('text_features', FeatureUnion([
     ('char-n-grams', Pipeline([
        ('cleaner', Predictors()),
        ('vectc', vect_chr),
        ('tfidf', tfidf),
    ])),
    ('sent', Pipeline([
        ('sentiment', Apply(lambda x: TextBlob(x).sentiment[0])),
        ('minmax', MinMaxScaler()),
    ])),
    ('ent', Pipeline([
        ('entities', Apply(lambda x: number_of_entities(x))),
        ('minmax', MinMaxScaler()),
    ])),
    ])),
    ('minmax', MaxAbsScaler()),
    ('clf' , clf_log_reg),   
])

model = pipeline.fit(X_train, y_train)  # train the classifier
predicted = model.predict(X_test)          # apply the model to the test data
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Logistic Regression Accuracy: 0.8641762038436622
                precision    recall  f1-score   support

non-propaganda       0.92      0.79      0.85      2322
    propaganda       0.82      0.93      0.87      2309

      accuracy                           0.86      4631
     macro avg       0.87      0.86      0.86      4631
  weighted avg       0.87      0.86      0.86      4631



<h3> Word. n-grams + sentiment + NER

In [31]:
pipeline = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('text_features', FeatureUnion([
     ('char-n-grams', Pipeline([
        ('cleaner', Predictors()),
        ('vectw', CountVectorizer(tokenizer = tokenizer, ngram_range=(1,4), analyzer='word')),
        ('tfidf', tfidf),
    ])),
    ('sent', Pipeline([
        ('sentiment', Apply(lambda x: TextBlob(x).sentiment[0])),
    ])),
    ('ent', Pipeline([
        ('entities', Apply(lambda x: number_of_entities(x))),
    ])),
    ])),
    ('minmax', MaxAbsScaler()),
    ('clf' , clf_log_reg),   
])

model = pipeline.fit(X_train, y_train)  # train the classifier
predicted = model.predict(X_test)          # apply the model to the test data
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Logistic Regression Accuracy: 0.8808032822284604
                precision    recall  f1-score   support

non-propaganda       0.90      0.86      0.88      2322
    propaganda       0.86      0.90      0.88      2309

      accuracy                           0.88      4631
     macro avg       0.88      0.88      0.88      4631
  weighted avg       0.88      0.88      0.88      4631



<h3> Word n-grams + sentiment + NER

In [33]:
pipeline = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('text_features', FeatureUnion([
     ('char-n-grams', Pipeline([
        ('cleaner', Predictors()),
        ('vectw', CountVectorizer(tokenizer = tokenizer, ngram_range=(1,4), analyzer='word')),
    ])),
    ('sent', Pipeline([
        ('sentiment', Apply(lambda x: TextBlob(x).sentiment[0])),
    ])),
    ('ent', Pipeline([
        ('entities', Apply(lambda x: number_of_entities(x))),
    ])),
    ])),
    ('minmax', MaxAbsScaler()),
    ('clf' , clf_log_reg),   
])

model = pipeline.fit(X_train, y_train)  # train the classifier
predicted = model.predict(X_test)          # apply the model to the test data
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Logistic Regression Accuracy: 0.8995897214424531
                precision    recall  f1-score   support

non-propaganda       0.89      0.91      0.90      2322
    propaganda       0.91      0.89      0.90      2309

      accuracy                           0.90      4631
     macro avg       0.90      0.90      0.90      4631
  weighted avg       0.90      0.90      0.90      4631



<h3> Suport Vector Classifier and Naive Bayes 

In [34]:
##### Word n-grams
logreg = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,4), analyzer='word')),
    ('minmax', MaxAbsScaler()),
    ('clf', clf_svc),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Word n-grams \n")
print("Support Vector Classifier Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))
###### Character n-grams
logreg = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,6), analyzer='char')),
    ('minmax', MaxAbsScaler()),
    ('clf', clf_svc),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Character n-grams \n")
print("Support Vector Classifier Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))
###### Word n-grams
logreg = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,4), analyzer='word')),
    ('minmax', MaxAbsScaler()),
    ('clf', clf_nb),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Word n-grams \n")
print("Naive Bayes Classifier Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))
###### Character n-grams
logreg = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,6), analyzer='char')),
    ('minmax', MaxAbsScaler()),
    ('clf', clf_nb),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Character n-grams \n")
print("Naive Bayes Classifier Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Word n-grams 

Support Vector Classifier Accuracy: 0.8957028719499028
                precision    recall  f1-score   support

non-propaganda       0.91      0.88      0.89      2322
    propaganda       0.88      0.91      0.90      2309

      accuracy                           0.90      4631
     macro avg       0.90      0.90      0.90      4631
  weighted avg       0.90      0.90      0.90      4631

Character n-grams 

Support Vector Classifier Accuracy: 0.8810192183113799
                precision    recall  f1-score   support

non-propaganda       0.92      0.83      0.88      2322
    propaganda       0.85      0.93      0.89      2309

      accuracy                           0.88      4631
     macro avg       0.88      0.88      0.88      4631
  weighted avg       0.88      0.88      0.88      4631

Word n-grams 

Naive Bayes Classifier Accuracy: 0.8129993521917512
                precision    recall  f1-score   support

non-propaganda       0.92      0.69      0.79      23

<h3> Random Forrest Classifier

In [None]:
###### Word n-grams
rf = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,4), analyzer='word')),
    ('minmax', MaxAbsScaler()),
    ('clf', clf_rf),
    ])
model = rf.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Word n-grams \n")
print("Random Forrest Classifier Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))
###### Character n-grams
rf = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,6), analyzer='char')),
    ('minmax', MaxAbsScaler()),
    ('clf', clf_rf),
    ])
model = rf.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Character n-grams \n")
print("Random Forrest Classifier Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Word n-grams 

Random Forrest Classifier Accuracy: 0.9252861153098683
                precision    recall  f1-score   support

non-propaganda       0.90      0.96      0.93      2322
    propaganda       0.95      0.90      0.92      2309

      accuracy                           0.93      4631
     macro avg       0.93      0.93      0.93      4631
  weighted avg       0.93      0.93      0.93      4631



import joblib
joblib.dump(model, "logReg_binary")

<h1>  Task II Fragment Level Classification 

In [7]:
from sklearn.utils import shuffle
Corpus = pd.read_csv("FLC_task2_df.tsv",sep = '\t')
Corpus = Corpus.drop(columns = ['Unnamed: 0'])
Corpus = shuffle(Corpus)
Corpus.head()

Unnamed: 0,article,text,lable
924,article702077434.txt,I ruined my life,Thought-terminating_Cliches
1620,article703056647.txt,local continue to balk that their rituals have...,Doubt
4943,article774007496.txt,beaten her to death,Loaded_Language
1850,article707451080.txt,Weinstein-level” sexual assault,"Name_Calling,Labeling"
4366,article765982381.txt,Where is the voice of conscience to condemn wh...,Doubt


In [8]:
Corpus['lable'].value_counts()

Loaded_Language                       1554
Name_Calling,Labeling                  678
Repetition                             549
Flag-Waving                            412
Causal_Oversimplification              342
Exaggeration,Minimisation              342
Doubt                                  311
Appeal_to_fear-prejudice               276
Slogans                                234
Appeal_to_Authority                    224
Black-and-White_Fallacy                203
Whataboutism,Straw_Men,Red_Herring     188
Thought-terminating_Cliches            133
Bandwagon,Reductio_ad_hitlerum         120
Name: lable, dtype: int64

<h3> Baseline

In [13]:
from sklearn.dummy import DummyClassifier

X = Corpus.text
y = Corpus.lable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create dummy classifer
baseline = DummyClassifier(strategy='uniform', random_state=1)

# "Train" model
baseline.fit(X_train, y_train)
predicted = baseline.predict(X_test)  
baselineScore = baseline.score(X_test, y_test)  

print("Baseline score:", baselineScore)
print(classification_report(y_test, predicted))

Baseline score: 0.07614942528735633
                                    precision    recall  f1-score   support

               Appeal_to_Authority       0.06      0.14      0.09        49
          Appeal_to_fear-prejudice       0.04      0.05      0.05        76
    Bandwagon,Reductio_ad_hitlerum       0.01      0.04      0.01        28
           Black-and-White_Fallacy       0.01      0.02      0.01        42
         Causal_Oversimplification       0.08      0.09      0.09        86
                             Doubt       0.06      0.08      0.07        73
         Exaggeration,Minimisation       0.09      0.10      0.10        97
                       Flag-Waving       0.08      0.08      0.08        96
                   Loaded_Language       0.33      0.07      0.12       398
             Name_Calling,Labeling       0.15      0.09      0.11       161
                        Repetition       0.06      0.04      0.05       151
                           Slogans       0.05      

<h3> Data splitting

In [14]:
train_test_set = Corpus.loc[:]
X = train_test_set[['text']] # Dataframe
y = train_test_set['lable'] # Series
X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=0.2,random_state = 42)
print(f"Number of train instaces: {len(X_train)}")
print(f"Number of test instances: {len(X_test)}")

Number of train instaces: 4452
Number of test instances: 1114


<h3> Word vs Character n-grams

In [27]:
print("Word n-grams \n")
pipeline = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('cleaner', Predictors()),
    ('vectc', CountVectorizer(ngram_range=(1,4), analyzer='word')),
    ('tfidf', tfidf),
    ('minmax', MaxAbsScaler()),
    ('clf', LogisticRegression(class_weight = "balanced")),   
])

model = pipeline.fit(X_train, y_train) 
predicted = model.predict(X_test)         
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

##############################################################################

print("Character n-grams \n")
pipeline = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    #('cleaner', Predictors()),
    ('vectc', CountVectorizer(ngram_range=(1,6), analyzer='char')),
    ('tfidf', tfidf),
    ('minmax', MaxAbsScaler()),
    ('clf', LogisticRegression(class_weight = "balanced")),   
])

model = pipeline.fit(X_train, y_train) 
predicted = model.predict(X_test)         
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Word n-grams
Logistic Regression Accuracy: 0.6032315978456014
                                    precision    recall  f1-score   support

               Appeal_to_Authority       0.65      0.60      0.62        58
          Appeal_to_fear-prejudice       0.35      0.25      0.29        52
    Bandwagon,Reductio_ad_hitlerum       0.77      0.62      0.69        32
           Black-and-White_Fallacy       0.82      0.86      0.84        36
         Causal_Oversimplification       0.66      0.80      0.73        76
                             Doubt       0.47      0.58      0.52        55
         Exaggeration,Minimisation       0.31      0.26      0.28        72
                       Flag-Waving       0.70      0.76      0.73        78
                   Loaded_Language       0.63      0.67      0.65       296
             Name_Calling,Labeling       0.56      0.45      0.50       137
                        Repetition       0.53      0.51      0.52       113
                         

<h3> Word + Character n-grams

In [28]:
pipeline_w_c = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('text_features', FeatureUnion([
        ('word-n-grams', Pipeline([
        ('cleaner', Predictors()),
        ('vectw', CountVectorizer(tokenizer = tokenizer, ngram_range=(1,4), analyzer='word')),
    ])),
     ('char-n-grams', Pipeline([
        ('cleaner', Predictors()),
        ('vectc', CountVectorizer(ngram_range=(1,6), analyzer='char')),
    ]))])),
    ('tfidf',tfidf),
    ('minmax', MaxAbsScaler()),
    ('clf' , LogisticRegression(class_weight = "balanced")),
     ])
    
model = pipeline_w_c.fit(X_train, y_train)  
predicted = model.predict(X_test)    
print("Word + Character n-grams \n")
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Word + Character n-grams 

Logistic Regression Accuracy: 0.6391382405745063
                                    precision    recall  f1-score   support

               Appeal_to_Authority       0.69      0.59      0.64        58
          Appeal_to_fear-prejudice       0.39      0.40      0.40        52
    Bandwagon,Reductio_ad_hitlerum       0.80      0.62      0.70        32
           Black-and-White_Fallacy       0.88      0.83      0.86        36
         Causal_Oversimplification       0.72      0.91      0.80        76
                             Doubt       0.50      0.56      0.53        55
         Exaggeration,Minimisation       0.42      0.35      0.38        72
                       Flag-Waving       0.67      0.79      0.73        78
                   Loaded_Language       0.66      0.68      0.67       296
             Name_Calling,Labeling       0.59      0.53      0.56       137
                        Repetition       0.53      0.56      0.55       113
           

<h3> Chararacter n-grams + sentiment + NER

In [33]:
pipeline = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('text_features', FeatureUnion([
     ('char-n-grams', Pipeline([
        ('cleaner', Predictors()),
        ('vectc', CountVectorizer(tokenizer = tokenizer, ngram_range=(1,6), analyzer='char')),
        ('tfidf', tfidf),
    ])),
    ('sent', Pipeline([
        ('sentiment', Apply(lambda x: TextBlob(x).sentiment[0])),
    ])),
    ('ent', Pipeline([
        ('entities', Apply(lambda x: number_of_entities(x))),

    ])),
       
    ])),
    ('minmax', MaxAbsScaler()),
    ('clf' , clf_log_reg),   
])

model = pipeline.fit(X_train, y_train)  # train the classifier
predicted = model.predict(X_test)          # apply the model to the test data
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Logistic Regression Accuracy: 0.6409335727109515
                                    precision    recall  f1-score   support

               Appeal_to_Authority       0.65      0.62      0.64        58
          Appeal_to_fear-prejudice       0.39      0.40      0.40        52
    Bandwagon,Reductio_ad_hitlerum       0.80      0.62      0.70        32
           Black-and-White_Fallacy       0.91      0.83      0.87        36
         Causal_Oversimplification       0.73      0.91      0.81        76
                             Doubt       0.53      0.56      0.54        55
         Exaggeration,Minimisation       0.41      0.35      0.38        72
                       Flag-Waving       0.68      0.78      0.73        78
                   Loaded_Language       0.67      0.67      0.67       296
             Name_Calling,Labeling       0.59      0.55      0.57       137
                        Repetition       0.53      0.56      0.54       113
                           Slogans    

<h3> All features

In [31]:
pipeline = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('text_features', FeatureUnion([
        ('vect_c', vect_chr),
        ('avg_word', Apply(lambda x: avg_word(x))),
        ('nouns', Apply(lambda x: count_nouns(x))),
        ('adjectives', Apply(lambda x: count_adjectives(x))),
        ('verbs', Apply(lambda x: count_verbs(x))),
        ('entities', Apply(lambda x: number_of_entities(x))),
        ('totalCharacters', Apply(lambda s: len(s.split()))),
        ('sentiment', Apply(lambda x: TextBlob(x).sentiment[0])),
    ])),
    ('minmax', MaxAbsScaler()),
    ('clf' , LogisticRegression(class_weight = "balanced")),   
])

model = pipeline.fit(X_train, y_train)  
predicted = model.predict(X_test)         
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Logistic Regression Accuracy: 0.6166965888689407
                                    precision    recall  f1-score   support

               Appeal_to_Authority       0.61      0.60      0.61        58
          Appeal_to_fear-prejudice       0.30      0.25      0.27        52
    Bandwagon,Reductio_ad_hitlerum       0.83      0.62      0.71        32
           Black-and-White_Fallacy       0.77      0.83      0.80        36
         Causal_Oversimplification       0.83      0.84      0.84        76
                             Doubt       0.52      0.42      0.46        55
         Exaggeration,Minimisation       0.37      0.39      0.38        72
                       Flag-Waving       0.72      0.81      0.76        78
                   Loaded_Language       0.64      0.65      0.64       296
             Name_Calling,Labeling       0.55      0.51      0.53       137
                        Repetition       0.50      0.58      0.54       113
                           Slogans    

<h3> Suport Vector Classifier and Naive Bayes 

In [32]:
##### Word n-grams
logreg = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,4), analyzer='word')),
    ('minmax', MaxAbsScaler()),
    ('clf', clf_svc),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Word n-grams \n")
print("Support Vector Classifier Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))
###### Character n-grams
logreg = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,6), analyzer='char')),
    ('minmax', MaxAbsScaler()),
    ('clf', clf_svc),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Character n-grams \n")
print("Support Vector Classifier Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))
###### Word n-grams
logreg = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,4), analyzer='word')),
    ('minmax', MaxAbsScaler()),
    ('clf', clf_nb),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Word n-grams \n")
print("Naive Bayes Classifier Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))
###### Character n-grams
logreg = Pipeline([
    ('text_extractor', TextExtractor('text')), 
    ('vect', CountVectorizer(ngram_range=(1,6), analyzer='char')),
    ('minmax', MaxAbsScaler()),
    ('clf', clf_nb),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Character n-grams \n")
print("Naive Bayes Classifier Accuracy:",metrics.accuracy_score(y_true, predicted))
print(classification_report(y_true, predicted))

Word n-grams 

Support Vector Classifier Accuracy: 0.5763016157989228
                                    precision    recall  f1-score   support

               Appeal_to_Authority       0.70      0.55      0.62        58
          Appeal_to_fear-prejudice       0.23      0.12      0.15        52
    Bandwagon,Reductio_ad_hitlerum       0.77      0.62      0.69        32
           Black-and-White_Fallacy       0.81      0.81      0.81        36
         Causal_Oversimplification       0.92      0.64      0.76        76
                             Doubt       0.57      0.24      0.33        55
         Exaggeration,Minimisation       0.33      0.26      0.29        72
                       Flag-Waving       0.68      0.81      0.74        78
                   Loaded_Language       0.54      0.76      0.63       296
             Name_Calling,Labeling       0.56      0.36      0.44       137
                        Repetition       0.43      0.50      0.47       113
                 

<h1> Fragment level classification

In [36]:
df = pd.read_csv('/home/vlad/Anaconda_notebooks/Propagada_Detection/Fragment_classification.tsv',sep = '\t')
df = df.drop(columns = ['Unnamed: 0'])
df = shuffle(df)
df.shape

(17143, 2)

In [44]:
X = df.text
y = df.label
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [59]:
import warnings
warnings.filterwarnings('ignore')
###### Word n-grams
logreg = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,3), analyzer='word')),
    ('tfidf', TfidfTransformer()),
    ('minmax', MaxAbsScaler()),
    ('clf', LogisticRegression(class_weight = "balanced")),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Word n-grams \n")
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print(classification_report(y_test, predicted))

Word n-grams 

Logistic Regression Accuracy: 0.7312883435582822
                precision    recall  f1-score   support

non-propaganda       0.79      0.83      0.81      2846
    propaganda       0.56      0.50      0.53      1229

      accuracy                           0.73      4075
     macro avg       0.68      0.67      0.67      4075
  weighted avg       0.72      0.73      0.73      4075



In [60]:
import joblib
joblib.dump(model, "logReg_fragment_word")

['logReg_fragment_word']

In [58]:
import warnings
warnings.filterwarnings('ignore')
###### Word n-grams
logreg = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,3), analyzer='word')),
    ('tfidf', TfidfTransformer()),
    ('minmax', MaxAbsScaler()),
    ('clf', clf_svc),
    ])
model = logreg.fit(X_train, y_train)  
predicted = model.predict(X_test)       
print("Word n-grams \n")
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print(classification_report(y_test, predicted))

Word n-grams 

Logistic Regression Accuracy: 0.7312883435582822
                precision    recall  f1-score   support

non-propaganda       0.79      0.83      0.81      2846
    propaganda       0.56      0.49      0.53      1229

      accuracy                           0.73      4075
     macro avg       0.68      0.66      0.67      4075
  weighted avg       0.72      0.73      0.73      4075



['logReg_fragment2']