# Feature Extraction

In [1]:
#imports
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk import word_tokenize
import spacy 
import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()


In [2]:
df = pd.read_csv('sentence_classifications_.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,article_id,sentence_char_start,sentence_char_end,propaganda,propaganda_type,text,prop_start_char,prop_end_char,prop_txt_snippet,sent_#
0,0,701225819,0,59,non-propaganda,,South Florida Muslim Leader Sofian Zakkout’s D...,,,,1
1,1,701225819,60,207,propaganda,"Name_Calling,Labeling","David Duke, the white supremacist icon and for...",111.0,143.0,Grand Wizard of the Ku Klux Klan,2
2,2,701225819,207,382,propaganda,Loaded_Language,"However, one individual who represents the Mus...",305.0,313.0,enamored,3
3,3,701225819,382,525,non-propaganda,,"Last month, once again, Zakkout chose to showc...",,,,4
4,4,701225819,525,595,non-propaganda,,The postings can be rivaled only by Zakkout’s ...,,,,5


In [4]:
df_1 = df[df['text'].isna()==False]
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15220 entries, 0 to 15240
Data columns (total 11 columns):
Unnamed: 0             15220 non-null int64
article_id             15220 non-null int64
sentence_char_start    15220 non-null int64
sentence_char_end      15220 non-null int64
propaganda             15220 non-null object
propaganda_type        3840 non-null object
text                   15220 non-null object
prop_start_char        3840 non-null float64
prop_end_char          3840 non-null float64
prop_txt_snippet       3840 non-null object
sent_#                 15220 non-null int64
dtypes: float64(2), int64(5), object(4)
memory usage: 1.4+ MB


In [5]:
#LOOKING FOR DIFFERENCES IN PROP AND NON PROP FOR CERTAIN WORDS/TOPICS... MIGHT NEED TO CHANGE SOME LABELS

In [6]:
non_prop = df_1[(df_1['propaganda']=='non-propaganda')]
prop = df_1[(df_1['propaganda']=='propaganda')]


## Feature Extraction From Sentences

### Sentence Sentiment Score

In [7]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [8]:
def sentiment_scores(sentence):
    snt = analyser.polarity_scores(sentence)
    return snt

In [9]:
def sentence_sentiment_score(new_col, transform_col, df):
    df[new_col] = df[transform_col].apply(lambda x: sentiment_scores(x)['compound'])
    return df

In [10]:
#create new column with vader sentiment score for each sentence
df_1 = sentence_sentiment_score('sentiment_score','text',df_1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
#create new column with absolute value of vader sentiment score for each sentence
df_1['abs_sent_score'] = df_1['sentiment_score'].apply(lambda x: abs(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
df_1.head()

Unnamed: 0.1,Unnamed: 0,article_id,sentence_char_start,sentence_char_end,propaganda,propaganda_type,text,prop_start_char,prop_end_char,prop_txt_snippet,sent_#,sentiment_score,abs_sent_score
0,0,701225819,0,59,non-propaganda,,South Florida Muslim Leader Sofian Zakkout’s D...,,,,1,0.0,0.0
1,1,701225819,60,207,propaganda,"Name_Calling,Labeling","David Duke, the white supremacist icon and for...",111.0,143.0,Grand Wizard of the Ku Klux Klan,2,0.5423,0.5423
2,2,701225819,207,382,propaganda,Loaded_Language,"However, one individual who represents the Mus...",305.0,313.0,enamored,3,0.3612,0.3612
3,3,701225819,382,525,non-propaganda,,"Last month, once again, Zakkout chose to showc...",,,,4,0.0,0.0
4,4,701225819,525,595,non-propaganda,,The postings can be rivaled only by Zakkout’s ...,,,,5,0.0,0.0


### Sentence Punctuation Count

In [13]:
import string  
punctuation = string.punctuation

In [14]:
punctuation = punctuation + '”'
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~”'

In [15]:
def count_punct(text):
    count = 0
    for char in text:
        if char in punctuation:
            count +=1
    return count

def count_punct_col(df,transform_col,new_col):
    df[new_col]=df[transform_col].apply(lambda x: count_punct(x))
    return df
    

In [16]:
test_string = 'He said: "HELLO!"... but that is it... no more'
count_punct(test_string)

10

In [17]:
df_1 = count_punct_col(df_1,'text','punct_count')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [18]:
df_1.head()

Unnamed: 0.1,Unnamed: 0,article_id,sentence_char_start,sentence_char_end,propaganda,propaganda_type,text,prop_start_char,prop_end_char,prop_txt_snippet,sent_#,sentiment_score,abs_sent_score,punct_count
0,0,701225819,0,59,non-propaganda,,South Florida Muslim Leader Sofian Zakkout’s D...,,,,1,0.0,0.0,0
1,1,701225819,60,207,propaganda,"Name_Calling,Labeling","David Duke, the white supremacist icon and for...",111.0,143.0,Grand Wizard of the Ku Klux Klan,2,0.5423,0.5423,4
2,2,701225819,207,382,propaganda,Loaded_Language,"However, one individual who represents the Mus...",305.0,313.0,enamored,3,0.3612,0.3612,4
3,3,701225819,382,525,non-propaganda,,"Last month, once again, Zakkout chose to showc...",,,,4,0.0,0.0,5
4,4,701225819,525,595,non-propaganda,,The postings can be rivaled only by Zakkout’s ...,,,,5,0.0,0.0,1


### Simple Tokenizer for Word Count, Word Length, & POS counter

In [19]:
# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

In [20]:
contr_dict={"I’m": "I am",
            "won’t": "will not",
            "’s" : "", 
            "’ll":"will",
            "’ve ":"have ",
            "n’t":"not",
            "’re": "are",
            "’d": "would",
            "y’all": "all of you",
            "I'm": "I am",
            "won't": "will not",
            "'s" : "", 
            "'ll":"will",
            "'ve ":"have ",
            "n't":"not",
            "'re": "are",
            "'d": "would",
            "y'all": "all of you"}
contr_dict.keys()


dict_keys(['I’m', 'won’t', '’s', '’ll', '’ve ', 'n’t', '’re', '’d', 'y’all', "I'm", "won't", "'s", "'ll", "'ve ", "n't", "'re", "'d", "y'all"])

In [21]:
def replace_contractions(sentence, contr_dict=contr_dict):
    for contr in contr_dict.keys():
        if contr in sentence:
            sentence = sentence.replace(contr,contr_dict[contr])
    return sentence

In [22]:
sent_1 = '''After a period of review, the professor changed Arnold's grade to a B+, according to BuzzFeed News.'''

In [23]:
replace_contractions(sent_1,contr_dict)

'After a period of review, the professor changed Arnold grade to a B+, according to BuzzFeed News.'

In [24]:
def simple_tokenizer(text):
    text = replace_contractions(text,contr_dict)
    mytokens = nlp(text)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [word for word in mytokens if word not in punctuation]
    return mytokens

In [25]:
test_tokens = simple_tokenizer('''South Florida Muslim Leader Sofian Zakkout’s David Duke Day''')

In [26]:
test_tokens

['south',
 'florida',
 'muslim',
 'leader',
 'sofian',
 'zakkout',
 'david',
 'duke',
 'day']

### Sentence Word Count

In [27]:
def word_count(tokens):
    return len(tokens)

In [28]:
word_count(test_tokens)

9

In [29]:
def word_count_col(df, new_col, transform_col):
    df[new_col] = df[transform_col].apply(lambda x: word_count(simple_tokenizer(x)))
    return df


In [30]:
df_1 = word_count_col(df_1,'word_count','text')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [31]:
df_1.head()

Unnamed: 0.1,Unnamed: 0,article_id,sentence_char_start,sentence_char_end,propaganda,propaganda_type,text,prop_start_char,prop_end_char,prop_txt_snippet,sent_#,sentiment_score,abs_sent_score,punct_count,word_count
0,0,701225819,0,59,non-propaganda,,South Florida Muslim Leader Sofian Zakkout’s D...,,,,1,0.0,0.0,0,9
1,1,701225819,60,207,propaganda,"Name_Calling,Labeling","David Duke, the white supremacist icon and for...",111.0,143.0,Grand Wizard of the Ku Klux Klan,2,0.5423,0.5423,4,26
2,2,701225819,207,382,propaganda,Loaded_Language,"However, one individual who represents the Mus...",305.0,313.0,enamored,3,0.3612,0.3612,4,27
3,3,701225819,382,525,non-propaganda,,"Last month, once again, Zakkout chose to showc...",,,,4,0.0,0.0,5,22
4,4,701225819,525,595,non-propaganda,,The postings can be rivaled only by Zakkout’s ...,,,,5,0.0,0.0,1,11


### Part of Speech Counter

### POS: Proportion of Adj, Nouns, Verbs, Adverbs

In [32]:
def adj_counter(text):
    text = replace_contractions(text,contr_dict)
    mytokens = nlp(text)
    adjs = [word for word in mytokens if word.pos_ == "ADJ"]
    return (len(adjs))

In [33]:
def verb_counter(text):
    text = replace_contractions(text,contr_dict)
    mytokens = nlp(text)
    verbs = [word for word in mytokens if word.pos_ == "VERB"]
    return (len(verbs))

In [34]:
def adv_counter(text):
    text = replace_contractions(text,contr_dict)
    mytokens = nlp(text)
    advs = [word for word in mytokens if word.pos_ == "ADV"]
    return (len(advs))

In [35]:
def noun_counter(text):
    text = replace_contractions(text,contr_dict)
    mytokens = nlp(text)
    nouns = [word for word in mytokens if word.pos_ == "NOUN"]
    return (len(nouns))

In [36]:
a = adj_counter('She is a very tall person')
v = verb_counter('She is a very tall person')
d = adv_counter('She is a very tall person')
n = noun_counter('She is a very tall person')
print(a,v,d,n)

1 0 1 1


In [37]:
df_1['%adj'] = df_1.text.apply(lambda x: adj_counter(x)/word_count(x))
df_1['%verb'] = df_1.text.apply(lambda x: verb_counter(x)/word_count(x))
df_1['%adv'] = df_1.text.apply(lambda x: adv_counter(x)/word_count(x))
df_1['%noun'] = df_1.text.apply(lambda x: noun_counter(x)/word_count(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

### Sentence Avg. Word Length

In [38]:
def avg_word_length(tokens):
    word_lengths = [len(token) for token in tokens]
    return np.mean(word_lengths)

In [39]:
avg_word_length(test_tokens)

5.444444444444445

In [40]:
def avg_word_length_col(df, new_col, transform_col):
    df[new_col] = df[transform_col].apply(lambda x: avg_word_length(simple_tokenizer(x)))
    return df

In [41]:
df_1 = avg_word_length_col(df_1, 'avg_word_length', 'text')

  out=out, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [43]:
df_1[df_1['propaganda']=='propaganda'].loc[1]

Unnamed: 0                                                             1
article_id                                                     701225819
sentence_char_start                                                   60
sentence_char_end                                                    207
propaganda                                                    propaganda
propaganda_type                                    Name_Calling,Labeling
text                   David Duke, the white supremacist icon and for...
prop_start_char                                                      111
prop_end_char                                                        143
prop_txt_snippet                        Grand Wizard of the Ku Klux Klan
sent_#                                                                 2
sentiment_score                                                   0.5423
abs_sent_score                                                    0.5423
punct_count                                        

In [44]:
df_1[df_1['propaganda']=='non-propaganda'].loc[3]

Unnamed: 0                                                             3
article_id                                                     701225819
sentence_char_start                                                  382
sentence_char_end                                                    525
propaganda                                                non-propaganda
propaganda_type                                                      NaN
text                   Last month, once again, Zakkout chose to showc...
prop_start_char                                                      NaN
prop_end_char                                                        NaN
prop_txt_snippet                                                     NaN
sent_#                                                                 4
sentiment_score                                                        0
abs_sent_score                                                         0
punct_count                                        

In [46]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15220 entries, 0 to 15240
Data columns (total 20 columns):
Unnamed: 0             15220 non-null int64
article_id             15220 non-null int64
sentence_char_start    15220 non-null int64
sentence_char_end      15220 non-null int64
propaganda             15220 non-null object
propaganda_type        3840 non-null object
text                   15220 non-null object
prop_start_char        3840 non-null float64
prop_end_char          3840 non-null float64
prop_txt_snippet       3840 non-null object
sent_#                 15220 non-null int64
sentiment_score        15220 non-null float64
abs_sent_score         15220 non-null float64
punct_count            15220 non-null int64
word_count             15220 non-null int64
%adj                   15220 non-null float64
%verb                  15220 non-null float64
%adv                   15220 non-null float64
%noun                  15220 non-null float64
avg_word_length        15172 non-null f

In [47]:
df_2 = df_1.drop(['Unnamed: 0','sentence_char_start','sentence_char_end',
                  'prop_start_char','prop_end_char'], axis=1)

In [48]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15220 entries, 0 to 15240
Data columns (total 15 columns):
article_id          15220 non-null int64
propaganda          15220 non-null object
propaganda_type     3840 non-null object
text                15220 non-null object
prop_txt_snippet    3840 non-null object
sent_#              15220 non-null int64
sentiment_score     15220 non-null float64
abs_sent_score      15220 non-null float64
punct_count         15220 non-null int64
word_count          15220 non-null int64
%adj                15220 non-null float64
%verb               15220 non-null float64
%adv                15220 non-null float64
%noun               15220 non-null float64
avg_word_length     15172 non-null float64
dtypes: float64(7), int64(4), object(4)
memory usage: 1.9+ MB


In [49]:
df_2.to_csv('sentence_features.csv',index=False)