In [1]:
!pip install textblob
!pip install textstat
!pip install nltk
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install sklearn



[93m    Linking successful[0m
    /anaconda3/lib/python3.6/site-packages/en_core_web_sm -->
    /anaconda3/lib/python3.6/site-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



In [5]:
import pandas as pd
from textblob import TextBlob
import textstat
import nltk
import spacy
from collections import Counter
import en_core_web_sm
import os
import sklearn

In [6]:
def dataprep_task2(path):
    """Dataprep for Task2 It will return the new data
    :param path: Path to the article's taks3 labels file.
    Example:
    >>> dataprep_task2("datasets-v5/tasks-2-3/train/article111111112.task2.labels")
    Note the method will return Pandas DataFrame
    """
    dir_name = os.path.dirname(path)
    article_id = os.path.basename(path).split('.')[0]
    article_name = os.path.join(dir_name, f'{article_id}.txt')

    with open(article_name, 'r') as f:
        records = f.readlines()

    df = pd.DataFrame(records, columns=['sentences'])

    another_df = pd.read_csv(path, sep='\t', names = ['article', 'N_sentence', 'is_propaganda'])

    result_df = pd.concat([df, another_df], axis=1)

    return result_df.loc[result_df['sentences'] != '\n', :]

df_task2 = pd.DataFrame([])
list_articles = [x for x in os.listdir('/Users/rmania/Downloads/datasets-v5/tasks-2-3/train/') if (x.split('.')[-2] == 'task2' and x.split('.')[-1]=='labels')]
for file in list_articles:
    df_task2 = df_task2.append(dataprep_task2('/Users/rmania/Downloads/datasets-v5/tasks-2-3/train/' + file))

df_task2.to_csv('DataTask2.csv')
df = df_task2
df.shape[0]

14263

In [17]:
df = df.reset_index()

In [18]:
df.loc[0, 'sentences']

'US bloggers banned from entering UK\n'

In [19]:
for article in range(df.shape[0]):
    df.loc[article, 'sentiment_polarity'] = TextBlob(df.loc[article, 'sentences']).sentiment[0]
    df.loc[article, 'sentiment_subjectivity'] = TextBlob(df.loc[article, 'sentences']).sentiment[1]

In [20]:
#entities tagging
nlp = en_core_web_sm.load()

df_POS = pd.Series([])
for article in range(df.shape[0]):
    df_POS = df_POS.append(pd.DataFrame([Counter([x.label_ for x in nlp(df.loc[article, \
                                                            'sentences']).ents])]))

In [22]:
df_readability = pd.DataFrame({'flesch_reading_ease': [textstat.flesch_reading_ease(article) for article in df['sentences']],
                 'smog_index': [textstat.smog_index(article) for article in df['sentences']] ,
                 'flesch_kincaid_grade': [textstat.flesch_kincaid_grade(article) for article in df['sentences']] ,
                 'coleman_liau_index': [textstat.coleman_liau_index(article) for article in df['sentences']] ,
                 'automated_readability_index': [textstat.automated_readability_index(article) for article in df['sentences']], 
                'dale_chall_readability_score': [textstat.dale_chall_readability_score(article) for article in df['sentences']],
                'difficult_words': [textstat.difficult_words(article) for article in df['sentences']],
              'linsear_write_formula': [textstat.linsear_write_formula(article) for article in df['sentences']],
              'gunning_fog': [textstat.gunning_fog(article) for article in df['sentences']],
           'text_standard': [textstat.text_standard(article) for article in df['sentences']]})   
    
    

In [28]:
df_POS.to_csv('/Users/rmania/repos/df_POS_task2.csv')
df_readability.to_csv('/Users/rmania/repos/df_readability_task2.csv')

In [29]:
#Add Affin scores
affin_scores = pd.read_csv('/Users/rmania/repos/data_case2_with_affin.csv')

In [27]:
df_with_features = pd.concat([df[2:], df_POS, df_readability], axis = 1)

KeyboardInterrupt: 

In [None]:
df_with_features = pd.concat([df, df_POS, df_readability], axis = 1)

In [31]:
df.head()

Unnamed: 0,index,sentences,article,N_sentence,is_propaganda,sentiment_polarity,sentiment_subjectivity
0,0,US bloggers banned from entering UK\n,111111112,1,non-propaganda,0.0,0.0
1,2,Two prominent US bloggers have been banned fro...,111111112,3,non-propaganda,0.5,1.0
2,4,Pamela Geller and Robert Spencer co-founded an...,111111112,5,propaganda,0.0,0.0
3,6,They were due to speak at an English Defence L...,111111112,7,non-propaganda,-0.108333,0.125
4,8,A government spokesman said individuals whose ...,111111112,9,non-propaganda,0.35,0.333333


In [42]:
df_with_affine = df.merge(affin_scores, how = 'inner', \
                                        left_on = ['article', 'N_sentence'],\
                                        right_on = ['article', 'N_sentence'])

In [45]:
df_with_affine.columns

Index(['index', 'article', 'N_sentence', 'is_propaganda_x',
       'sentiment_polarity', 'sentiment_subjectivity', 'sentences_y',
       'affin_score'],
      dtype='object')

In [43]:
df_with_affine = df_with_affine.drop(labels = ['sentences_x', 'is_propaganda_y'], axis = 1)

In [47]:
df_with_affine.columns = ['index', 'article', 'N_sentence', 'is_propaganda',
       'sentiment_polarity', 'sentiment_subjectivity', 'sentences', 'affin_score']

In [50]:
df_with_affine.head()
df_with_affine.shape

(14263, 8)

In [55]:
#feat = pd.concat([df_POS, df_readability], axis = 1)

In [54]:
#df_with_features = pd.concat([df_with_affine, df_POS, df_readability], axis = 1)

In [57]:
df_with_features= pd.read_csv('/Users/rmania/repos/df_task2_with_features.csv')

In [58]:

import pandas as pd
import re
from itertools import chain
import os
import sys
import pdb

# nltk
import nltk
from nltk import tokenize
from nltk.corpus import wordnet as wn
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords


# spaCy
import spacy
#import spacy_wordnet
#from spacy_wordnet.wordnet_annotator import WordnetAnnotator

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Configuration reader
#sys.path.append(os.path.abspath("..\\Utils"))
#from config_parser import read_configuration

class DataTokenization:

    def __init__(self, tickets):
        """
        Init DataTokenization object

        Args:
            ds: Dataframe (pandas type) having only index and one column with single document per record.
        """

        # Check if the input meets expectations
        if not isinstance(tickets, list):
            raise Exception('The input should be of type list')

        self.tickets = tickets
  #      self.data_parser_conf = read_configuration("dataParserService")
      
        # define default values for different attributes
        # if they were not provided in the configuration file

        # tickets_tokenizer args
        
        self.use_stemming = "Yes"

        self.n_grams_type = 'simple'
    
        self.n_grams = "3"
   
        self.stopwords = stopwords.words('english')
    
        self.use_phrases = "Yes"


    def _document_tokenizer(self,
                            document: str,
                            m_stopwords: list,
                            wordnet_lemmatizer,
                            stemmer,
                            synonyms_method):
        """
        Args:
            document: a document. Could be from single word to multiple lines/paragraphs text.
            m_stopwords [Optional]: This is a list of the stopwords which should be excluded from the tokens
                                  The default value is empty list which means no words will be excluded.
            stemmer: [object] Stemmming object.         
            wordnet_lemmatizer: [object] Lemmatizer object.         
            synonyms_method [Optional]: How the synonyms are exctacted - using nltk or spacy

        Returns:
            A list of prepared tokens of the documnet.

        Example:
            input document:
            'John.Doe@examplemail.com cannot log onto the SDE (or reset the password).
            Emma raised ART request  for an SDE login for John.Doe@examplemail.com
            This request has status "COMPLETE", but in the "COMMENTS" field there is a message "Failed to add user Example.name to PORT_T_NEW"

            outputted tokens:
            ['log', 'onto', 'sde', 'reset', 'password', 'rais', 'request', 'sde', 'login',
            'request', 'status', 'complet', 'comment', 'messag', 'fail', 'add', 'user', 'examplename', 'porttnew']
        """     
        
        try:
            # remove email adresses
         #   document = ' '.join([t for t in document.split() if not any(c == '@' for c in t)]) 

            # split string into words (tokens)
            tokens = tokenize.word_tokenize(document)   

            # remove any digits, i.e. "3rd edition"; remove any path, i.e. "//math/lib" and remove any path on PC
            # v1
            tokens = [t for t in tokens if not any((c.isdigit() or c == '/' or c == '\\') for c in t)] 
            # v2
            # tokens = [re.sub(r'^https?:\/\/.*[\r\n]*', '', t, flags=re.MULTILINE) for t in tokens]

            # remove special chars
            tokens = [re.sub(r'[^a-zA-Z0-9\s]', '', t) for t in tokens]  
            
            # remove short words which don't contain capital letters. 
            tokens = [t for t in tokens if not (len(t) <= 2 and t != 'no' and t != 'nt' and (t.islower()) or t == '')] 
            
            # downcase
            tokens = [t.lower() for t in tokens]  
            
            # remove stopwords
            tokens = [t for t in tokens if t not in m_stopwords]  
            
            # put words into base form
            tokens = [wordnet_lemmatizer.lemmatize(t, 'v') for t in tokens]  

            # put words into core form
            # execute only if no synonyms are needed
            # if both stemming and synonyms should be added - this is added in visual script
            if ((self.use_stemming.lower().strip() == 'yes'
                or self.use_stemming.lower().strip() == 'y')
               and self.use_phrases.lower().strip() != 'yes'
                and self.use_phrases.lower().strip() != 'y'):

                tokens = [stemmer.stem(t) for t in tokens]  

        except Exception as err:
            raise Exception("There was an issue with tokenizing a document:" + str(err))

        return tokens

  

    def _phrase_generator(self, 
                         wordnet_lemmatizer, stemmer):
        """
        This method generates the phrases based on prepared tickets from 
        _document_tokenizer method.

        Args:
            wordnet_lemmatizer: [object] Lemmatizer object. 

         Returns:
            A list of prepared phreses of the corpus.        
        """

        corpus_all_phrases = \
            [self._document_tokenizer(self.tickets[doc], m_stopwords = self.stopwords,
             wordnet_lemmatizer = wordnet_lemmatizer, stemmer = '', synonyms_method = 'none') 
             for doc in range(len(self.tickets))]

        docs = [' '.join([t for t in doc]) for doc in corpus_all_phrases]

        # set spaCy object with wordnet annotator
        nlp = spacy.load('en_core_web_sm')
#        nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')
        bad_deps = ('prep', 'aux')
        phrase_list = []
        phrase_list_per_doc = []

        for doc in docs:
            # take the phases using spaCy object with noun_chunks attribute
            phrase_list_per_doc = []
            for phrase in nlp(doc).noun_chunks:
               
                # exclude bad_deps
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]

                # check if our phrase have one or more word and            
                if len(phrase) > 1 and len(str(phrase))>1:
                    # Merge the tokens, e.g. good_ideas
                    phrase = str(phrase.text).lower().replace(" ", "_")
                    phrase_list_per_doc.append(phrase)
        
                if  len(phrase_list_per_doc) > 0:    
                    phrase_list.append(phrase_list_per_doc)
                else:
                     phrase_list.append("None")
        return phrase_list


    def tickets_tokenizer(self):
       
        # load object from the related packages - Lemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()

        # load object from the related packages - Stemming
        if (self.use_stemming.lower().strip() == 'yes'
         or self.use_stemming.lower().strip() == 'y'):
            stemmer = SnowballStemmer("english")
        else:
            stemmer = ''

        # Perform the tokenization
        corpus_all_tokens = \
            [self._document_tokenizer(self.tickets[doc], m_stopwords = self.stopwords,
             wordnet_lemmatizer = wordnet_lemmatizer, stemmer = stemmer, synonyms_method = '') 
             for doc in range(len(self.tickets))]

        corpus_all_tokens = [' '.join(doc) for doc in corpus_all_tokens]
        corpus_all_tokens = [doc if str(doc).strip() != '' else 'test' for doc in corpus_all_tokens]

        # Create a list of list per token for bi-grams
        n_grams_2 = []
        n_grams_3 = []

        if (int(self.n_grams) != 0 and
            int(self.n_grams) != 2 and 
            int(self.n_grams) != 3):
            raise Exception("You need to enter valid value for n-grams: 0, 2 or 3 as  integer value.s \n You have used {self.n_grams} !")        

        if (int(self.n_grams) == 2 or int(self.n_grams) == 3):
#pdb.set_trace()
            if (self.n_grams_type.lower().strip() == 'simple'):
                n_grams_2 = [[a + "_" + b for a,b in ngrams(tokens.split(), 2)] for tokens in corpus_all_tokens]  
                if (int(self.n_grams) == 3): 
                    n_grams_3 = [[a + "_" + b + "_" + c for a,b,c in ngrams(tokens.split(), 3)] for tokens in corpus_all_tokens]
            
        n_grams_2 = [' '.join(doc) for doc in n_grams_2]      
        n_grams_3 = [' '.join(doc) for doc in n_grams_3]
        # Phrases 
        if (self.use_stemming.lower().strip() == 'yes'
            or self.use_stemming.lower().strip() == 'y'):

            try:
                phrase_list = list(self._phrase_generator(wordnet_lemmatizer = wordnet_lemmatizer, stemmer = stemmer))
                phrase_list = [' '.join(set_phrases) if set_phrases != "None" else set_phrases for set_phrases in phrase_list]
            except Exception as err:
                raise Exception("There was an issue with separatig the key phreses:" + str(err))

        return (corpus_all_tokens, n_grams_2, n_grams_3, phrase_list)



In [59]:

import time
import sys
import os
import pandas as pd

# load the service params from configuration file
#    sys.path.append(os.path.abspath("..\\Utils"))
#   from config_parser import read_configuration

# load the testing script
# this scripts produce examples
# sys.path.append(os.path.abspath("..\\Testting\\Scripts"))
#from testting_data_parser import TestDataParser

# barplot for most common phrases
# sys.path.append(os.path.abspath("..\\Utils"))
#from word_freqs import word_frequency_barplot


# Load the tickets data
data = df
#   data = pd.read_csv("..\\Resources\\self_service_four_categories.csv", encoding='ISO-8859-1', index_col = 0)

# take the text unstructured part of the dataset
# this df format with the index from the original dataset
documents = list(data['sentences'])
# load the DataPreprocessing object - the main DataParser object (by now)
data_parser_object = DataTokenization(documents)

# clock the operation
tic = time.time()

corpus_all_tokens, n_grams_2, n_grams_3, phrase_list = data_parser_object.tickets_tokenizer()

#pdb.set_trace()

toc = time.time()

time_diff = toc - tic



# Create barplot of most common phrases

#    phrase_list_v2 = [phrase for doc in phrase_list for phrase in doc.split() if phrase not in "None"]

#    word_frequency_barplot(tokenized_text = phrase_list_v2, savedir = "..\\Resources")         


corpus_all_tokens_df = pd.DataFrame(corpus_all_tokens, columns=['corpus_all_tokens'])

corpus_all_tokens_df.to_csv("corpus_all_tokens_df2.csv")

n_grams_2_df = pd.DataFrame(n_grams_2, columns=['n_grams_2'])

n_grams_2_df.to_csv("n_grams_2_df2.csv")

n_grams_3_df = pd.DataFrame(n_grams_3, columns=['n_grams_3'])

n_grams_3_df.to_csv("n_grams_3_df2.csv")

phrase_list_df= pd.DataFrame(phrase_list, columns=['phrase_list'])

phrase_list_df.to_csv("phrase_list2.csv")

combined_dataset = pd.concat([corpus_all_tokens_df, n_grams_2_df, n_grams_3_df], axis=1, join = 'inner')

combined_dataset.to_csv("corpus_all_tokens_n_grams_df2.csv")



In [60]:
corpus = pd.read_csv('corpus_all_tokens_df2.csv')
ngrams_2 = pd.read_csv('n_grams_2_df2.csv')
corpus_all_tokens_n_grams_df2 = pd.read_csv('corpus_all_tokens_n_grams_df2.csv')
phrase_list2_df = pd.read_csv("phrase_list2.csv")
total_df = corpus_all_tokens_n_grams_df2.join(phrase_list2_df.phrase_list)
corpus_all_tokens = total_df[['corpus_all_tokens']]



In [None]:
total_df['fulltext'] = pd.Series([])
for i in range(total_df.shape[0]):
    if total_df.phrase_list[i].strip() != "None":
        total_df['fulltext'][i] = str(total_df.corpus_all_tokens[i]) + " " + \
        str(total_df.n_grams_2[i]) + " " + str(total_df.n_grams_3[i]) + " " + str(total_df.phrase_list[i])
    else:
        total_df['fulltext'][i] = str(total_df.corpus_all_tokens[i]) + " " + \
        str(total_df.n_grams_2[i]) + " " + str(total_df.n_grams_3[i])
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [None]:
len_sentence = [len(doc.split()) for doc in corpus_all_tokens_n_grams_df2['corpus_all_tokens'].values]
total_df['sentence_length'] = len_sentence
total_df.to_csv('/Users/rmania/repos/total_df_task2.csv')

In [None]:
all_tokens = list(total_df['fulltext'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus_all_tokens_unique = set([w for doc in all_tokens for w in doc.split()])
print("Total number of uniqure words in the corpus: " + str(len(corpus_all_tokens_unique)))

# encode the words in the dictionary in order to create keys DB table 
word2idx = {w:i for i,w in enumerate(corpus_all_tokens_unique)}
tfidf_object = TfidfVectorizer(decode_error='ignore', vocabulary = word2idx)
X_tfidf = tfidf_object.fit_transform(all_tokens)

In [None]:
test = pd.DataFrame(X_tfidf.toarray())
test.to_csv("X_tfidf.csv")


In [None]:
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components=600, n_iter=2, random_state = 1234)
X_matrix = lsa.fit_transform(X_tfidf)


In [None]:
X_matrix_df = pd.DataFrame(X_matrix)
X_matrix_df.to_csv('X_matrix_df.csv')


In [None]:
features = df_with_features[['sentence_id', 'article_id',
       'label', 'flesch_reading_ease', 'smog_index',
       'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog',
       'text_standard', 'sentiment_polarity', 'sentiment_subjectivity',
       'Unnamed: 0.2', '0', 'GPE', 'CARDINAL', 'ORG', 'NORP', 'PERSON', 'DATE',
       'TIME', 'LOC', 'ORDINAL', 'EVENT', 'WORK_OF_ART', 'FAC', 'LAW',
       'PRODUCT', 'MONEY', 'QUANTITY', 'PERCENT', 'LANGUAGE']]

In [None]:
all_features = pd.concat([features, pd.DataFrame(X_matrix)], axis = 1)
all_features["Len_sen"] = len_sentence
all_features= all_features.drop(labels = 'text_standard', axis = 1)
all_features = all_features.fillna(value = 0)

all_features.shape[0]

In [None]:
X = all_features.iloc[:, 3:].values
y = all_features['label'].map({'non-propaganda':0, 'propaganda':1})
