In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import time


In [2]:
df = pd.read_csv('../data/clean_data_dates_dummies.csv')

In [3]:
df.head()

Unnamed: 0,protestnumber,protesterviolence,participants,notes,stateresponse,labor wage dispute,land farm issue,police brutality,"political behavior, process","price increases, tax policy",...,year_2017,year_2018,year_2019,region_Asia,region_Central America,region_Europe,region_MENA,region_North America,region_Oceania,region_South America
0,1,0.0,3000,Canada s railway passenger system was finally...,ignore,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2,0.0,1000,protestors were only identified as young peop...,ignore,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,3,0.0,500,"THE Queen, after calling on Canadians to rema...",ignore,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,4,1.0,300,Canada s federal government has agreed to acq...,accomodation,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,5,1.0,950,Protests were directed against the state due t...,arrests,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
import time
import datetime
def datetime_to_int(dt):
    return int(dt.strftime("%Y%m%d%H%M%S"))

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14474 entries, 0 to 14473
Columns: 214 entries, protestnumber to region_South America
dtypes: float64(1), int64(211), object(2)
memory usage: 23.6+ MB


# Text Cleaning

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer
import string

def tokenize(text):
    text = ''.join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]
                                

In [7]:
tf_vectorizer = TfidfVectorizer(tokenizer=tokenize,
                                stop_words = 'english', 
                                lowercase = True,
                                ngram_range = (1,2),
                                max_df = 0.5, 
                                min_df = 10,
                                max_features = 3000) 



In [8]:
df['notes'] = df['notes'].str.replace(r'(\b\w{1,2}\b)', '') # for words removing 1 or 2 letter words
 


In [9]:
notes=df['notes']


In [10]:
tf_vectorizer.fit(notes)



TfidfVectorizer(max_df=0.5, max_features=3000, min_df=10, ngram_range=(1, 2),
                stop_words='english',
                tokenizer=<function tokenize at 0x13d2749d0>)

In [11]:
notes_vc=tf_vectorizer.transform(notes)

In [12]:
tf_vectorizer.get_feature_names()[:200]

['000',
 '000 000',
 '000 demonstrator',
 '000 people',
 '000 police',
 '000 protester',
 '000 student',
 '000 supporter',
 '000 worker',
 '000s',
 '100',
 '100 people',
 '1000',
 '100000',
 '100000 people',
 '120',
 '150',
 '15th',
 '1989',
 '1995',
 '1st',
 '200',
 '200 people',
 '200000',
 '2003',
 '2009',
 '2011',
 '2011 ha',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '250',
 '300',
 '300 people',
 '350',
 '400',
 '406th',
 '406th protest',
 '500',
 '500 people',
 '5th',
 '600',
 '700',
 '800',
 'abandoned',
 'abdullah',
 'ablaze',
 'able',
 'abortion',
 'abuja',
 'abuse',
 'academic',
 'accept',
 'access',
 'accident',
 'accommodation',
 'accord',
 'according',
 'according local',
 'according police',
 'account',
 'accusation',
 'accuse',
 'accused',
 'accusing',
 'acronym',
 'act',
 'acting',
 'action',
 'activist',
 'activity',
 'actual',
 'actual number',
 'added',
 'adding',
 'additional',
 'address',
 'addressed',
 'addressing',
 'administration',
 'administrative

In [13]:
notes_clean_df=pd.DataFrame(notes_vc.todense(), 
                          columns=tf_vectorizer.get_feature_names())

In [14]:
notes_clean_df.shape

(14474, 3000)

In [15]:
notes_clean_df.columns

Index(['000', '000 000', '000 demonstrator', '000 people', '000 police',
       '000 protester', '000 student', '000 supporter', '000 worker', '000s',
       ...
       'yesterday morning', 'yesterday protest', 'yesterday thousand', 'young',
       'young men', 'young people', 'youth', 'yugoslavia', 'zimbabwe', 'zone'],
      dtype='object', length=3000)

In [16]:
# Add the vectorized text back to our original data frame
combined_df = pd.concat([df, notes_clean_df], axis =1)
combined_df.head(1)

Unnamed: 0,protestnumber,protesterviolence,participants,notes,stateresponse,labor wage dispute,land farm issue,police brutality,"political behavior, process","price increases, tax policy",...,yesterday morning,yesterday protest,yesterday thousand,young,young men,young people,youth,yugoslavia,zimbabwe,zone
0,1,0.0,3000,Canada railway passenger system was finally ...,ignore,1,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
combined_df.shape

(14474, 3214)

In [18]:
# Drop the original notes column
combined_df.drop(columns = 'notes', inplace=True)


In [19]:
# Write to csv
combined_df.to_csv('../data/numeric_text_combined.csv', index=False)