In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import time


In [21]:
# Load the cleaned dataset from CSV into `df` (the path is relative to the working directory).
df = pd.read_csv('../data/clean_data.csv')

In [22]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,country,year,region,protestnumber,startday,startmonth,startyear,endday,endmonth,endyear,...,labor wage dispute,land farm issue,police brutality,"political behavior, process","price increases, tax policy",removal of politician,social restrictions,startdate,enddate,duration
0,Canada,1990,North America,1,15.0,1.0,1990.0,15.0,1.0,1990.0,...,1,0,0,1,0,0,0,1990-01-15,1990-01-15,0 days 00:00:00.000000000
1,Canada,1990,North America,2,25.0,6.0,1990.0,25.0,6.0,1990.0,...,0,0,0,1,0,0,0,1990-06-25,1990-06-25,0 days 00:00:00.000000000
2,Canada,1990,North America,3,1.0,7.0,1990.0,1.0,7.0,1990.0,...,0,0,0,1,0,0,0,1990-07-01,1990-07-01,0 days 00:00:00.000000000
3,Canada,1990,North America,4,12.0,7.0,1990.0,6.0,9.0,1990.0,...,0,1,0,0,0,0,0,1990-07-12,1990-09-06,56 days 00:00:00.000000000
4,Canada,1990,North America,5,14.0,8.0,1990.0,15.0,8.0,1990.0,...,0,0,0,1,0,0,0,1990-08-14,1990-08-15,1 days 00:00:00.000000000


In [25]:
import time
import datetime
def datetime_to_int(dt):
    """Encode ``dt`` as the integer YYYYMMDDHHMMSS.

    ``dt`` must provide ``strftime``; the formatted digit string is converted
    to ``int`` and returned.
    """
    stamp = dt.strftime("%Y%m%d%H%M%S")
    return int(stamp)

In [4]:
# Print the column names, non-null counts and dtypes of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14474 entries, 0 to 14473
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   country                      14474 non-null  object 
 1   year                         14474 non-null  int64  
 2   region                       14474 non-null  object 
 3   protestnumber                14474 non-null  int64  
 4   startday                     14474 non-null  float64
 5   startmonth                   14474 non-null  float64
 6   startyear                    14474 non-null  float64
 7   endday                       14474 non-null  float64
 8   endmonth                     14474 non-null  float64
 9   endyear                      14474 non-null  float64
 10  protesterviolence            14474 non-null  float64
 11  participants                 14474 non-null  int64  
 12  notes                        14474 non-null  object 
 13  stateresponse   

# Text Cleaning

In [39]:
import nltk
from nltk.stem import WordNetLemmatizer
import string

def tokenize(text):
    """Strip punctuation from ``text``, then word-tokenize and lemmatize it.

    Every character contained in ``string.punctuation`` is deleted (not
    replaced by a space), the remaining text is split with
    ``nltk.word_tokenize`` and each token is passed through a
    ``WordNetLemmatizer``. Returns the resulting lemmas as a list.
    """
    punctuation = string.punctuation
    stripped = ''.join(filter(lambda ch: ch not in punctuation, text))
    wnl = WordNetLemmatizer()
    lemmas = []
    for word in nltk.word_tokenize(stripped):
        lemmas.append(wnl.lemmatize(word))
    return lemmas
                                

In [40]:
# TF-IDF vectorizer for the notes text, driven by the notebook's own `tokenize` function.
tf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize,      # custom punctuation-stripping, lemmatizing tokenizer
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),      # unigrams and bigrams
    min_df=10,               # ignore terms found in fewer than 10 documents
    max_df=0.5,              # ignore terms found in more than 50% of documents
)



In [41]:
# Remove standalone one- and two-character "words" from the notes. `\w` also
# matches digits and underscores, so short numbers such as "50" are removed too.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently turn this cleanup into a no-op.
df['notes'] = df['notes'].str.replace(r'(\b\w{1,2}\b)', '', regex=True)
 


In [42]:
# Text corpus for vectorization: the 'notes' column of `df`.
X=df['notes']


In [43]:
# Fit the vectorizer on the notes corpus (learns the vocabulary and IDF weights).
tf_vectorizer.fit(X)



TfidfVectorizer(max_df=0.5, min_df=10, ngram_range=(1, 2), stop_words='english',
                tokenizer=<function tokenize at 0x7fe61123eca0>)

In [44]:
# Transform the same corpus into its TF-IDF document-term matrix.
X_vc=tf_vectorizer.transform(X)

In [45]:
# Show the first 200 vocabulary terms learned by the vectorizer.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2 in favour of get_feature_names_out() — confirm the installed version
# before upgrading the environment.
tf_vectorizer.get_feature_names()[:200]

['000',
 '000 000',
 '000 according',
 '000 activist',
 '000 angry',
 '000 anti',
 '000 class',
 '000 demonstrator',
 '000 doctor',
 '000 ethnic',
 '000 farmer',
 '000 gathered',
 '000 interior',
 '000 job',
 '000 marched',
 '000 marcher',
 '000 member',
 '000 officer',
 '000 opposition',
 '000 paris',
 '000 participant',
 '000 people',
 '000 police',
 '000 pro',
 '000 protest',
 '000 protester',
 '000 protestors',
 '000 public',
 '000 resident',
 '000 riot',
 '000 romanian',
 '000 specially',
 '000 striking',
 '000 strong',
 '000 student',
 '000 supporter',
 '000 teacher',
 '000 took',
 '000 turned',
 '000 union',
 '000 woman',
 '000 worker',
 '000 year',
 '000s',
 '000s individual',
 '100',
 '100 arrested',
 '100 demonstrator',
 '100 injured',
 '100 member',
 '100 participant',
 '100 people',
 '100 police',
 '100 protester',
 '100 resident',
 '100 student',
 '1000',
 '1000 people',
 '100000',
 '100000 people',
 '100000 protester',
 '10th',
 '110',
 '11am',
 '11th',
 '120',
 '120000',

In [46]:
# Densify the TF-IDF matrix into a DataFrame with one column per vocabulary term.
# NOTE(review): the dense copy holds rows x terms floats, so it can be very large.
# NOTE(review): get_feature_names() is removed in scikit-learn >= 1.2
# (replacement: get_feature_names_out()) — confirm the installed version.
X_clean_df = pd.DataFrame(
    data=X_vc.toarray(),
    columns=tf_vectorizer.get_feature_names(),
)

In [47]:
# (rows, columns) of the dense TF-IDF frame.
X_clean_df.shape

(14474, 12819)

In [48]:
# Column labels of the dense TF-IDF frame, i.e. the vocabulary terms.
X_clean_df.columns

Index(['000', '000 000', '000 according', '000 activist', '000 angry',
       '000 anti', '000 class', '000 demonstrator', '000 doctor', '000 ethnic',
       ...
       'zero', 'zhytomyr', 'zhytomyr cherkasy', 'zia', 'zimbabwe',
       'zimbabwean', 'zone', 'zongo', 'zulu', 'zuma'],
      dtype='object', length=12819)

# EDA on Text

# Vectorize Text

In [9]:
# Note that much of this section of code comes from Heather's review 10/19
# Select the 'notes' column as a single-column DataFrame (double brackets keep it 2-D).
text = df[['notes']]


In [12]:
# (rows, columns) of the single-column text frame.
text.shape

(14474, 1)

In [13]:
# Instantiate a second TF-IDF vectorizer (default tokenizer) and set its parameters.
tf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),     # unigrams and bigrams
    min_df=3,               # ignore terms found in fewer than 3 documents
    max_df=0.9,             # ignore terms found in more than 90% of documents
    max_features=3000,      # cap the vocabulary at 3000 terms
)

In [15]:
# Fit the vectorizer on the 'notes' text and transform it in one step;
# `test_tf` is the resulting TF-IDF document-term matrix.
test_tf = tf.fit_transform(text['notes'])


In [16]:
# Build a DataFrame from the vectorized text, one column per vocabulary term.
# NOTE(review): get_feature_names() is removed in scikit-learn >= 1.2
# (replacement: get_feature_names_out()) — confirm the installed version.
df_text_tf = pd.DataFrame(
    data=test_tf.toarray(),
    columns=tf.get_feature_names(),
)


In [17]:
# Give the vectorized-text frame a fresh 0..n-1 index, discarding the old one.
# Only df_text_tf is reset in this cell.
df_text_tf = df_text_tf.reset_index(drop=True)

In [18]:
# Add the vectorized text back to the original data frame: column-wise concat
# (axis=1) aligns the two frames on their index labels.
# NOTE(review): a TF-IDF term that equals an existing column name of `df` would
# produce duplicate column labels here — verify against the vocabulary.
combined_df = pd.concat([df, df_text_tf], axis =1)
# Display the first row as a sanity check.
combined_df.head(1)

Unnamed: 0,country,year,region,protestnumber,startday,startmonth,startyear,endday,endmonth,endyear,...,yesterday morning,yesterday protest,young,young men,young people,youth,youths,yugoslavia,zimbabwe,zone
0,Canada,1990,North America,1,15.0,1.0,1990.0,15.0,1.0,1990.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Remove the raw 'notes' text now that its vectorized form has been appended.
# NOTE(review): if the TF-IDF vocabulary also contains the term 'notes', that
# feature column carries the same label and is dropped as well — verify.
combined_df = combined_df.drop(columns='notes')


In [None]:
# Write the combined frame to CSV (the path is relative to the working directory).
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; readers of this file need index_col=0 (or this call
# needs index=False) to avoid an extra 'Unnamed: 0' column — confirm with consumers.
combined_df.to_csv('../data/vectorized_text_combined.csv')