In [166]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split


In [167]:
# Load the cleaned data set from the project's data directory.
df = pd.read_csv(filepath_or_buffer='../data/clean_data.csv')

In [168]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,country,year,region,protestnumber,startday,startmonth,startyear,endday,endmonth,endyear,...,labor wage dispute,land farm issue,police brutality,"political behavior, process","price increases, tax policy",removal of politician,social restrictions,startdate,enddate,duration
0,Canada,1990,North America,1,15.0,1.0,1990.0,15.0,1.0,1990.0,...,1,0,0,1,0,0,0,1990-01-15,1990-01-15,0 days 00:00:00.000000000
1,Canada,1990,North America,2,25.0,6.0,1990.0,25.0,6.0,1990.0,...,0,0,0,1,0,0,0,1990-06-25,1990-06-25,0 days 00:00:00.000000000
2,Canada,1990,North America,3,1.0,7.0,1990.0,1.0,7.0,1990.0,...,0,0,0,1,0,0,0,1990-07-01,1990-07-01,0 days 00:00:00.000000000
3,Canada,1990,North America,4,12.0,7.0,1990.0,6.0,9.0,1990.0,...,0,1,0,0,0,0,0,1990-07-12,1990-09-06,56 days 00:00:00.000000000
4,Canada,1990,North America,5,14.0,8.0,1990.0,15.0,8.0,1990.0,...,0,0,0,1,0,0,0,1990-08-14,1990-08-15,1 days 00:00:00.000000000


In [169]:
# Summarize column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14474 entries, 0 to 14473
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   country                      14474 non-null  object 
 1   year                         14474 non-null  int64  
 2   region                       14474 non-null  object 
 3   protestnumber                14474 non-null  int64  
 4   startday                     14474 non-null  float64
 5   startmonth                   14474 non-null  float64
 6   startyear                    14474 non-null  float64
 7   endday                       14474 non-null  float64
 8   endmonth                     14474 non-null  float64
 9   endyear                      14474 non-null  float64
 10  protesterviolence            14474 non-null  float64
 11  participants                 14474 non-null  int64  
 12  notes                        14474 non-null  object 
 13  stateresponse   

# Text Cleaning

In [170]:
import nltk
from nltk.stem import WordNetLemmatizer
import string

# Built once rather than per call: the vectorizer invokes `tokenize` for every
# document, so recreating these objects each time is wasted work.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_LEMMATIZER = WordNetLemmatizer()


def tokenize(text):
    """Split a document into lemmatized word tokens.

    Every character in ``string.punctuation`` is deleted (not replaced by a
    space) before tokenizing, so words joined by punctuation collapse into a
    single token (e.g. "anti-war" becomes "antiwar").

    Note: ``nltk.word_tokenize`` and ``WordNetLemmatizer`` need their NLTK data
    packages to be downloaded beforehand.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens after lemmatization with the lemmatizer's default settings.
    """
    tokens = nltk.word_tokenize(text.translate(_PUNCT_TABLE))
    return [_LEMMATIZER.lemmatize(token) for token in tokens]
                                

In [171]:
# Bag-of-words counts built on the custom lemmatizing tokenizer.
# min_df=10 drops terms seen in fewer than 10 documents; max_df=0.5 drops
# terms that occur in more than half of the documents.
tf_vectorizer = CountVectorizer(
    tokenizer=tokenize,
    lowercase=True,
    stop_words='english',
    min_df=10,
    max_df=0.5,
)



In [172]:
# Remove standalone one- and two-character words (e.g. "a", "of", "5") from the notes.
# `regex=True` must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text unchanged.
df['notes'] = df['notes'].str.replace(r'\b\w{1,2}\b', '', regex=True)
 


In [173]:
# Features are the free-text notes; the target is the `stateresponse` column.
X, y = df['notes'], df['stateresponse']

In [174]:
# Hold out a test set using scikit-learn's default split size; the fixed seed
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [175]:
# Learn the vocabulary from the training notes only, so the test set does not
# influence which terms become features.
tf_vectorizer.fit(X_train)



CountVectorizer(max_df=0.5, min_df=10, stop_words='english',
                tokenizer=<function tokenize at 0x7f9efd96f280>)

In [176]:
# Encode the training notes as term counts over the fitted vocabulary.
X_train_vc=tf_vectorizer.transform(X_train)

In [177]:
# Encode the test notes with the same fitted vocabulary (no refitting).
X_test_vc=tf_vectorizer.transform(X_test)

In [178]:
# Peek at the first 200 vocabulary terms.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out` when available and fall back on older releases.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()
# Convert to plain `str` so the displayed list looks the same whether the
# names came back as a NumPy array or a Python list.
[str(name) for name in feature_names[:200]]

['000',
 '000s',
 '100',
 '1000',
 '100000',
 '10th',
 '110',
 '11th',
 '120',
 '120000',
 '125',
 '12th',
 '130',
 '13th',
 '140',
 '14th',
 '150',
 '150000',
 '15th',
 '160',
 '16th',
 '170',
 '17th',
 '180',
 '18th',
 '1973',
 '1980',
 '1980s',
 '1989',
 '1990',
 '1990s',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1999',
 '19th',
 '1st',
 '200',
 '2000',
 '200000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2020',
 '2021',
 '20th',
 '21st',
 '230',
 '250',
 '250000',
 '25th',
 '26th',
 '27th',
 '29th',
 '2nd',
 '300',
 '3000',
 '300000',
 '350',
 '350000',
 '3rd',
 '400',
 '400000',
 '406th',
 '450',
 '4th',
 '500',
 '5000',
 '500000',
 '5th',
 '600',
 '650',
 '6th',
 '700',
 '700th',
 '750',
 '7th',
 '80',
 '800',
 '90',
 '900',
 '9th',
 'ababa',
 'abad',
 'abandon',
 'abandoned',
 'abdallah',
 'abdel',
 'abdelaziz',
 'abducted',
 'abduction

# EDA on Text

# Vectorize Text

In [9]:
# Note: much of the code in this section is adapted from Heather's review on 10/19.
# Double brackets keep `notes` as a one-column DataFrame rather than a Series.
text = df[['notes']]


In [12]:
# Confirm the selection is a single-column frame with one row per record.
text.shape

(14474, 1)

In [13]:
# Configure TF-IDF: unigrams and bigrams with English stop words removed,
# ignoring terms found in fewer than 3 documents or in more than 90% of them,
# and keeping at most the 3,000 most frequent remaining terms.
tf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    max_features=3000,
)

In [15]:
# Fit the TF-IDF vocabulary and weights on every note and transform the notes
# into the TF-IDF matrix in a single step.
test_tf = tf.fit_transform(text['notes'])


In [16]:
# Create a new data frame from the vectorized text: one row per note, one
# column per TF-IDF term.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out` when available and fall back on older releases.
if hasattr(tf, 'get_feature_names_out'):
    tfidf_terms = tf.get_feature_names_out()
else:
    tfidf_terms = tf.get_feature_names()
# `toarray()` yields a regular ndarray; `todense()` returns the discouraged np.matrix.
df_text_tf = pd.DataFrame(test_tf.toarray(), columns=tfidf_terms)


In [17]:
# Reset the index of the text-feature frame to a fresh 0..n-1 range so it
# aligns row-for-row when concatenated column-wise.
df_text_tf = df_text_tf.reset_index(drop=True)

In [18]:
# Add the vectorized text back to our original data frame.
# A TF-IDF term may share its name with an existing column (for example a term
# such as 'year' or 'country'). Left alone, that produces duplicate column
# labels, which make column selection ambiguous and get renamed when the CSV
# is read back. Only the colliding term columns are renamed, so every other
# column keeps its original name.
overlapping = df.columns.intersection(df_text_tf.columns)
text_features = df_text_tf.rename(
    columns={term: f'{term}_tfidf' for term in overlapping}
)
combined_df = pd.concat([df, text_features], axis=1)
assert combined_df.columns.is_unique, 'duplicate column names after combining'
combined_df.head(1)

Unnamed: 0,country,year,region,protestnumber,startday,startmonth,startyear,endday,endmonth,endyear,...,yesterday morning,yesterday protest,young,young men,young people,youth,youths,yugoslavia,zimbabwe,zone
0,Canada,1990,North America,1,15.0,1.0,1990.0,15.0,1.0,1990.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# The raw text is now represented by the vectorized columns, so drop `notes`.
combined_df = combined_df.drop(columns='notes')


In [None]:
# Persist the combined frame (original columns plus text features) to CSV.
combined_df.to_csv(path_or_buf='../data/vectorized_text_combined.csv')