In [74]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import train_test_split. Crossval score. Gridsearch CV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold

# Import metrics
from sklearn import metrics

# Import Pipeline
from sklearn.pipeline import Pipeline

# Import models

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier


# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.datasets import make_classification

from sklearn.metrics import classification_report, accuracy_score

from sklearn.svm import LinearSVC, SVC


In [75]:
df = pd.read_csv('../data/vectorized_text_combined.csv')

In [76]:
df.head()

Unnamed: 0,country,year,region,protestnumber,startday,startmonth,startyear,endday,endmonth,endyear,...,yesterday morning,yesterday protest,young,young men,young people,youth,youths,yugoslavia,zimbabwe,zone
0,Canada,1990,North America,1,15.0,1.0,1990.0,15.0,1.0,1990.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Canada,1990,North America,2,25.0,6.0,1990.0,25.0,6.0,1990.0,...,0.0,0.0,0.369161,0.0,0.474898,0.0,0.0,0.0,0.0,0.0
2,Canada,1990,North America,3,1.0,7.0,1990.0,1.0,7.0,1990.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Canada,1990,North America,4,12.0,7.0,1990.0,6.0,9.0,1990.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Canada,1990,North America,5,14.0,8.0,1990.0,15.0,8.0,1990.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
df.columns[:30]

Index(['country', 'year', 'region', 'protestnumber', 'startday', 'startmonth',
       'startyear', 'endday', 'endmonth', 'endyear', 'protesterviolence',
       'participants', 'stateresponse', 'labor wage dispute',
       'land farm issue', 'police brutality', 'political behavior, process',
       'price increases, tax policy', 'removal of politician',
       'social restrictions', 'startdate', 'enddate', 'duration', '000',
       '000 000', '000 demonstrators', '000 farmers', '000 people',
       '000 police', '000 protesters'],
      dtype='object')

In [52]:
# Get dummies
df = pd.get_dummies(df, columns=['country', 'region'], drop_first=True)

In [53]:
# Drop some columns that aren't working right now
df.drop(columns=['labor wage dispute', 'startdate', 'enddate', 'duration'], inplace=True)

In [137]:
# Adapted from a review with Heather
# Function to remove punctuation and digits
def remove_punctuation(text):
    lower = text.lower()  
    exclude = string.punctuation + string.digits #identifies punctuation marks and digits
    return "".join(ch for ch in lower if ch not in exclude) 

In [138]:
# Apply the function to the ingredients string
df['notes'] = df['notes'].apply(remove_punctuation)

In [139]:
# Instantiate lemmatizer. 
lemmatizer = WordNetLemmatizer()

In [140]:
# Adapted from a review with Heather:
# This function will lemmatize all the words in our features
def split_and_lem(string):
    word = ""   #empty string
    string = string.split(' ') #creates list of words
    for i in string: #iterates through that list of words
        word += (lemmatizer.lemmatize(i) + " ")  #stems each word and adds to empty string
    return word  #returns string with stemmed words

In [141]:
# Lemmatize the string
df_train['ingredients_string'] = df_train['ingredients_string'].apply(split_and_lem)
df_test['ingredients_string'] = df_test['ingredients_string'].apply(split_and_lem)

In [54]:
# Step 1: Split into training & testing sets
X = df.drop(columns='stateresponse')
y = df['stateresponse']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=42)

In [55]:
X.shape, y.shape

((14474, 3187), (14474,))

In [56]:
# Get the baseline model performance
y_test.value_counts(normalize = True)

ignore             0.530810
crowd dispersal    0.174081
arrests            0.101133
accomodation       0.069909
killings           0.053053
beatings           0.042553
shootings          0.028461
Name: stateresponse, dtype: float64

# Multinomial Naive Bayes

In [57]:
nb = MultinomialNB()

In [58]:
nb.fit(X_train, y_train)

MultinomialNB()

In [61]:
nb.score(X_train, y_train)

0.05020727775218793

# Ada Boost

In [65]:
# Run the AdaBoost Classifier
ada = AdaBoostClassifier(DecisionTreeClassifier())

ada.fit(X_train, y_train)

print('Train score: ', ada.score(X_train, y_train))
print('Test score: ', ada.score(X_test, y_test))

Train score:  1.0
Test score:  0.5694943354517823


In [66]:
# from: https://intellipaat.com/community/17347/scikit-learn-get-accuracy-scores-for-each-class

# Use classification report to view cuisine level scores

y_pred = ada.predict(X_test)

target_names = list(y_test.value_counts().index)

print(classification_report(y_test, y_pred, target_names=target_names, digits=4))

                 precision    recall  f1-score   support

         ignore     0.1760    0.1739    0.1750       253
crowd dispersal     0.4399    0.4098    0.4243       366
        arrests     0.2837    0.2597    0.2712       154
   accomodation     0.4307    0.4540    0.4420       630
       killings     0.7526    0.7585    0.7555      1921
       beatings     0.3535    0.3646    0.3590       192
      shootings     0.1573    0.1359    0.1458       103

       accuracy                         0.5695      3619
      macro avg     0.3705    0.3652    0.3675      3619
   weighted avg     0.5666    0.5695    0.5679      3619



# Support Vector Classifier

In [72]:
# C values to GridSearch over
pgrid = {"C": np.linspace(0.0001, 1, 20)}

# Instantiate and fit a gridsearch model for this SVC!
svc = LinearSVC()

cv = StratifiedKFold(shuffle = True, random_state = 42)

gs = GridSearchCV(svc, pgrid, cv = 5, verbose=2, n_jobs=-1)
gs.fit(X_train, y_train)

print('Train score: ', gs.score(X_train, y_train))
print('Test score: ', gs.score(X_test, y_test))

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.9min finished


Train score:  0.4733302625518194
Test score:  0.47278253661232383


In [None]:
# Score model on training set.
print(f'Train accuracy = {gs.score(X_train, y_train)}')
# Score model on testing set.
print(f'Test accuracy = {gs.score(X_test, y_test)}')

In [73]:
# from: https://intellipaat.com/community/17347/scikit-learn-get-accuracy-scores-for-each-class

# Use classification report to view cuisine level scores

y_pred = gs.predict(X_test)

target_names = list(y_test.value_counts().index)

print(classification_report(y_test, y_pred, target_names=target_names, digits=4))

                 precision    recall  f1-score   support

         ignore     0.0904    0.1383    0.1094       253
crowd dispersal     0.0000    0.0000    0.0000       366
        arrests     0.0000    0.0000    0.0000       154
   accomodation     0.0000    0.0000    0.0000       630
       killings     0.5186    0.8725    0.6505      1921
       beatings     0.0000    0.0000    0.0000       192
      shootings     0.0000    0.0000    0.0000       103

       accuracy                         0.4728      3619
      macro avg     0.0870    0.1444    0.1086      3619
   weighted avg     0.2816    0.4728    0.3529      3619



  _warn_prf(average, modifier, msg_start, len(result))
