# Imports

In [50]:
import joblib
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Input Data

In [2]:
df = pd.read_csv("../pro-test/data/Leb_1_drop_non_impact_params.csv", index_col=0, header=0)

## Remove spaces from column headings

In [3]:
df.columns = df.columns.str.replace(" ", "")

# Feature Selection

## X Parameters

### Combining and imputing protest size

#### Combine size columns

In [4]:
original_size_parameters = df[["sizeexact", "sizeestimate"]]
original_size_parameters["sizeexact"] = original_size_parameters["sizeexact"].fillna(0)
original_size_parameters["sizeestimate"] = original_size_parameters["sizeestimate"].fillna(0)
combined_sizes = pd.DataFrame(
    data=original_size_parameters["sizeestimate"] + original_size_parameters["sizeexact"],
    columns=["combined_sizes"],
)
record_number = combined_sizes.index
combined_sizes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_size_parameters['sizeexact'] = original_size_parameters['sizeexact'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_size_parameters['sizeestimate'] = original_size_parameters['sizeestimate'].fillna(0)


Unnamed: 0_level_0,combined_sizes
recordnumber,Unnamed: 1_level_1
20191592,31.0
20200162,-99.0
20191864,3.0
20200903,-99.0
20200891,-99.0
...,...
20200311,31.0
20192258,3.0
20191013,31.0
20200552,301.0


#### Impute -99 values to averages

In [5]:
mean_size_imputer = SimpleImputer(missing_values=-99, strategy="mean")
combined_sizes = pd.DataFrame(
    mean_size_imputer.fit_transform(combined_sizes), index=record_number, columns=["combined_sizes"]
)
combined_sizes

Unnamed: 0_level_0,combined_sizes
recordnumber,Unnamed: 1_level_1
20191592,31.000000
20200162,145.916121
20191864,3.000000
20200903,145.916121
20200891,145.916121
...,...
20200311,31.000000
20192258,3.000000
20191013,31.000000
20200552,301.000000


### Final selection of X parameters

In [6]:
selected_X_parameters = ["Amal", "Hezbollah", "ProgressiveSocialistMovement"]
selected_X_parameters

['Amal', 'Hezbollah', 'ProgressiveSocialistMovement']

## X Data

In [7]:
X = df[selected_X_parameters]
X = pd.concat([X, combined_sizes], axis=1)
X

Unnamed: 0_level_0,Amal,Hezbollah,ProgressiveSocialistMovement,combined_sizes
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20191592,0,0,0,31.000000
20200162,0,0,0,145.916121
20191864,0,0,0,3.000000
20200903,0,0,0,145.916121
20200891,0,0,0,145.916121
...,...,...,...,...
20200311,0,0,0,31.000000
20192258,0,0,0,3.000000
20191013,0,0,0,31.000000
20200552,0,0,0,301.000000


# y Encoding

In [8]:
y = pd.get_dummies(df["repression"])
y

Unnamed: 0_level_0,Army present at event,Arrests / detentions,Deaths inflicted,Injuries inflicted,"No known coercion, no security presence",Party Militias/ Baltagia present at event,Physical harassment,Security forces present at event
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
20191592,0,0,0,0,1,0,0,0
20200162,0,0,0,0,1,0,0,0
20191864,0,0,0,0,1,0,0,0
20200903,0,0,0,0,1,0,0,0
20200891,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...
20200311,0,0,0,0,1,0,0,0
20192258,1,0,0,0,0,0,0,0
20191013,0,0,0,0,1,0,0,0
20200552,0,0,0,0,1,0,0,0


## Formatting column titles

In [9]:
y.columns = y.columns.str.replace(" ", "_")
y.columns = y.columns.str.replace("/", "")
y.columns = y.columns.str.replace(",", "")

# Combined X and y

In [10]:
data = pd.concat([X, y], axis=1)
data

Unnamed: 0_level_0,Amal,Hezbollah,ProgressiveSocialistMovement,combined_sizes,Army_present_at_event,Arrests__detentions,Deaths_inflicted,Injuries_inflicted,No_known_coercion_no_security_presence,Party_Militias_Baltagia_present_at_event,Physical_harassment,Security_forces_present_at_event
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20191592,0,0,0,31.000000,0,0,0,0,1,0,0,0
20200162,0,0,0,145.916121,0,0,0,0,1,0,0,0
20191864,0,0,0,3.000000,0,0,0,0,1,0,0,0
20200903,0,0,0,145.916121,0,0,0,0,1,0,0,0
20200891,0,0,0,145.916121,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20200311,0,0,0,31.000000,0,0,0,0,1,0,0,0
20192258,0,0,0,3.000000,1,0,0,0,0,0,0,0
20191013,0,0,0,31.000000,0,0,0,0,1,0,0,0
20200552,0,0,0,301.000000,0,0,0,0,1,0,0,0


In [11]:
data.isna().sum()

Amal                                        0
Hezbollah                                   0
ProgressiveSocialistMovement                0
combined_sizes                              0
Army_present_at_event                       0
Arrests__detentions                         0
Deaths_inflicted                            0
Injuries_inflicted                          0
No_known_coercion_no_security_presence      0
Party_Militias_Baltagia_present_at_event    0
Physical_harassment                         0
Security_forces_present_at_event            0
dtype: int64

# Train/Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=17)

In [13]:
data_train = pd.concat([X_train, y_train], axis=1)
data_train

Unnamed: 0_level_0,Amal,Hezbollah,ProgressiveSocialistMovement,combined_sizes,Army_present_at_event,Arrests__detentions,Deaths_inflicted,Injuries_inflicted,No_known_coercion_no_security_presence,Party_Militias_Baltagia_present_at_event,Physical_harassment,Security_forces_present_at_event
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20190252,0,0,0,3.000000,0,0,0,0,0,0,1,0
20200722,0,0,0,145.916121,0,0,0,0,1,0,0,0
20200872,0,0,0,145.916121,0,0,0,0,1,0,0,0
20190346,0,0,0,3.000000,0,0,0,0,1,0,0,0
20190108,0,0,0,145.916121,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20191765,0,0,0,31.000000,0,0,0,0,1,0,0,0
20191706,0,0,0,301.000000,0,0,0,0,1,0,0,0
20200309,0,0,0,3.000000,0,0,0,0,1,0,0,0
20192257,0,0,0,145.916121,1,0,0,0,0,0,0,0


## Remove spaces from column names

# Logistic Regression

In [14]:
y_columns = y.columns.values
y_columns

array(['Army_present_at_event', 'Arrests__detentions', 'Deaths_inflicted',
       'Injuries_inflicted', 'No_known_coercion_no_security_presence',
       'Party_Militias_Baltagia_present_at_event', 'Physical_harassment',
       'Security_forces_present_at_event'], dtype=object)

## Creating dictionary of trained models

In [65]:
model_dict = {}

for i in y_columns:
    model_dict[i] = LogisticRegression().fit(X_train, y_train[i])

In [66]:
model_dict

{'Army_present_at_event': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'Arrests__detentions': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'Deaths_inflicted': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs'

## Manually defining models for display purposes

In [67]:
model_dict["Army_present_at_event"].predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [15]:
model_Army_present_at_event = LogisticRegression()
model_Army_present_at_event.fit(X_train, y_train["Army_present_at_event"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
model_Arrests__detentions = LogisticRegression()
model_Arrests__detentions.fit(X_train, y_train["Arrests__detentions"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
model_Deaths_inflicted = LogisticRegression()
model_Deaths_inflicted.fit(X_train, y_train["Deaths_inflicted"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
model_Injuries_inflicted = LogisticRegression()
model_Injuries_inflicted.fit(X_train, y_train["Injuries_inflicted"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
model_No_known_coercion_no_security_presence = LogisticRegression()
model_No_known_coercion_no_security_presence.fit(
    X_train, y_train["No_known_coercion_no_security_presence"]
)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
model_Party_Militias_Baltagia_present_at_event = LogisticRegression()
model_Party_Militias_Baltagia_present_at_event.fit(
    X_train, y_train["Party_Militias_Baltagia_present_at_event"]
)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
model_Physical_harassment = LogisticRegression()
model_Physical_harassment.fit(X_train, y_train["Physical_harassment"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
model_Security_forces_present_at_event = LogisticRegression()
model_Security_forces_present_at_event.fit(X_train, y_train["Security_forces_present_at_event"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# Evaluate Accuracy of Model


In [23]:
y_columns

array(['Army_present_at_event', 'Arrests__detentions', 'Deaths_inflicted',
       'Injuries_inflicted', 'No_known_coercion_no_security_presence',
       'Party_Militias_Baltagia_present_at_event', 'Physical_harassment',
       'Security_forces_present_at_event'], dtype=object)

## Army_present_at_event

In [24]:
model_Army_present_at_event.score(X_test, y_test["Army_present_at_event"])

0.9332706766917294

In [25]:
Predictions = model_Army_present_at_event.predict(X_test)
Compare = pd.DataFrame({"Predictions": Predictions, "Actuals": y_test["Army_present_at_event"]})
Compare["Predictions"].value_counts()

0    1064
Name: Predictions, dtype: int64

In [26]:
y_test["Army_present_at_event"].value_counts()

0    993
1     71
Name: Army_present_at_event, dtype: int64

## Arrests__detentions

In [27]:
model_Arrests__detentions.score(X_test, y_test["Arrests__detentions"])

0.9915413533834586

In [28]:
Predictions = model_Arrests__detentions.predict(X_test)
Compare = pd.DataFrame({"Predictions": Predictions, "Actuals": y_test["Arrests__detentions"]})
Compare["Predictions"].value_counts()

0    1064
Name: Predictions, dtype: int64

In [29]:
y_test["Arrests__detentions"].value_counts()

0    1055
1       9
Name: Arrests__detentions, dtype: int64

## Deaths_inflicted

In [30]:
model_Deaths_inflicted.score(X_test, y_test["Deaths_inflicted"])

0.9990601503759399

In [31]:
Predictions = model_Deaths_inflicted.predict(X_test)
Compare = pd.DataFrame({"Predictions": Predictions, "Actuals": y_test["Deaths_inflicted"]})
Compare["Predictions"].value_counts()

0    1064
Name: Predictions, dtype: int64

In [32]:
y_test["Deaths_inflicted"].value_counts()

0    1063
1       1
Name: Deaths_inflicted, dtype: int64

## Injuries_inflicted

In [33]:
model_Injuries_inflicted.score(X_test, y_test["Injuries_inflicted"])

0.9877819548872181

In [34]:
Predictions = model_Injuries_inflicted.predict(X_test)
Compare = pd.DataFrame({"Predictions": Predictions, "Actuals": y_test["Injuries_inflicted"]})
Compare["Predictions"].value_counts()

0    1063
1       1
Name: Predictions, dtype: int64

In [35]:
y_test["Injuries_inflicted"].value_counts()

0    1052
1      12
Name: Injuries_inflicted, dtype: int64

## No_known_coercion_no_security_presence

In [36]:
model_No_known_coercion_no_security_presence.score(
    X_test, y_test["No_known_coercion_no_security_presence"]
)

0.8026315789473685

In [37]:
Predictions = model_No_known_coercion_no_security_presence.predict(X_test)
Compare = pd.DataFrame(
    {"Predictions": Predictions, "Actuals": y_test["No_known_coercion_no_security_presence"]}
)
Compare["Predictions"].value_counts()

1    1061
0       3
Name: Predictions, dtype: int64

In [38]:
y_test["No_known_coercion_no_security_presence"].value_counts()

1    853
0    211
Name: No_known_coercion_no_security_presence, dtype: int64

## Party_Militias_Baltagia_present_at_event

In [39]:
model_Party_Militias_Baltagia_present_at_event.score(
    X_test, y_test["Party_Militias_Baltagia_present_at_event"]
)

0.9915413533834586

In [40]:
Predictions = model_Party_Militias_Baltagia_present_at_event.predict(X_test)
Compare = pd.DataFrame(
    {"Predictions": Predictions, "Actuals": y_test["Party_Militias_Baltagia_present_at_event"]}
)
Compare["Predictions"].value_counts()

0    1063
1       1
Name: Predictions, dtype: int64

In [41]:
y_test["Party_Militias_Baltagia_present_at_event"].value_counts()

0    1056
1       8
Name: Party_Militias_Baltagia_present_at_event, dtype: int64

## Physical_harassment

In [42]:
model_Physical_harassment.score(X_test, y_test["Physical_harassment"])

0.9953007518796992

In [43]:
Predictions = model_Physical_harassment.predict(X_test)
Compare = pd.DataFrame({"Predictions": Predictions, "Actuals": y_test["Physical_harassment"]})
Compare["Predictions"].value_counts()

0    1063
1       1
Name: Predictions, dtype: int64

In [44]:
y_test["Physical_harassment"].value_counts()

0    1060
1       4
Name: Physical_harassment, dtype: int64

## Security_forces_present_at_event

In [45]:
model_Security_forces_present_at_event.score(X_test, y_test["Security_forces_present_at_event"])

0.900375939849624

In [46]:
Predictions = model_Security_forces_present_at_event.predict(X_test)
Compare = pd.DataFrame(
    {"Predictions": Predictions, "Actuals": y_test["Security_forces_present_at_event"]}
)
Compare["Predictions"].value_counts()

0    1064
Name: Predictions, dtype: int64

In [47]:
y_test["Security_forces_present_at_event"].value_counts()

0    958
1    106
Name: Security_forces_present_at_event, dtype: int64

# Save the models

In [49]:
y_columns

array(['Army_present_at_event', 'Arrests__detentions', 'Deaths_inflicted',
       'Injuries_inflicted', 'No_known_coercion_no_security_presence',
       'Party_Militias_Baltagia_present_at_event', 'Physical_harassment',
       'Security_forces_present_at_event'], dtype=object)

In [69]:
for i in y_columns:
    save_path = "../api/model_logistic_regression_" + i
    joblib.dump(model_dict[i], save_path)