# Imports

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Input Data

In [2]:
df = pd.read_csv('../pro-test/data/Leb_1_drop_non_impact_params.csv',index_col=0,header=0)

## Drop additional details in target

In [3]:
#target

## Remove spaces from column headings

In [4]:
df.columns = df.columns.str.replace(' ', '')

# Pre-Processing

## Combining and imputing protest size

### Combine size columns

In [5]:
original_size_parameters = df[['sizeexact','sizeestimate']]
original_size_parameters['sizeexact'] = original_size_parameters['sizeexact'].fillna(0)
original_size_parameters['sizeestimate'] = original_size_parameters['sizeestimate'].fillna(0)
combined_sizes = pd.DataFrame(data=original_size_parameters['sizeestimate'] + original_size_parameters['sizeexact'],columns=['combined_sizes'])
record_number = combined_sizes.index
combined_sizes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_size_parameters['sizeexact'] = original_size_parameters['sizeexact'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_size_parameters['sizeestimate'] = original_size_parameters['sizeestimate'].fillna(0)


Unnamed: 0_level_0,combined_sizes
recordnumber,Unnamed: 1_level_1
20191592,31.0
20200162,-99.0
20191864,3.0
20200903,-99.0
20200891,-99.0
...,...
20200311,31.0
20192258,3.0
20191013,31.0
20200552,301.0


### Impute -99 values to averages

In [6]:
mean_size_imputer = SimpleImputer(missing_values=-99,strategy="mean")
combined_sizes = pd.DataFrame(mean_size_imputer.fit_transform(combined_sizes),index=record_number,columns=['combined_sizes'])
combined_sizes

Unnamed: 0_level_0,combined_sizes
recordnumber,Unnamed: 1_level_1
20191592,31.000000
20200162,145.916121
20191864,3.000000
20200903,145.916121
20200891,145.916121
...,...
20200311,31.000000
20192258,3.000000
20191013,31.000000
20200552,301.000000


### Drop old size columns and add new size column to DataFrame

In [7]:
selected_X_parameters = ['Amal','Hezbollah','ProgressiveSocialistMovement']
selected_X_parameters

['Amal', 'Hezbollah', 'ProgressiveSocialistMovement']

## X Data

In [8]:
X = df[selected_X_parameters]  
X = pd.concat([X,combined_sizes],axis=1)
X

Unnamed: 0_level_0,Amal,Hezbollah,ProgressiveSocialistMovement,combined_sizes
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20191592,0,0,0,31.000000
20200162,0,0,0,145.916121
20191864,0,0,0,3.000000
20200903,0,0,0,145.916121
20200891,0,0,0,145.916121
...,...,...,...,...
20200311,0,0,0,31.000000
20192258,0,0,0,3.000000
20191013,0,0,0,31.000000
20200552,0,0,0,301.000000


# y Encoding

In [9]:
y = pd.get_dummies(df['repression'])
y

Unnamed: 0_level_0,Army present at event,Arrests / detentions,Deaths inflicted,Injuries inflicted,"No known coercion, no security presence",Party Militias/ Baltagia present at event,Physical harassment,Security forces present at event
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
20191592,0,0,0,0,1,0,0,0
20200162,0,0,0,0,1,0,0,0
20191864,0,0,0,0,1,0,0,0
20200903,0,0,0,0,1,0,0,0
20200891,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...
20200311,0,0,0,0,1,0,0,0
20192258,1,0,0,0,0,0,0,0
20191013,0,0,0,0,1,0,0,0
20200552,0,0,0,0,1,0,0,0


## Formatting column titles

In [10]:
y.columns = y.columns.str.replace(' ', '_')
y.columns = y.columns.str.replace('/', '')
y.columns = y.columns.str.replace(',', '')

# Combined X and y

In [11]:
data = pd.concat([X,y],axis=1)
data

Unnamed: 0_level_0,Amal,Hezbollah,ProgressiveSocialistMovement,combined_sizes,Army_present_at_event,Arrests__detentions,Deaths_inflicted,Injuries_inflicted,No_known_coercion_no_security_presence,Party_Militias_Baltagia_present_at_event,Physical_harassment,Security_forces_present_at_event
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20191592,0,0,0,31.000000,0,0,0,0,1,0,0,0
20200162,0,0,0,145.916121,0,0,0,0,1,0,0,0
20191864,0,0,0,3.000000,0,0,0,0,1,0,0,0
20200903,0,0,0,145.916121,0,0,0,0,1,0,0,0
20200891,0,0,0,145.916121,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20200311,0,0,0,31.000000,0,0,0,0,1,0,0,0
20192258,0,0,0,3.000000,1,0,0,0,0,0,0,0
20191013,0,0,0,31.000000,0,0,0,0,1,0,0,0
20200552,0,0,0,301.000000,0,0,0,0,1,0,0,0


In [12]:
data.isna().sum()

Amal                                        0
Hezbollah                                   0
ProgressiveSocialistMovement                0
combined_sizes                              0
Army_present_at_event                       0
Arrests__detentions                         0
Deaths_inflicted                            0
Injuries_inflicted                          0
No_known_coercion_no_security_presence      0
Party_Militias_Baltagia_present_at_event    0
Physical_harassment                         0
Security_forces_present_at_event            0
dtype: int64

# Train/Test Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=17)

NameError: name 'train_test_split' is not defined

In [None]:
data_train = pd.concat([X_train,y_train],axis=1)
data_train

## Remove spaces from column names

# Logistic Regression

In [None]:
y_columns = y.columns.values
y_columns

In [None]:
model_Army_present_at_event = LogisticRegression()
model_Army_present_at_event.fit(X_train,y_train['Army_present_at_event'])

In [None]:
model_Arrests__detentions = LogisticRegression()
model_Arrests__detentions.fit(X_train,y_train['Arrests__detentions'])

In [None]:
model_Deaths_inflicted = LogisticRegression()
model_Deaths_inflicted.fit(X_train,y_train['Deaths_inflicted'])

In [None]:
model_Injuries_inflicted = LogisticRegression()
model_Injuries_inflicted.fit(X_train,y_train['Injuries_inflicted'])

In [None]:
model_No_known_coercion_no_security_presence = LogisticRegression()
model_No_known_coercion_no_security_presence.fit(X_train,y_train['No_known_coercion_no_security_presence'])

In [None]:
model_Party_Militias_Baltagia_present_at_event = LogisticRegression()
model_Party_Militias_Baltagia_present_at_event.fit(X_train,y_train['Party_Militias_Baltagia_present_at_event'])

In [None]:
model_Physical_harassment = LogisticRegression()
model_Physical_harassment.fit(X_train,y_train['Physical_harassment'])

In [None]:
model_Security_forces_present_at_event = LogisticRegression()
model_Security_forces_present_at_event.fit(X_train,y_train['Security_forces_present_at_event'])

# Evaluate Accuracy of Model


In [None]:
y_columns

## Army_present_at_event

In [None]:
model_Army_present_at_event.score(X_test,y_test['Army_present_at_event'])

In [None]:
Predictions = model_Army_present_at_event.predict(X_test)
Compare = pd.DataFrame({'Predictions':Predictions,'Actuals':y_test['Army_present_at_event']})
Compare['Predictions'].value_counts()

In [None]:
y_test['Army_present_at_event'].value_counts()

## Arrests__detentions

In [None]:
model_Arrests__detentions.score(X_test,y_test['Arrests__detentions'])

In [None]:
Predictions = model_Arrests__detentions.predict(X_test)
Compare = pd.DataFrame({'Predictions':Predictions,'Actuals':y_test['Arrests__detentions']})
Compare['Predictions'].value_counts()

In [None]:
y_test['Arrests__detentions'].value_counts()

## Deaths_inflicted

In [None]:
model_Deaths_inflicted.score(X_test,y_test['Deaths_inflicted'])

In [None]:
Predictions = model_Deaths_inflicted.predict(X_test)
Compare = pd.DataFrame({'Predictions':Predictions,'Actuals':y_test['Deaths_inflicted']})
Compare['Predictions'].value_counts()

In [None]:
y_test['Deaths_inflicted'].value_counts()

## Injuries_inflicted

In [None]:
model_Injuries_inflicted.score(X_test,y_test['Injuries_inflicted'])

In [None]:
Predictions = model_Injuries_inflicted.predict(X_test)
Compare = pd.DataFrame({'Predictions':Predictions,'Actuals':y_test['Injuries_inflicted']})
Compare['Predictions'].value_counts()

In [None]:
y_test['Injuries_inflicted'].value_counts()

## No_known_coercion_no_security_presence

In [None]:
model_No_known_coercion_no_security_presence.score(X_test,y_test['No_known_coercion_no_security_presence'])

In [None]:
Predictions = model_No_known_coercion_no_security_presence.predict(X_test)
Compare = pd.DataFrame({'Predictions':Predictions,'Actuals':y_test['No_known_coercion_no_security_presence']})
Compare['Predictions'].value_counts()

In [None]:
y_test['No_known_coercion_no_security_presence'].value_counts()

## Party_Militias_Baltagia_present_at_event

In [None]:
model_Party_Militias_Baltagia_present_at_event.score(X_test,y_test['Party_Militias_Baltagia_present_at_event'])

In [None]:
Predictions = model_Party_Militias_Baltagia_present_at_event.predict(X_test)
Compare = pd.DataFrame({'Predictions':Predictions,'Actuals':y_test['Party_Militias_Baltagia_present_at_event']})
Compare['Predictions'].value_counts()

In [None]:
y_test['Party_Militias_Baltagia_present_at_event'].value_counts()

## Physical_harassment

In [None]:
model_Physical_harassment.score(X_test,y_test['Physical_harassment'])

In [None]:
Predictions = model_Physical_harassment.predict(X_test)
Compare = pd.DataFrame({'Predictions':Predictions,'Actuals':y_test['Physical_harassment']})
Compare['Predictions'].value_counts()

In [None]:
y_test['Physical_harassment'].value_counts()

## Security_forces_present_at_event

In [None]:
model_Security_forces_present_at_event.score(X_test,y_test['Security_forces_present_at_event'])

In [None]:
Predictions = model_Security_forces_present_at_event.predict(X_test)
Compare = pd.DataFrame({'Predictions':Predictions,'Actuals':y_test['Security_forces_present_at_event']})
Compare['Predictions'].value_counts()

In [None]:
y_test['Security_forces_present_at_event'].value_counts()