# Imports

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression

# Input Data

In [2]:
df = pd.read_csv('../pro-test/data/Leb_1_drop_non_impact_params.csv',index_col=0,header=0)

## Remove spaces from column headings

In [3]:
df.columns = df.columns.str.replace(' ', '')

# Feature Selection

## X Parameters

### Combining and imputing protest size

#### Combine size columns

In [4]:
original_size_parameters = df[['sizeexact','sizeestimate']]
original_size_parameters['sizeexact'] = original_size_parameters['sizeexact'].fillna(0)
original_size_parameters['sizeestimate'] = original_size_parameters['sizeestimate'].fillna(0)
combined_sizes = pd.DataFrame(data=original_size_parameters['sizeestimate'] + original_size_parameters['sizeexact'],columns=['combined_sizes'])
record_number = combined_sizes.index
combined_sizes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_size_parameters['sizeexact'] = original_size_parameters['sizeexact'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_size_parameters['sizeestimate'] = original_size_parameters['sizeestimate'].fillna(0)


Unnamed: 0_level_0,combined_sizes
recordnumber,Unnamed: 1_level_1
20191592,31.0
20200162,-99.0
20191864,3.0
20200903,-99.0
20200891,-99.0
...,...
20200311,31.0
20192258,3.0
20191013,31.0
20200552,301.0


#### Impute -99 values to averages

In [5]:
mean_size_imputer = SimpleImputer(missing_values=-99,strategy="mean")
combined_sizes = pd.DataFrame(mean_size_imputer.fit_transform(combined_sizes),index=record_number,columns=['combined_sizes'])
combined_sizes

Unnamed: 0_level_0,combined_sizes
recordnumber,Unnamed: 1_level_1
20191592,31.000000
20200162,145.916121
20191864,3.000000
20200903,145.916121
20200891,145.916121
...,...
20200311,31.000000
20192258,3.000000
20191013,31.000000
20200552,301.000000


### Final selection of X parameters

In [6]:
selected_X_parameters = ['Amal','Hezbollah','ProgressiveSocialistMovement']
selected_X_parameters

['Amal', 'Hezbollah', 'ProgressiveSocialistMovement']

## X Data

In [7]:
X = df[selected_X_parameters]  
X = pd.concat([X,combined_sizes],axis=1)
X

Unnamed: 0_level_0,Amal,Hezbollah,ProgressiveSocialistMovement,combined_sizes
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20191592,0,0,0,31.000000
20200162,0,0,0,145.916121
20191864,0,0,0,3.000000
20200903,0,0,0,145.916121
20200891,0,0,0,145.916121
...,...,...,...,...
20200311,0,0,0,31.000000
20192258,0,0,0,3.000000
20191013,0,0,0,31.000000
20200552,0,0,0,301.000000


# y Encoding

In [8]:
y = pd.get_dummies(df['repression'])
y

Unnamed: 0_level_0,Army present at event,Arrests / detentions,Deaths inflicted,Injuries inflicted,"No known coercion, no security presence",Party Militias/ Baltagia present at event,Physical harassment,Security forces present at event
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
20191592,0,0,0,0,1,0,0,0
20200162,0,0,0,0,1,0,0,0
20191864,0,0,0,0,1,0,0,0
20200903,0,0,0,0,1,0,0,0
20200891,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...
20200311,0,0,0,0,1,0,0,0
20192258,1,0,0,0,0,0,0,0
20191013,0,0,0,0,1,0,0,0
20200552,0,0,0,0,1,0,0,0


## Remove spaces from column headings

In [9]:
y.columns = y.columns.str.replace(' ', '')
y

Unnamed: 0_level_0,Armypresentatevent,Arrests/detentions,Deathsinflicted,Injuriesinflicted,"Noknowncoercion,nosecuritypresence",PartyMilitias/Baltagiapresentatevent,Physicalharassment,Securityforcespresentatevent
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
20191592,0,0,0,0,1,0,0,0
20200162,0,0,0,0,1,0,0,0
20191864,0,0,0,0,1,0,0,0
20200903,0,0,0,0,1,0,0,0
20200891,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...
20200311,0,0,0,0,1,0,0,0
20192258,1,0,0,0,0,0,0,0
20191013,0,0,0,0,1,0,0,0
20200552,0,0,0,0,1,0,0,0


# Combined X and y

In [10]:
data = pd.concat([X,y],axis=1)
data

Unnamed: 0_level_0,Amal,Hezbollah,ProgressiveSocialistMovement,combined_sizes,Armypresentatevent,Arrests/detentions,Deathsinflicted,Injuriesinflicted,"Noknowncoercion,nosecuritypresence",PartyMilitias/Baltagiapresentatevent,Physicalharassment,Securityforcespresentatevent
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20191592,0,0,0,31.000000,0,0,0,0,1,0,0,0
20200162,0,0,0,145.916121,0,0,0,0,1,0,0,0
20191864,0,0,0,3.000000,0,0,0,0,1,0,0,0
20200903,0,0,0,145.916121,0,0,0,0,1,0,0,0
20200891,0,0,0,145.916121,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20200311,0,0,0,31.000000,0,0,0,0,1,0,0,0
20192258,0,0,0,3.000000,1,0,0,0,0,0,0,0
20191013,0,0,0,31.000000,0,0,0,0,1,0,0,0
20200552,0,0,0,301.000000,0,0,0,0,1,0,0,0


In [11]:
data.isna().sum()

Amal                                    0
Hezbollah                               0
ProgressiveSocialistMovement            0
combined_sizes                          0
Armypresentatevent                      0
Arrests/detentions                      0
Deathsinflicted                         0
Injuriesinflicted                       0
Noknowncoercion,nosecuritypresence      0
PartyMilitias/Baltagiapresentatevent    0
Physicalharassment                      0
Securityforcespresentatevent            0
dtype: int64

# Train/Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=17)

In [13]:
data_train = pd.concat([X_train,y_train],axis=1)
data_train

Unnamed: 0_level_0,Amal,Hezbollah,ProgressiveSocialistMovement,combined_sizes,Armypresentatevent,Arrests/detentions,Deathsinflicted,Injuriesinflicted,"Noknowncoercion,nosecuritypresence",PartyMilitias/Baltagiapresentatevent,Physicalharassment,Securityforcespresentatevent
recordnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20190252,0,0,0,3.000000,0,0,0,0,0,0,1,0
20200722,0,0,0,145.916121,0,0,0,0,1,0,0,0
20200872,0,0,0,145.916121,0,0,0,0,1,0,0,0
20190346,0,0,0,3.000000,0,0,0,0,1,0,0,0
20190108,0,0,0,145.916121,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20191765,0,0,0,31.000000,0,0,0,0,1,0,0,0
20191706,0,0,0,301.000000,0,0,0,0,1,0,0,0
20200309,0,0,0,3.000000,0,0,0,0,1,0,0,0
20192257,0,0,0,145.916121,1,0,0,0,0,0,0,0


## Remove spaces from column names

# Logistic Regression

In [14]:
model = smf.logit(formula='Armypresentatevent ~ Amal + Hezbollah + ProgressiveSocialistMovement + combined_sizes', data=data_train).fit()

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


         Current function value: inf
         Iterations: 35


LinAlgError: Singular matrix