# Data Preparation: Feature Selection

## Import Libraries

In [1]:
# Import Required Modules and Packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import sys

from sklearn.feature_selection import VarianceThreshold

In [2]:
cd

/root


## Load Data

In [3]:
# Upload the Final Data
final_data = pd.read_csv('Project/Cleaned/final_data.csv', na_values = ['..'])
final_data.drop(['Unnamed: 0'], axis=1, inplace=True)

## Feature Selection

In [4]:
# Randomly Select 1000 Observation
#final_data = final_data.sample(n = 1000) 
#final_data = final_data.reset_index(drop=True)

In [5]:
# Country-Year, Features, Outcome Data
final_data_country_year = final_data.iloc[0:,:2]
final_data_features = final_data.iloc[0:,2:-14]
final_data_outcome = final_data.iloc[0:,-14:]

In [6]:
# Dropping Features with High Correlation
correlated_features = set()
correlation_matrix = final_data_features.corr()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
final_data_features.drop(labels=correlated_features, axis=1, inplace=True)

print('Number of Correlated Features: ', len(correlated_features))
print("Shape of Data (Final): ",final_data_features.shape)

Number of Correlated Features:  685
Shape of Data (Final):  (4992, 740)


In [7]:
final_data_features.head()

Unnamed: 0,AG.AGR.TRAC.NO,AG.CON.FERT.PT.ZS,AG.CON.FERT.ZS,AG.LND.AGRI.K2,AG.LND.AGRI.ZS,AG.LND.ARBL.HA.PC,AG.LND.ARBL.ZS,AG.LND.CROP.ZS,AG.LND.EL5M.RU.K2,AG.LND.EL5M.RU.ZS,...,per_allsp.ben_q1_tot,per_allsp.cov_pop_tot,per_lm_alllm.adq_pop_tot,per_lm_alllm.ben_q1_tot,per_lm_alllm.cov_pop_tot,per_sa_allsa.adq_pop_tot,per_sa_allsa.ben_q1_tot,per_sa_allsa.cov_pop_tot,per_si_allsi.ben_q1_tot,per_si_allsi.cov_pop_tot
0,110.0,,,377900.0,57.883773,0.397011,11.771283,0.160831,,,...,,,,,,,,,,
1,110.0,,,378670.0,58.001716,0.393003,11.881567,0.168489,,,...,,,,,,,,,,
2,110.0,,,377530.0,57.827099,0.379409,11.722268,0.153172,,,...,,,,,,,,,,
3,110.0,,,377530.0,57.827099,0.369731,11.76822,0.107221,,,...,,,,,,,,,,
4,110.0,,,377530.0,57.827099,0.355579,11.76822,0.107221,,,...,,,,,,,,,,


In [8]:
# Removing Columns with Low Variance
selector = VarianceThreshold(threshold=.5)
selector.fit(final_data_features)
final_data_features = final_data_features[final_data_features.columns[selector.get_support(indices=True)]]

print("Shape of Data (Final): ", final_data_features.shape)

Shape of Data (Final):  (4992, 714)


In [9]:
# Removing Columns with More than 20% missing values
threshold = len(final_data_features) * .2
final_data_features.dropna(thresh = threshold, axis = 1, inplace = True)
final_data_features = final_data_features.fillna(final_data_features.mean())

print("Shape of Data (Final): ",final_data_features.shape)

Shape of Data (Final):  (4992, 714)


In [10]:
# Final Cleaned and Processed Dataset
final_data_features = final_data_features.astype(int)
final_data_outcome = final_data_outcome.astype(float)
final_data = pd.concat([final_data_country_year, final_data_features], axis=1)
final_data = pd.concat([final_data, final_data_outcome], axis=1)

# Set Indices
final_data = final_data.set_index(['country', 'year'])
final_data

Unnamed: 0_level_0,Unnamed: 1_level_0,AG.AGR.TRAC.NO,AG.CON.FERT.PT.ZS,AG.CON.FERT.ZS,AG.LND.AGRI.K2,AG.LND.AGRI.ZS,AG.LND.ARBL.ZS,AG.LND.CROP.ZS,AG.LND.EL5M.RU.K2,AG.LND.EL5M.RU.ZS,AG.LND.EL5M.UR.K2,...,conflict_instances_event_5,conflict_instances_event_6,conflict_fatalities_event_1,conflict_fatalities_event_2,conflict_fatalities_event_3,conflict_fatalities_event_4,conflict_fatalities_event_5,conflict_fatalities_event_6,conflict_instances_total,conflict_fatalities_total
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,1997.0,110,3436,432,377900,57,11,0,6642,4,673,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Afghanistan,1998.0,110,3436,432,378670,58,11,0,6642,4,673,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Afghanistan,1999.0,110,3436,432,377530,57,11,0,6642,4,673,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Afghanistan,2000.0,110,3436,432,377530,57,11,0,6642,4,673,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Afghanistan,2001.0,110,3436,432,377530,57,11,0,6642,4,673,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,2015.0,22496,144,22,162000,41,10,0,1597,0,125,...,6.0,97.0,0.0,0.0,0.0,12.0,0.0,7.0,264.0,19.0
Zimbabwe,2016.0,22496,144,22,162000,41,10,0,1597,0,125,...,14.0,72.0,0.0,0.0,0.0,0.0,0.0,6.0,228.0,6.0
Zimbabwe,2017.0,22496,144,22,162000,41,10,0,1597,0,125,...,15.0,151.0,5.0,0.0,0.0,2.0,0.0,2.0,298.0,9.0
Zimbabwe,2018.0,22496,144,22,162000,41,10,0,1597,0,125,...,19.0,153.0,5.0,2.0,0.0,10.0,0.0,7.0,344.0,24.0


In [11]:
# Shape of Data
print("Shape of Data (Final): ", final_data.shape)

Shape of Data (Final):  (4992, 728)


## Save Pre-processed Data

In [12]:
# Save Data
final_data.to_csv('Project/Cleaned/final_data_preprocessed.csv')