# Data Analysis (Classification): Train-Test Split

## Import Libraries

In [1]:
# Import Required Modules and Packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import sys

import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
# Switch to the user's home directory so that the relative 'Project/...' paths
# used by the cells below resolve. The explicit `%cd` magic (no argument = home)
# is used instead of bare `cd`: automagic is only applied to single-line cells
# and can be disabled in the IPython configuration.
%cd

/root


## Load Data

In [3]:
# Upload the X and y Data ('..' entries in the CSVs are read as missing values)
final_data_x = pd.read_csv('Project/Cleaned/final_data_x.csv', na_values = ['..'])
final_data_y_categorical = pd.read_csv('Project/Cleaned/final_data_y_categorical.csv', na_values = ['..'])

# Drop Column
# 'Unnamed: 0' is the unnamed first column of each CSV; reassign rather than
# mutate in place so each statement produces the frame it names.
final_data_x = final_data_x.drop(columns=['Unnamed: 0'])
final_data_y_categorical = final_data_y_categorical.drop(columns=['Unnamed: 0'])

# Set Indices
final_data_x = final_data_x.set_index(['country', 'year'])

# Load Dictionary
# The .npy file holds a pickled Python object, so pickling has to be enabled;
# `.item()` unwraps the 0-d object array into the stored object.
# allow_pickle must be the boolean True: the previous string 'TRUE' only worked
# because any non-empty string is truthy.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
dictionary_series = np.load('Project/Cleaned/dictionary_series.npy', allow_pickle=True).item()

In [4]:
# Shape of Data
# Report (rows, columns) for the feature frame and the categorical outcome frame.
for shape_label, frame in (
    ("Shape of Data (Features): ", final_data_x),
    ("Shape of Data (Outcome - Categorical): ", final_data_y_categorical),
):
    print(shape_label, frame.shape)

Shape of Data (Features):  (4992, 714)
Shape of Data (Outcome - Categorical):  (4992, 1)


## Outcome Variables - Categorical
For the purposes of this project, supervised machine learning classification models will be used to predict the binary value of `categorical_conflict_instances`, where 0 means that no conflict events of any kind occurred and 1 means that at least one conflict event of any kind occurred.

## Scale Data

In [5]:
# Scale Data
# Standardise every feature column and rebuild a DataFrame with the original
# column labels.
# NOTE(review): no `index=` is passed, so the scaled frame gets a default
# RangeIndex rather than the (country, year) index of final_data_x.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed later in the notebook - consider fitting on the training rows only.
feature_scaler = StandardScaler()
names = final_data_x.columns
final_data_x_scaled = pd.DataFrame(
    feature_scaler.fit_transform(final_data_x),
    columns=names,
)

## Feature Selection
### ANOVA F-Value 
ANOVA (Analysis of Variance) tests whether the means of two or more groups differ significantly from each other. Here, the ANOVA F-value of each feature across the outcome classes is used to score the features and select which ones to include in the model.

In [6]:
# Define feature selection
# Keep the 20 features with the highest ANOVA F-value against the outcome.
# NOTE(review): selection is fit on all rows, before the train/test split
# performed later in the notebook.
outcome_flat = final_data_y_categorical.values.ravel()  # 1-D target array
featureselection = SelectKBest(score_func=f_classif, k=20)
X_selectedfeatures = featureselection.fit_transform(final_data_x_scaled, outcome_flat)

selected_shape = X_selectedfeatures.shape
print('Shape of X_selectedfeatures:       ', selected_shape)
print('Selected Features (ANOVA F-Value): ', selected_shape[1])

Shape of X_selectedfeatures:        (4992, 20)
Selected Features (ANOVA F-Value):  20


In [7]:
# Feature Columns
feature_columns = final_data_x_scaled.columns

# Full-width frame rebuilt from the selected features; inverse_transform fills
# the columns that were not selected with zeros. Kept so the `selected_features`
# name stays available to any later cell.
selected_features = pd.DataFrame(featureselection.inverse_transform(X_selectedfeatures),
                                 index=final_data_x_scaled.index,
                                 columns=feature_columns)

# Selected Feature Columns
# Read the fitted selector's boolean mask directly. Inferring the selection from
# "variance != 0" on the zero-filled frame would silently drop any selected
# feature that itself has zero variance.
selected_columns = feature_columns[featureselection.get_support()]
selected_columns

Index(['EG.CFT.ACCS.ZS', 'EG.ELC.ACCS.RU.ZS', 'EG.ELC.ACCS.UR.ZS',
       'EG.FEC.RNEW.ZS', 'IT.MLT.MAIN.P2', 'SE.PRM.CMPT.FE.ZS',
       'SE.PRM.ENRL.TC.ZS', 'SE.PRM.NENR', 'SE.PRM.TCHR.FE.ZS',
       'SE.SEC.CMPT.LO.FE.ZS', 'SE.SEC.ENRR', 'SE.SEC.NENR', 'SH.ANM.ALLW.ZS',
       'SH.DTH.COMM.ZS', 'SH.DYN.0514', 'SH.STA.BRTC.ZS', 'SL.AGR.EMPL.FE.ZS',
       'SP.ADO.TFRT', 'TM.VAL.MRCH.R6.ZS', 'TX.VAL.MRCH.R6.ZS'],
      dtype='object')

In [8]:
# Look up the descriptive metadata stored for one of the selected series codes
dictionary_series['EG.ELC.ACCS.RU.ZS']

{'Series Name': 'Access to electricity, rural (% of rural population)'}

In [9]:
# Filter out Unselected Feature Columns
# Keep every row, but only the columns chosen by the feature selection step.
final_data_x_scaled_filtered = final_data_x_scaled.loc[:, selected_columns]

In [10]:
# Shape of Data
# (rows, columns) of the scaled frame after dropping the unselected features.
filtered_shape = final_data_x_scaled_filtered.shape
print('Shape of Data (Selected Feature Columns):', filtered_shape)

Shape of Data (Selected Feature Columns): (4992, 20)


## Train-Test Data

In [11]:
# Train-Test Split
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified on y - confirm that is intended.
X, y = final_data_x_scaled_filtered, final_data_y_categorical
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

# Shape of Train and Test Data
for split_label, split_part in (
    ('X_test:', X_test),
    ('y_test:', y_test),
    ('X_train:', X_train),
    ('y_train:', y_train),
):
    print(split_label, split_part.shape)

X_test: (999, 20)
y_test: (999, 1)
X_train: (3993, 20)
y_train: (3993, 1)


## Save Train-Test Data

In [12]:
# Save Data
# Write the full X/y frames and each train/test partition to its own CSV file
# (default to_csv settings, so each frame's index is written as well).
for frame_to_save, destination in (
    (X, 'Project/Cleaned/Classification/X.csv'),
    (y, 'Project/Cleaned/Classification/y.csv'),
    (X_train, 'Project/Cleaned/Classification/X_train.csv'),
    (X_test, 'Project/Cleaned/Classification/X_test.csv'),
    (y_train, 'Project/Cleaned/Classification/y_train.csv'),
    (y_test, 'Project/Cleaned/Classification/y_test.csv'),
):
    frame_to_save.to_csv(destination)