## Read in data

As this segment of the analysis focuses on Classification Algorithms, the cleaned dataset containing reduced columns is considered for model efficiency. The county names are retrieved from the initial data.

In [1]:
import pandas as pd

In [2]:
data3 = pd.read_csv('complete_data.csv', index_col=0, low_memory=False)
data3.head(3)

Unnamed: 0,area,county,area_code,pension,Long_Dist_Income,gross_income,short_dist_income,Social Welfare,"Agriculture, Forestry and Fishing (A) (%)",Construction (F) (%),...,T1_1AGE45_49F,T1_1AGE50_54F,T1_1AGE55_59F,T1_1AGE60_64F,T1_1AGE65_69F,T1_1AGE70_74F,T1_1AGE75_79F,T1_1AGE80_84F,T1_1AGEGE_85F,T1_1AGETF
0,Carlow Urban,Co. Carlow,1,15.8,35617,105943,22520.0,29.4,1.1,5.0,...,133,125,106,100,102,84,92,55,59,2195
1,Graigue Urban,Co. Carlow,2,20.4,31854,101740,21180.0,26.3,2.0,8.3,...,50,43,43,32,31,30,19,33,17,694
2,Clonmore,Co. Carlow,3,17.0,43499,96130,25389.0,12.1,10.6,8.9,...,21,21,22,13,8,5,12,6,5,268


In [3]:
gi = data3[['gross_income']]
gi.head(3)

Unnamed: 0,gross_income
0,105943
1,101740
2,96130


In [4]:
county = data3[['county']]
county.head(3)

Unnamed: 0,county
0,Co. Carlow
1,Co. Carlow
2,Co. Carlow


In [6]:
data2 = pd.read_csv('Cleaned__complete_data.csv', index_col=0)
data2.head(3)

Unnamed: 0,pension,Long_Dist_Income,"Agriculture, Forestry and Fishing (A) (%)",Construction (F) (%),"Financial, Real Estate, Administrative and Services (K,L,N,S) (%)","Industry (B,C,D,E) (%)","Public Service, Education and Health (O,P,Q) (%)","Wholesale, Transport and Accomodation (G,H,I) (%)",T1_1AGE0T,T6_1_CM_H,T6_5_PCH,T6_6_GSLA,T6_6_OP,T6_7_OTH,T6_8_UHH,T8_1_OTHM,T9_2_HJ,gross_income
0,15.8,35617.0,1.1,5.0,14.0,12.6,30.9,25.3,43,1,13,31,4,4,1.0,12,3,105943
1,20.4,31854.0,2.0,8.3,16.2,12.7,25.7,26.9,16,1,4,9,0,3,7.0,3,3,101740
2,17.0,43499.0,10.6,8.9,13.3,13.9,27.3,13.7,8,1,1,2,157,2,9.0,2,2,96130


## Get classification data

In [7]:
data2.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

Select all non-numeric data

In [9]:
data = data3.select_dtypes('O')
data.head(3)

Unnamed: 0,area,county,area_code,pension,Long_Dist_Income,short_dist_income,Social Welfare,"Agriculture, Forestry and Fishing (A) (%)",Construction (F) (%),"Financial, Real Estate, Administrative and Services (K,L,N,S) (%)",...,T1_1AGE40_44M,T1_1AGE45_49M,T1_1AGETM,T1_1AGE20_24F,T1_1AGE25_29F,T1_1AGE30_34F,T1_1AGE35_39F,T1_1AGE40_44F,T1_1AGE45_49F,T1_1AGETF
0,Carlow Urban,Co. Carlow,1,15.8,35617,22520.0,29.4,1.1,5.0,14.0,...,167,145,2365,255,201,165,152,125,133,2195
1,Graigue Urban,Co. Carlow,2,20.4,31854,21180.0,26.3,2.0,8.3,16.2,...,38,36,711,45,94,54,46,30,50,694
2,Clonmore,Co. Carlow,3,17.0,43499,25389.0,12.1,10.6,8.9,13.3,...,20,16,258,11,14,18,17,17,21,268


In [10]:
data.dtypes.unique()

array([dtype('O')], dtype=object)

The areas of interest are the area descriptions

In [11]:
data = data[['area', 'county', 'GUID', 'GEOGID', 'GEOGDESC']]
data.head(3)

Unnamed: 0,area,county,GUID,GEOGID,GEOGDESC
0,Carlow Urban,Co. Carlow,2AE19629185813A3E055000000000001,ED3409_01001,Carlow Urban
1,Graigue Urban,Co. Carlow,2AE196291A5913A3E055000000000001,ED3409_01002,Graigue Urban
2,Clonmore,Co. Carlow,2AE19629186413A3E055000000000001,ED3409_01003,Clonmore


Add target variable to dataset

In [12]:
data.insert(5, 'gross_income', gi, allow_duplicates = False)
data.head(3)

Unnamed: 0,area,county,GUID,GEOGID,GEOGDESC,gross_income
0,Carlow Urban,Co. Carlow,2AE19629185813A3E055000000000001,ED3409_01001,Carlow Urban,105943
1,Graigue Urban,Co. Carlow,2AE196291A5913A3E055000000000001,ED3409_01002,Graigue Urban,101740
2,Clonmore,Co. Carlow,2AE19629186413A3E055000000000001,ED3409_01003,Clonmore,96130


Area and GEODESC columns look similar. They are checked to see if they are identical.

In [13]:
data['area'].equals(data['GEOGDESC'])

False

In [14]:
data.query('area != GEOGDESC')

Unnamed: 0,area,county,GUID,GEOGID,GEOGDESC,gross_income
0,Carlow Urban,Co. Carlow,2AE19629185813A3E055000000000001,ED3409_01001,Carlow Urban,105943
11,Ballinacarrig (Part Urban),Co. Carlow,2AE19629183713A3E055000000000001,ED3409_01012,Ballinacarrig,87584
18,Carlow Rural (Part Urban),Co Carlow,2AE19629185713A3E055000000000001,ED3409_01019,Carlow Rural,82500
379,Athy Rural (Part Rural),Co. Kildare,2AE19629223B13A3E055000000000001,ED3409_06004,Athy Rural,58160
724,Longford Rural (Part Urban),Co. Longford,2AE19629192713A3E055000000000001,ED3409_09051,Longford Rural,52682
751,Castletown (Part Urban),Co. Louth,2AE19629196913A3E055000000000001,ED3409_10023,Castletown,52342
755,Dundalk Rural (Part Urban),Co. Louth,2AE1962918E213A3E055000000000001,ED3409_10027,Dundalk Rural,52236
758,Haggardstown (Part Urban),Co. Louth,2AE1962918F913A3E055000000000001,ED3409_10030,Haggardstown,52221
769,St. Peter's (Part Urban),Co. Louth,2AE19629192A13A3E055000000000001,ED3409_10041,St. Peter's,52091
797,Ceannanus Mór (Kells) Rural (Part Urban),Co. Meath,2AE1962918BE13A3E055000000000001,ED3409_11026,Ceannanus Mór (Kells) Rural,51795


There are only a handful of recors where the two columns are not identical. And in these cases area is just a slightly more detailed metric for GEODESC. To reduce data redundancy area is dropped.

In [15]:
data.drop(columns=data.columns[1], axis=1, inplace=True)
data.head(3)

Unnamed: 0,area,GUID,GEOGID,GEOGDESC,gross_income
0,Carlow Urban,2AE19629185813A3E055000000000001,ED3409_01001,Carlow Urban,105943
1,Graigue Urban,2AE196291A5913A3E055000000000001,ED3409_01002,Graigue Urban,101740
2,Clonmore,2AE19629186413A3E055000000000001,ED3409_01003,Clonmore,96130


Ensure that there are no values where the target variable is 0.

In [18]:
x = data['gross_income'] < 0
import numpy as np
np.where(x)[0]

array([], dtype=int64)

##Put income into binned categories for classification

In [20]:
data['Binned Income'] = pd.cut(data['gross_income'], 5, labels=False)
data.head(3)

Unnamed: 0,area,GUID,GEOGID,GEOGDESC,gross_income,Binned Income
0,Carlow Urban,2AE19629185813A3E055000000000001,ED3409_01001,Carlow Urban,105943,4
1,Graigue Urban,2AE196291A5913A3E055000000000001,ED3409_01002,Graigue Urban,101740,4
2,Clonmore,2AE19629186413A3E055000000000001,ED3409_01003,Clonmore,96130,4


Make bins more understandable

In [21]:
dict1 = {4: 'High', 3: 'MedHigh', 2: 'Med', 1: 'MedLow', 0: 'Low'}
data = data.replace({'Binned Income': dict1})
data.head(3)

Unnamed: 0,area,GUID,GEOGID,GEOGDESC,gross_income,Binned Income
0,Carlow Urban,2AE19629185813A3E055000000000001,ED3409_01001,Carlow Urban,105943,High
1,Graigue Urban,2AE196291A5913A3E055000000000001,ED3409_01002,Graigue Urban,101740,High
2,Clonmore,2AE19629186413A3E055000000000001,ED3409_01003,Clonmore,96130,High


Drop gross_income column as it is now represented in Binned Income

In [22]:
data.drop(columns=data.columns[4], axis=1, inplace=True)
data.head(3)

Unnamed: 0,area,GUID,GEOGID,GEOGDESC,Binned Income
0,Carlow Urban,2AE19629185813A3E055000000000001,ED3409_01001,Carlow Urban,High
1,Graigue Urban,2AE196291A5913A3E055000000000001,ED3409_01002,Graigue Urban,High
2,Clonmore,2AE19629186413A3E055000000000001,ED3409_01003,Clonmore,High


##NA Check

In [23]:
data.isna().sum()

Unnamed: 0,0
area,0
GUID,0
GEOGID,0
GEOGDESC,0
Binned Income,0


##Save prepared data

In [25]:
data

Unnamed: 0,area,GUID,GEOGID,GEOGDESC,Binned Income
0,Carlow Urban,2AE19629185813A3E055000000000001,ED3409_01001,Carlow Urban,High
1,Graigue Urban,2AE196291A5913A3E055000000000001,ED3409_01002,Graigue Urban,High
2,Clonmore,2AE19629186413A3E055000000000001,ED3409_01003,Clonmore,High
3,Hacketstown,2AE19629187F13A3E055000000000001,ED3409_01004,Hacketstown,High
4,Haroldstown,2AE19629188713A3E055000000000001,ED3409_01005,Haroldstown,High
...,...,...,...,...,...
3404,Shanmullagh,2AE196291AC213A3E055000000000001,ED3409_34066,Shanmullagh,Low
3405,Sheskin,2AE196291AC313A3E055000000000001,ED3409_34067,Sheskin,Low
3406,Tedavnet,2AE196291AC513A3E055000000000001,ED3409_34068,Tedavnet,Low
3407,Tehallan,2AE196291ACF13A3E055000000000001,ED3409_34069,Tehallan,Low


In [26]:
data.to_csv('Classification_data.csv')