In [1]:
import pandas as pd 
import numpy as np

# 1. Data Ingestion
Load the data using the pandas function `read_excel`.

In [11]:
# The first sheet of the workbook (index 0, which is also read_excel's default)
# holds the data dictionary displayed below.
metadata = pd.read_excel("./Data/Election_Data.xlsx", sheet_name=0)
metadata

Unnamed: 0,**Data Dictionary**
0,
1,1. vote: Party choice: Conservative or Labour
2,
3,2. age: in years
4,
5,3. economic.cond.national: Assessment of curre...
6,
7,4. economic.cond.household: Assessment of curr...
8,
9,"5. Blair: Assessment of the Labour leader, 1 t..."


| Column                      | Description |
|-----------------------------|-------------|
| vote                        | Party choice: Conservative or Labour |
| age                         | Age in years |
| economic.cond.national      | Assessment of current national economic conditions, 1 to 5 |
| economic.cond.household     | Assessment of current household economic conditions, 1 to 5 |
| Blair                       | Assessment of the Labour leader, 1 to 5 |
| Hague                       | Assessment of the Conservative leader, 1 to 5 |
| Europe                      | An 11-point scale that measures respondents' attitudes toward European integration. High scores represent ‘Eurosceptic’ sentiment |
| political.knowledge         | Knowledge of parties' positions on European integration, 0 to 3 |
| gender                      | Female or male |


In [3]:
# The observations are stored on a named sheet of the same workbook.
df = pd.read_excel(
    "./Data/Election_Data.xlsx",
    sheet_name="Election_Dataset_Two Classes",
)

In [4]:
df  # bare frame: pandas abbreviates the display to the first and last rows

Unnamed: 0.1,Unnamed: 0,vote,age,economic.cond.national,economic.cond.household,Blair,Hague,Europe,political.knowledge,gender
0,1,Labour,43,3,3,4,1,2,2,female
1,2,Labour,36,4,4,4,4,5,2,male
2,3,Labour,35,4,4,5,2,3,2,male
3,4,Labour,24,4,2,2,1,4,0,female
4,5,Labour,41,2,2,1,1,6,2,male
...,...,...,...,...,...,...,...,...,...,...
1520,1521,Conservative,67,5,3,2,4,11,3,male
1521,1522,Conservative,73,2,2,4,4,8,2,male
1522,1523,Labour,37,3,3,5,4,2,2,male
1523,1524,Conservative,61,3,3,1,4,11,2,male


# 2. Data Preparation

- We'll check for null values and for columns that are not required.

In [5]:
df.shape  # (number of rows, number of columns)

(1525, 10)

In [6]:
df.info()  # dtype and non-null count of every column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1525 entries, 0 to 1524
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               1525 non-null   int64 
 1   vote                     1525 non-null   object
 2   age                      1525 non-null   int64 
 3   economic.cond.national   1525 non-null   int64 
 4   economic.cond.household  1525 non-null   int64 
 5   Blair                    1525 non-null   int64 
 6   Hague                    1525 non-null   int64 
 7   Europe                   1525 non-null   int64 
 8   political.knowledge      1525 non-null   int64 
 9   gender                   1525 non-null   object
dtypes: int64(8), object(2)
memory usage: 119.3+ KB


- We can see that `Unnamed: 0` is a serial-number column, which is of no use. We'll remove that column.
- There are two columns of type object: `vote` and `gender`.
- `vote` is our target variable: we need to classify each new data point into one of the classes given in `vote`.

In [13]:
df["vote"].value_counts()  # class balance of the target column

vote
Labour          1063
Conservative     462
Name: count, dtype: int64

In [7]:
# Drop the serial-number column. `columns=` already selects the axis, so the
# redundant `axis=1` is removed. Reassigning instead of using inplace=True, and
# passing errors="ignore", keeps this cell safe to re-run: a second execution
# is a no-op rather than a KeyError for the already-missing column.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [8]:
def data_quality_check(df):
    """Print the number of null values found in each column of ``df``.

    Parameters
    ----------
    df
        Object exposing ``isnull()``; in this notebook, a pandas DataFrame.

    Returns
    -------
    None
        The counts are written to standard output only; nothing is returned.
    """
    heading = "Number of null values"
    print(heading)
    # isnull() flags missing cells; summing the flags gives a per-column count.
    null_counts = df.isnull().sum()
    print(null_counts)
    
    

In [9]:
data_quality_check(df)  # prints the null count of every remaining column

Number of null values
vote                       0
age                        0
economic.cond.national     0
economic.cond.household    0
Blair                      0
Hague                      0
Europe                     0
political.knowledge        0
gender                     0
dtype: int64


- There are no null values in any column, so no imputation is needed. We can move on to EDA.

In [10]:
df.head()  # first five rows, to confirm "Unnamed: 0" is no longer a column

Unnamed: 0,vote,age,economic.cond.national,economic.cond.household,Blair,Hague,Europe,political.knowledge,gender
0,Labour,43,3,3,4,1,2,2,female
1,Labour,36,4,4,4,4,5,2,male
2,Labour,35,4,4,5,2,3,2,male
3,Labour,24,4,2,2,1,4,0,female
4,Labour,41,2,2,1,1,6,2,male


In [None]:
df.describe()  # Summary statistics; only the numeric columns are included by default

Unnamed: 0,age,economic.cond.national,economic.cond.household,Blair,Hague,Europe,political.knowledge
count,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0
mean,54.182295,3.245902,3.140328,3.334426,2.746885,6.728525,1.542295
std,15.711209,0.880969,0.929951,1.174824,1.230703,3.297538,1.083315
min,24.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,41.0,3.0,3.0,2.0,2.0,4.0,0.0
50%,53.0,3.0,3.0,4.0,2.0,6.0,2.0
75%,67.0,4.0,4.0,4.0,4.0,10.0,2.0
max,93.0,5.0,5.0,5.0,5.0,11.0,3.0
