In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot

# Data : Portugese elections

In the paper Pereira (2018) (https://repositorio-aberto.up.pt/bitstream/10216/111219/2/258366.pdf), consideration of the following political parties (without alliances) :

• BE (left-wing)

• CDS-PP (conservative)

• PCP-PEV (left-wing)

• PPD/PSD (center-right)

• PS (center-left)

### Raw data

In [2]:
# encoding error
portugal_2009 = pd.read_csv("data_new/lr_portugal_2009.csv", sep=";", encoding="cp860")
portugal_2009.head()

Unnamed: 0,Regions,Votes Total,BE,CDS-PP,CDS-PP-MPT,CDS-PP-MPT-PPM,CDS-PP-PPD/PSD,Grupo de cidadaos,MPT,PAN,...,%Employees per Sector Administration and Services,Employees per Economic Sector Banks,%Employees per Economic Sector Banks,Employees per Economic Sector Others,%Employees per Economic Sector Others,Companies Dissolved,Unemployment Total,%Unemployment Total,Unemployment <25,%Unemployment <25
0,Arcos de Valdevez,14683,0,973,0,0,0,0,0,0,...,-1,59,-1,0,-1,14,900.5,-1,147.9,-1
1,Caminha,10980,256,0,0,0,0,0,0,0,...,-1,53,-1,0,-1,17,729.9,-1,85.9,-1
2,Melgaco,5689,0,0,0,0,0,0,0,0,...,-1,29,-1,0,-1,4,165.7,-1,39.4,-1
3,Moncao,11851,0,0,0,0,0,0,0,0,...,-1,47,-1,0,-1,23,501.7,-1,71.8,-1
4,Paredes de Coura,6299,0,0,0,0,0,0,0,0,...,-1,20,-1,0,-1,10,398.6,-1,52.6,-1


In [3]:
portugal_2013 = pd.read_csv("data_new/lr_portugal_2013.csv", sep=";", encoding="cp860")
portugal_2013 = portugal_2013.rename(columns={"∩╗┐Regions":"Regions"})
portugal_2013.head()

Unnamed: 0,Regions,Votes Total,BE,CDS-PP,CDS-PP-MPT,CDS-PP-MPT-PPM,CDS-PP-PPD/PSD,Grupo de cidadaos,MPT,PAN,...,%Employees per Sector Administration and Services,Employees per Economic Sector Banks,%Employees per Economic Sector Banks,Employees per Economic Sector Others,%Employees per Economic Sector Others,Companies Dissolved,Unemployment Total,%Unemployment Total,Unemployment <25,%Unemployment <25
0,Arcos de Valdevez,13872,0,1687,0,0,0,0,0,0,...,0.032339,52,0.010849,0,0.0,17,1179.8,0.052944,201.2,0.170537379
1,Caminha,10573,0,0,0,0,0,0,0,0,...,0.089518,46,0.011732,0,0.0,16,963.0,0.058488,98.3,0.102076843
2,Melgaco,4855,0,0,0,0,0,0,0,0,...,0.030427,25,0.014628,0,0.0,2,250.6,0.02824,49.1,0.195929769
3,Moncao,11909,0,2221,0,0,0,0,0,0,...,0.044062,41,0.008211,0,0.0,19,727.2,0.038413,124.2,0.170792079
4,Paredes de Coura,6090,0,131,0,0,0,0,0,0,...,0.107837,19,0.011912,0,0.0,9,613.9,0.067917,102.9,0.167616876


In [4]:
portugal_2017 = pd.read_csv("data_new/lr_portugal_2017.csv", sep=";", encoding="cp860")
portugal_2017.head()

Unnamed: 0,Regions,Votes Total,BE,CDS-PP,CDS-PP-MPT,CDS-PP-MPT-PPM,CDS-PP-PPD/PSD,Grupo de cidadaos,MPT,PAN,...,%Employees per Sector Administration and Services,Employees per Economic Sector Banks,%Employees per Economic Sector Banks,Employees per Economic Sector Others,%Employees per Economic Sector Others,Companies Dissolved,Unemployment Total,%Unemployment Total,Unemployment <25,%Unemployment <25
0,Arcos de Valdevez,13550,0,802,0,0,0,0,0,0,...,-1,58,-1,0,-1,47,787.3,-1,129.6,-1
1,Caminha,10346,0,139,0,0,0,0,0,0,...,-1,39,-1,0,-1,33,646.3,-1,71.3,-1
2,Melgaco,4819,0,0,0,0,0,0,0,0,...,-1,21,-1,0,-1,10,196.8,-1,30.0,-1
3,Moncao,11789,0,605,0,0,0,0,0,0,...,-1,43,-1,0,-1,34,506.6,-1,60.0,-1
4,Paredes de Coura,5353,0,0,0,0,0,0,0,0,...,-1,14,-1,0,-1,5,320.3,-1,40.8,-1


#### check consistency between files

all columns are the same

In [5]:
print(np.all(portugal_2009.columns == portugal_2013.columns))
print(np.all(portugal_2009.columns == portugal_2017.columns))
print(np.all(portugal_2013.columns == portugal_2017.columns))

True
True
True


check if the same individuals (regions) are present : problem of encoding

In [6]:
print(np.where(portugal_2013['Regions'] != portugal_2009['Regions']))
print(np.where(portugal_2009['Regions'] != portugal_2017['Regions']))
print(np.where(portugal_2013['Regions'] != portugal_2017['Regions']))

(array([ 61,  63,  69,  86,  96, 120, 126, 128, 129, 131, 147, 160, 204,
       223, 233, 261, 273, 282, 285, 286, 292, 298], dtype=int64),)
(array([ 63,  69, 120, 126, 128, 129, 147, 204, 233, 273, 282, 285, 286,
       292], dtype=int64),)
(array([ 61,  63,  69,  86,  96, 120, 126, 128, 129, 131, 147, 160, 204,
       223, 233, 261, 273, 282, 285, 286, 292, 298], dtype=int64),)


In [7]:
print(portugal_2009["Regions"][[61, 126, 292]])
print(portugal_2013["Regions"][[61, 126, 292]])
print(portugal_2017["Regions"][[61, 126, 292]])

61     Freixo de Espada _ Cinta
126                       T÷bua
292           Sáo Roque do Pico
Name: Regions, dtype: object
61     Freixo de Espada ┼í Cinta
126                       T╦Ábua
292          SΓÇáo Roque do Pico
Name: Regions, dtype: object
61     Freixo de Espada _ Cinta
126                       T_bua
292           Sµo Roque do Pico
Name: Regions, dtype: object


In [8]:
print(len(portugal_2009['Regions'].unique()))
print(len(portugal_2013['Regions'].unique()))
print(len(portugal_2017['Regions'].unique()))

308
308
308


This problem seems to be only linked to encoding, we have the right number of unique values.

- drop some columns with too many missing values 
- drop labels not considered
- drop regions identifiants

In [9]:
other_lbls = ['CDS-PP-MPT','CDS-PP-MPT-PPM', 'CDS-PP-PPD/PSD', 'Grupo de cidadaos', 'MPT', 'PAN',
              'PCTP/MRPP', 'PND', 'PNR', 'PPD/PSD-CDS-PP',
              'PPD/PSD-CDS-PP-MPT', 'PPD/PSD-CDS-PP-MPT-PPM', 'PPD/PSD-CDS-PP-PPM',
              'PPD/PSD-CDS-PP-PPM-MPT', 'PPD/PSD-MPT', 'PPD/PSD-MPT-PPM',
              'PPD/PSD-PPM', 'PPD/PSD-PPM-MPT', 'PPM', 'PPM-PPV', 'PPM-PPV-PND',
                'PPV', 'PS-BE-PND-MPT-PTP-PAN', 'PS-PTP-PND-BE', 'PTP']

In [10]:
portugal_2009_new = portugal_2009.drop(other_lbls, axis=1)
portugal_2013_new = portugal_2013.drop(other_lbls, axis=1)
portugal_2017_new = portugal_2017.drop(other_lbls, axis=1)

In [11]:
portugal_2009_new.describe()

Unnamed: 0,Votes Total,BE,CDS-PP,PCP-PEV,PPD/PSD,PS,Residents Total,Residents 0-14,%Residents 0-14,Residents 15-64,...,%Employees per Sector Administration and Services,Employees per Economic Sector Banks,%Employees per Economic Sector Banks,Employees per Economic Sector Others,%Employees per Economic Sector Others,Companies Dissolved,Unemployment Total,%Unemployment Total,Unemployment <25,%Unemployment <25
count,308.0,308.0,308.0,308.0,308.0,308.0,308.0,308.0,308.0,308.0,...,308.0,308.0,308.0,308.0,308.0,308.0,308.0,308.0,308.0,308.0
mean,17434.159091,542.535714,555.353896,1752.253247,4123.821429,6767.474026,34312.720779,5274.558442,-1.0,22823.016234,...,-1.0,182.233766,-1.0,10.366883,-1.0,97.834416,1710.273052,-1.0,197.460714,-1.0
std,26004.303876,1315.75504,1428.466103,3850.676019,6207.154643,10617.533217,55980.424383,8633.467908,0.0,37492.011162,...,0.0,1161.176604,0.0,29.522229,0.0,399.85729,3185.714859,0.0,325.114378,0.0
min,267.0,0.0,0.0,0.0,0.0,0.0,417.0,45.0,-1.0,289.0,...,-1.0,0.0,-1.0,0.0,-1.0,0.0,0.0,-1.0,0.0,-1.0
25%,4666.75,0.0,0.0,99.0,173.5,1770.0,7382.75,870.0,-1.0,4291.5,...,-1.0,14.0,-1.0,0.0,-1.0,6.0,241.0,-1.0,35.575,-1.0
50%,8631.0,0.0,33.5,401.5,1971.5,3513.5,14763.0,2082.0,-1.0,9463.5,...,-1.0,33.5,-1.0,0.0,-1.0,16.0,538.8,-1.0,75.0,-1.0
75%,18226.25,464.0,486.25,1448.25,4869.0,7352.25,36896.75,5630.25,-1.0,24416.75,...,-1.0,96.5,-1.0,14.0,-1.0,71.5,1719.075,-1.0,217.575,-1.0
max,273158.0,12795.0,18467.0,27949.0,38008.0,123372.0,550466.0,70450.0,-1.0,339542.0,...,-1.0,18607.0,-1.0,440.0,-1.0,6524.0,28491.3,-1.0,2899.5,-1.0


In [12]:
portugal_2009_new[portugal_2009_new==-1].sum(axis=0)

Regions                                                 0
Votes Total                                           0.0
BE                                                    0.0
CDS-PP                                                0.0
PCP-PEV                                               0.0
PPD/PSD                                               0.0
PS                                                    0.0
Ranking                                                 0
Residents Total                                       0.0
Residents 0-14                                        0.0
%Residents 0-14                                      -308
Residents 15-64                                       0.0
%Residents 15-64                                     -308
Residents 65+                                         0.0
%Residents 65+                                       -308
Population without Education                          0.0
%Population without Education                        -308
Population wit

In [13]:
var_to_drop = []
for x in list(portugal_2009.columns):
    if str(x).__contains__("%"):
        var_to_drop.append(x)
var_to_drop

['%Residents 0-14',
 '%Residents 15-64',
 '%Residents 65+',
 '%Population without Education',
 '%Population with High School',
 '%Employees per Sector Agriculture',
 '%Employees per Sector Extraction Industries',
 '%Employees per Sector Transformation Industries',
 '%Employees per Sector Construction',
 '%Employees per Sector Gross and Retails Markets',
 '%Employees per Sector Administration and Services',
 '%Employees per Economic Sector Banks',
 '%Employees per Economic Sector Others',
 '%Unemployment Total',
 '%Unemployment <25']

In [14]:
portugal_2009_new = portugal_2009_new.drop(var_to_drop, axis=1)
portugal_2013_new = portugal_2013_new.drop(var_to_drop, axis=1)
portugal_2017_new = portugal_2017_new.drop(var_to_drop, axis=1)

In [15]:
portugal_2009_last = portugal_2009_new.drop(["Regions", "Ranking"], axis=1)
portugal_2013_last = portugal_2013_new.drop(["Regions", "Ranking"], axis=1)
portugal_2017_last = portugal_2017_new.drop(["Regions", "Ranking"], axis=1)

### adapting the data to a compatible format

Variable ranking to refine (error in the encoding or preferences not correctly ordered)

- a : BE
- b : CDS-PP
- c : PCP-PEV
- d : PPD/PSD
- e : PS

In [16]:
portugal_2009_last = portugal_2009_last.rename(columns = {"BE":"a", "CDS-PP":"b", "PCP-PEV":"c", "PPD/PSD":"d", "PS":"e"})
portugal_2013_last = portugal_2013_last.rename(columns = {"BE":"a", "CDS-PP":"b", "PCP-PEV":"c", "PPD/PSD":"d", "PS":"e"})
portugal_2017_last = portugal_2017_last.rename(columns = {"BE":"a", "CDS-PP":"b", "PCP-PEV":"c", "PPD/PSD":"d", "PS":"e"})

In [17]:
s =  -portugal_2009_last.copy().iloc[0][["a", "b", "c", "d", "e"]].argsort()+5

In [18]:
s

a    5
b    3
c    4
d    1
e    2
Name: 0, dtype: int64

In [19]:
# fct to transform a list of rankings in string
def list_to_str(list_rank):
    return '>'.join([str(elem) for elem in list_rank])

In [24]:
def write_rank(data):
    new_data = data.copy()
    new_data['ranking'] = ""
    for i in range(len(new_data)):
        new_data['ranking'].iloc[i] = [0, 0, 0, 0, 0]
        series_sort =  -data.iloc[i][["a", "b", "c", "d", "e"]].argsort()+5
        for item in ["a", "b", "c", "d", "e"]:
            nb = series_sort[item]
            new_data.iloc[i]['ranking'][nb-1] = item
    new_data['ranking'] = new_data['ranking'].apply(list_to_str)
    return new_data

In [28]:
portugal_2009_ranked = write_rank(portugal_2009_last)
portugal_2009_ranked.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = [0, 0, 0, 0, 0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = [0, 0, 0, 0, 0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = [0, 0, 0, 0, 0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = 

Unnamed: 0,Votes Total,a,b,c,d,e,Residents Total,Residents 0-14,Residents 15-64,Residents 65+,...,Employees per Sector Transformation Industries,Employees per Sector Construction,Employees per Sector Gross and Retails Markets,Employees per Sector Administration and Services,Employees per Economic Sector Banks,Employees per Economic Sector Others,Companies Dissolved,Unemployment Total,Unemployment <25,ranking
0,14683,0,973,289,10403,3018,23297,2717,13572,7008,...,937,930,1005,175,59,0,14,900.5,147.9,d>e>b>c>a
1,10980,256,0,289,6093,4342,16839,2180,11037,3623,...,0,807,1046,346,53,0,17,729.9,85.9,d>e>c>a>b
2,5689,0,0,124,1390,4175,9393,870,5221,3303,...,202,273,323,49,29,0,4,165.7,39.4,e>d>c>b>a
3,11851,0,0,257,2558,8373,19448,2160,11963,5326,...,894,923,969,117,47,0,23,501.7,71.8,e>d>c>b>a
4,6299,0,0,317,2362,3620,9280,1146,5721,2414,...,258,267,324,69,20,0,10,398.6,52.6,e>d>c>b>a


In [29]:
portugal_2013_ranked = write_rank(portugal_2013_last)
portugal_2013_ranked.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = [0, 0, 0, 0, 0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = [0, 0, 0, 0, 0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = [0, 0, 0, 0, 0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = 

Unnamed: 0,Votes Total,a,b,c,d,e,Residents Total,Residents 0-14,Residents 15-64,Residents 65+,...,Employees per Sector Transformation Industries,Employees per Sector Construction,Employees per Sector Gross and Retails Markets,Employees per Sector Administration and Services,Employees per Economic Sector Banks,Employees per Economic Sector Others,Companies Dissolved,Unemployment Total,Unemployment <25,ranking
0,13872,0,1687,478,7837,3870,22284,2469,12831,6984,...,1181,766,920,155,52,0,17,1179.8,201.2,d>e>b>c>a
1,10573,0,0,442,4930,5201,16465,1965,10659,3842,...,378,610,925,351,46,0,16,963.0,98.3,e>d>c>b>a
2,4855,0,0,129,1462,3264,8874,813,4840,3221,...,210,208,313,52,25,0,2,250.6,49.1,e>d>c>b>a
3,11909,0,2221,206,4739,4743,18931,1963,11621,5347,...,735,801,985,220,41,0,19,727.2,124.2,e>d>b>c>a
4,6090,0,131,272,2308,3379,9039,1093,5488,2458,...,288,238,295,172,19,0,9,613.9,102.9,e>d>c>b>a


In [30]:
portugal_2017_ranked = write_rank(portugal_2017_last)
portugal_2017_ranked.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = [0, 0, 0, 0, 0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = [0, 0, 0, 0, 0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = [0, 0, 0, 0, 0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['ranking'].iloc[i] = 

Unnamed: 0,Votes Total,a,b,c,d,e,Residents Total,Residents 0-14,Residents 15-64,Residents 65+,...,Employees per Sector Transformation Industries,Employees per Sector Construction,Employees per Sector Gross and Retails Markets,Employees per Sector Administration and Services,Employees per Economic Sector Banks,Employees per Economic Sector Others,Companies Dissolved,Unemployment Total,Unemployment <25,ranking
0,13550,0,802,551,9674,2523,21473,2209,12370,6894,...,1299,735,926,212,58,0,47,787.3,129.6,d>e>b>c>a
1,10346,0,139,298,4202,5707,16038,1816,10257,3965,...,426,635,919,317,39,0,33,646.3,71.3,e>d>c>b>a
2,4819,0,0,119,0,3052,8479,732,4609,3139,...,268,180,284,59,21,0,10,196.8,30.0,e>c>d>b>a
3,11789,0,605,128,5774,5282,18330,1807,11206,5317,...,849,688,923,238,43,0,34,506.6,60.0,d>e>b>c>a
4,5353,0,0,262,807,4284,8773,1034,5282,2457,...,448,238,300,99,14,0,5,320.3,40.8,e>d>c>b>a


generate new files for label ranking prediction

In [34]:
portugal_2009_end = portugal_2009_ranked.drop(["a", "b", "c", "d", "e"], axis=1)
portugal_2013_end = portugal_2013_ranked.drop(["a", "b", "c", "d", "e"], axis=1)
portugal_2017_end = portugal_2017_ranked.drop(["a", "b", "c", "d", "e"], axis=1)

In [35]:
portugal_2009_end.to_csv("data_new/portugal_2009_end.txt", index=False, sep=",")

In [36]:
portugal_2013_end.to_csv("data_new/portugal_2013_end.txt", index=False, sep=",")

In [37]:
portugal_2017_end.to_csv("data_new/portugal_2017_end.txt", index=False, sep=",")