In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply seaborn's default plot styling
sns.set()
# Let pandas display up to 500 rows before truncating a frame or series
pd.set_option('display.max_rows', 500)

# Reading data

In [2]:
# Open the raw workbook and show which sheets it contains.
# The ExcelFile handle is kept in `markets_list` because later cells parse sheets from it.
RAW_MARKETS_WORKBOOK = '../../data/raw/predictit-closed-markets-list.xlsx'
markets_list = pd.ExcelFile(RAW_MARKETS_WORKBOOK)
print(markets_list.sheet_names)

['Closed Markets']


In [3]:
# Load the "Closed Markets" sheet with default parsing options
# so its raw layout (header position, empty columns) can be inspected
markets = markets_list.parse(sheet_name='Closed Markets')
print(markets.shape)
markets.head()

(2786, 6)


Unnamed: 0,Closed Markets,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,ID,Market,,Ticker,Start Date (ET),Closed Date (ET)
1,4586,How many tweets will @realDonaldTrump post fro...,,RDTS.071118,2018-07-04 08:00:00,2018-07-11 12:10:00.617000
2,3232,Who will be Trump's next Supreme Court nominee?,,SCOTUS2.DJT,2017-05-03 20:00:00,2018-07-10 12:40:07.293000
3,4571,Will Trump's next Supreme Court nominee be a w...,,WOMAN.SCOTUS2.DJT,2018-06-27 20:00:00,2018-07-10 12:36:52.197000
4,4605,"Will @realDonaldTrump or @potus tweet ""Barrett...",,BARR.TRUMPTWEET.070918,2018-07-08 20:00:00,2018-07-10 08:43:42.783000


In [4]:
# How many rows are not null in "Unnamed: 2"?
# Counted with pandas' vectorized notnull().sum() instead of passing the Series
# through Python's builtin sum(), which iterates element by element.
# The resulting count is the same.
markets['Unnamed: 2'].notnull().sum()

0

In [5]:
# Every value in the first column (which holds the IDs in this raw parse)
# must occur exactly once. The comparison is reduced with Series.all() rather
# than Python's builtin all(), and the message explains the failure if a
# duplicate ever appears.
assert markets['Closed Markets'].value_counts().eq(1).all(), \
    'Duplicate values found in the ID column'

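Note:
- The real header is in row 1 of the sheet (0-indexed); row 0 only holds the title "Closed Markets"
- The column `Unnamed: 2` contains only null values
- The values in the ID column are all unique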

In [6]:
# Re-read the sheet using row 1 as the header and the ID column as the index,
# then discard the column that was found to be entirely null
markets = (
    markets_list
    .parse('Closed Markets', header=1, index_col='ID')
    .drop(columns=['Unnamed: 2'])
)
print(markets.shape)
markets.head()

(2785, 4)


Unnamed: 0_level_0,Market,Ticker,Start Date (ET),Closed Date (ET)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4586,How many tweets will @realDonaldTrump post fro...,RDTS.071118,2018-07-04 08:00:00,2018-07-11 12:10:00.617
3232,Who will be Trump's next Supreme Court nominee?,SCOTUS2.DJT,2017-05-03 20:00:00,2018-07-10 12:40:07.293
4571,Will Trump's next Supreme Court nominee be a w...,WOMAN.SCOTUS2.DJT,2018-06-27 20:00:00,2018-07-10 12:36:52.197
4605,"Will @realDonaldTrump or @potus tweet ""Barrett...",BARR.TRUMPTWEET.070918,2018-07-08 20:00:00,2018-07-10 08:43:42.783
4604,"Will @realDonaldTrump or @potus tweet ""Hardima...",HARD.TRUMPTWEET.070918,2018-07-08 20:00:00,2018-07-10 08:42:42.360


In [7]:
# Count missing values per column
markets.isna().sum()

Market              0
Ticker              0
Start Date (ET)     0
Closed Date (ET)    0
dtype: int64

In [8]:
# Keep markets whose title contains "elect" (e.g. "elected", "election"),
# excluding titles that mention "turnout" or "margin".
# NOTE(review): the match is case-sensitive, so a title containing only
# "Election"/"Elect" with a capital E would be skipped — confirm this is intended.
elections = markets.loc[
    lambda frame: frame['Market'].str.contains('elect')
    & ~frame['Market'].str.contains('turnout')
    & ~frame['Market'].str.contains('margin')
]
elections.head()

Unnamed: 0_level_0,Market,Ticker,Start Date (ET),Closed Date (ET)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4558,Who will place 2nd in the 2018 Mexican preside...,2ND.MEX.2018,2018-06-25 20:00:00,2018-07-02 11:51:16.383
3894,Who will be elected president of Mexico in 2018?,MEX.2018,2017-11-16 19:00:00,2018-07-02 11:50:15.087
4546,Will People's Alliance win a majority in Turki...,PA.TURKEY2018,2018-06-21 20:00:00,2018-06-30 08:14:53.873
4545,Will Erdoğan win outright majority in 1st roun...,MAJ.TURKEY2018,2018-06-21 20:00:00,2018-06-26 10:51:46.790
4182,Who will be elected president of Colombia in 2...,COLOMBIA.2018,2018-02-28 19:00:00,2018-06-19 16:24:51.793


In [9]:
# Size of the election subset as (rows, columns)
elections.shape

(211, 4)

In [10]:
# Write the election subset to the cleaned-data folder; the ID index is kept
# as the first CSV column
elections.to_csv(
    path_or_buf='../../data/cleaned/predictit-closed-markets-list.csv',
    index=True,
)

From here, we manually removed non-US elections. The final results are at `../../data/cleaned/predictit-closed-markets-list-usa-only.csv`.