In [52]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [53]:
# Read data/filtered.csv into a DataFrame and display its row count.
election_data = pd.read_csv(filepath_or_buffer="data/filtered.csv")
election_data.shape[0]

58602

In [54]:
# Keep only the rows whose Result_Sheet_Invalid flag equals False,
# then display how many rows remain.
valid_sheet_mask = election_data["Result_Sheet_Invalid"].eq(False)
election_data = election_data.loc[valid_sheet_mask]
election_data.shape[0]

57927

In [55]:
# Keep only rows that are not flagged as unclear AND that report a positive
# number of accredited voters, then display how many rows remain.
clear_sheet_mask = election_data["Result_Sheet_Unclear"].eq(False)
has_accredited_mask = election_data["Accredited_Voters"].gt(0)
election_data = election_data.loc[clear_sheet_mask & has_accredited_mask]
election_data.shape[0]

56694

In [56]:
# Count the distinct PU-Code values (missing values are not counted).
# Comparing this with the row count shows whether PU-Code is unique per row.
election_data["PU-Code"].nunique(dropna=True)

56694

In [57]:
# Keep only sheets whose Result_Sheet_Corrected flag equals False (this
# notebook treats corrected sheets as tampered with), then display the
# remaining row count.
uncorrected_mask = election_data["Result_Sheet_Corrected"].eq(False)
election_data = election_data.loc[uncorrected_mask]
election_data.shape[0]

48694

In [58]:
"""
    Sampling Strategy:
        For each party:
            For each state:
                Sample 60% of results where the party won
                    Ensuring stratification across 10 bins
            
        For any state where the number of sampled votes < 999:
            add more samples to make it up to 999 
"""
"""
    Reason: The parties had 'strongholds' in different states. We don't want the models overfitting to that in any way.
"""

"\n    Reason: The parties had 'strongholds' in different states. We don't want the models overfitting to that in any way.\n"

In [59]:
# Apply the function for each party
def apply_sampling_for_all_parties(df, state_column):
    """Run the per-party sampling and stack the results into one frame.

    Calls ``stratified_sampling_with_wins`` once for each of APC, LP, PDP and
    NNPP (in that order) with the fixed settings ``n_bins=10``,
    ``bin_samples=250``, ``win_samples=85`` and ``random_state=42``, then
    concatenates whatever each call returns.

    NOTE(review): ``stratified_sampling_with_wins`` is not defined in the
    visible part of this notebook -- confirm it exists before calling this.

    Args:
        df: Passed through unchanged as the sampler's ``df`` argument.
        state_column: Passed through as the sampler's ``state_column``.

    Returns:
        The ``pd.concat`` of the four per-party results.
    """
    sampler_settings = dict(
        n_bins=10,
        bin_samples=250,
        win_samples=85,
        random_state=42,
    )
    per_party_samples = [
        stratified_sampling_with_wins(
            df=df,
            party_column=party_name,
            state_column=state_column,
            **sampler_settings,
        )
        for party_name in ('APC', 'LP', 'PDP', 'NNPP')
    ]
    return pd.concat(per_party_samples)

# Example usage
# df = pd.read_csv('election_results.csv')
# sampled_df = apply_sampling_for_all_parties(df, state_column='State')
# sampled_df.to_csv('sampled_results.csv', index=False)

In [60]:
# Number of rows left once fully duplicated rows are dropped.
# NOTE(review): this value is not read by any of the visible cells.
unique_row_count = len(election_data.drop_duplicates())

In [None]:
import pandas as pd
import numpy as np

# Load the dataset.
# NOTE(review): this re-reads the CSV from disk, so the row filters applied to
# `election_data` in the earlier cells are NOT reflected in `df` -- confirm
# that splitting the unfiltered data is intended.
df = pd.read_csv(filepath_or_buffer="data/filtered.csv")

# Add a 'Winner' column to identify the party with the highest votes
def get_winner(row):
    """Return the leading party and its vote count for one results row.

    Only the APC, LP and PDP columns are compared (NNPP is currently left
    out; extend the list to include other parties).

    Args:
        row: A row (pandas Series) containing the APC, LP and PDP columns.

    Returns:
        A two-element ``pd.Series``: ``[party_name, votes]`` for the party
        with the highest vote count (``idxmax`` picks the first on ties), or
        ``[None, None]`` when the highest count is not greater than zero.
    """
    contenders = ["APC", "LP", "PDP"]
    party_votes = row[contenders]
    top_votes = party_votes.max()

    # A maximum of zero (or a missing value) means nobody won this row.
    if top_votes > 0:
        return pd.Series([party_votes.idxmax(), top_votes])
    return pd.Series([None, None])

# Label every row with its winning party and that party's vote count.
winner_columns = df.apply(get_winner, axis=1)
df[["Winner", "Winner_Votes"]] = winner_columns

print(len(df))

# Remove rows where no party won
df = df[df["Winner"].notna()]
# Split every (winning party, state) group roughly 60/20/20 into
# train/validation/test, stratified on quantile bins of the winner's votes.
#
# The per-group pieces are collected in lists and concatenated once after the
# loop: calling pd.concat inside the loop re-copies everything gathered so far
# on every iteration and depends on concatenating with an empty DataFrame.
train_parts = []
val_parts = []
test_parts = []

# NOTE(review): `bins` is not referenced below (the loop bins with pd.qcut);
# it is kept only in case another cell reads it.
bins = [0, 10, 50, 100, 150, 200, 250, 300, 350, 400, 500, 600]

# Errors treated as "this group cannot be split with stratification".
# train_test_split raises ValueError when a group or one of its bins is too
# small; TypeError is included defensively so the best-effort fallback of the
# original code is kept. Anything else (e.g. KeyboardInterrupt) now propagates
# instead of being silently swallowed.
SPLIT_ERRORS = (ValueError, TypeError)

# Running counters
acc_len = 0  # rows seen across all non-empty (party, state) groups
train = 0    # rows assigned to the training split so far

# Groups that could not be split as planned, reported after the loop.
groups_without_train = []
groups_without_val = []

for party in ["APC", "LP", "PDP"]:  # "NNPP" is not included; add other parties as necessary
    for state in df["State"].unique():
        # Filter data for the current state and party
        state_party_data: pd.DataFrame = df[(df["Winner"] == party) & (df["State"] == state)]

        if state_party_data.empty:
            print(state, party, "empty")
            continue  # Skip if there are no results for the current party and state

        # Use at most 6 quantile bins, and never more bins than rows.
        num_bins = min(6, len(state_party_data))
        vote_bins = pd.qcut(
            state_party_data["Winner_Votes"], q=num_bins, duplicates="drop"
        )

        train_df = val_df = test_df = None
        # Explicitly assign bins to the column
        state_party_data = state_party_data.assign(Vote_Bin=vote_bins)
        acc_len += len(state_party_data)

        # Stratified split into training (60%) and a temporary remainder (40%)
        try:
            train_df, temp_df = train_test_split(
                state_party_data,
                test_size=0.4,
                stratify=state_party_data["Vote_Bin"],
                random_state=42
            )
        except SPLIT_ERRORS:
            # Fallback: the whole group becomes the remainder, so it
            # contributes no training rows.
            temp_df = state_party_data
            groups_without_train.append((party, state))

        # Stratified split of temp into validation (20%) and testing (20%)
        try:
            val_df, test_df = train_test_split(
                temp_df,
                test_size=0.5,
                stratify=temp_df["Vote_Bin"],
                random_state=42
            )
        except SPLIT_ERRORS:
            # Fallback: the whole remainder goes to the test split.
            test_df = temp_df
            groups_without_val.append((party, state))

        # Record the pieces for this group
        if train_df is not None:
            train += len(train_df)
            train_parts.append(train_df)
        if val_df is not None:
            val_parts.append(val_df)
        if test_df is not None:
            test_parts.append(test_df)

# Make the fallbacks visible instead of silently swallowing them.
if groups_without_train or groups_without_val:
    print(
        f"Stratified split fallback: {len(groups_without_train)} group(s) contributed no "
        f"training rows, {len(groups_without_val)} group(s) contributed no validation rows."
    )

# Build each split with a single concat (empty DataFrame if nothing was collected)
train_data = pd.concat(train_parts) if train_parts else pd.DataFrame()
val_data = pd.concat(val_parts) if val_parts else pd.DataFrame()
test_data = pd.concat(test_parts) if test_parts else pd.DataFrame()

# Save each subset to a separate CSV file; to_csv does not create missing
# directories, so make sure the output folder exists first.
output_dir = Path("data_splits")
output_dir.mkdir(parents=True, exist_ok=True)
train_data.to_csv(output_dir / "train_data.csv", index=False)
val_data.to_csv(output_dir / "val_data.csv", index=False)
test_data.to_csv(output_dir / "test_data.csv", index=False)



print("Sampling completed. Files saved as 'data_splits/train_data.csv', 'data_splits/val_data.csv', and 'data_splits/test_data.csv'.")


58602
1199
1212
1836
1882
2201
2273
2570
2711
2792
3659
3981
5190
5274
5569
6227
7473
8135
8145
8197
8210
8811
9005
9772
10607
10730
11679
11797
13716
14259
14413
14491
14953
14996
YOBE LP empty
15680
15741
15760
16257
16687
16747
16844
16856
18210
18300
19528
19553
20394
20939
ZAMFARA LP empty
22816
23175
23187
SOKOTO LP empty
23667
23698
23833
24677
24945
25081
25266
25558
25978
26123
26135
26433
27149
27365
27376
28064
28180
28462
28713
29299
29395
29409
29592
29983
30159
30282
30291
30307
30489
30499
30518
31104
31210
31249
31413
31456
31466
31518
31678
32034
33374
33435
33702
Sampling completed. Files saved as 'train_data.csv', 'val_data.csv', and 'test_data.csv'.


In [69]:
len(train_data)

33702

In [70]:
len(test_data)

11442

In [71]:
len(val_data)

11203