In [None]:
import pandas as pd
# Load the two wallet/points CSV exports that the rest of the notebook combines.
wallets_new_370k = pd.read_csv('wallets_with_points_checkpoint.csv')
wallets_old_10k = pd.read_csv('wallets_with_points_10000.csv')

# Inspect both data sets.
# DataFrame.info() writes its summary directly to stdout and returns None, so it
# must not be passed to print(): doing so shows the summary *before* the label
# and then prints a stray "None". Print the label first, then call info().
print('New Wallets: ')
wallets_new_370k.info()
print('\nOld Wallets: ')
wallets_old_10k.info()

In [None]:
# Rows whose 'Points' value is missing, extracted separately for each data set
missing_points_mask_370k = wallets_new_370k['Points'].isnull()
missing_points_mask_10k = wallets_old_10k['Points'].isnull()

null_wallets_370 = wallets_new_370k[missing_points_mask_370k]
null_wallets_10k = wallets_old_10k[missing_points_mask_10k]

# Show both subsets, each preceded by its label
for null_label, null_frame in (
    ('Null Values 370k: \n', null_wallets_370),
    ('Null Values 10k: \n', null_wallets_10k),
):
    print(null_label, null_frame)

In [None]:
# Build the list of wallets with null points so their points can be fetched from the AWS API.
# Only the 'Wallet' column is kept, and it is renamed to lowercase 'wallet'
# because the API script needs that column name to work properly.
wallet_column_only = null_wallets_370[['Wallet']]
null_points_wallets = wallet_column_only.rename(columns={'Wallet': 'wallet'})

# Enable the export below only when null values actually exist:
# null_points_wallets.to_csv("null_points_wallets.csv", index=False)

# Header line, then the full null rows, then the wallet-only frame
print("Wallets with null points:", null_wallets_370, null_points_wallets, sep="\n")

In [None]:
# Read the data set that supplies the missing 'Points' values for wallets_new_370k
null_wallets_points = pd.read_csv('null_filled.csv')
print('\nPoints to fill: \n', null_wallets_points)

# Any rows listed here still have no 'Points' value in the fill data itself
fill_rows_still_null = null_wallets_points[null_wallets_points['Points'].isnull()]
print('Null values check: ', fill_rows_still_null)

In [None]:
# Left-join the fill data onto the main frame by 'Wallet'.
# Overlapping column names coming from the fill data receive the '_new' suffix,
# so its 'Points' column arrives as 'Points_new' next to the original 'Points'.
merged_df = wallets_new_370k.merge(
    null_wallets_points,
    how='left',
    on='Wallet',
    suffixes=('', '_new'),
)

print(merged_df)

In [None]:
# Take 'Points' where present, otherwise fall back to the value in 'Points_new'
filled_points = merged_df['Points'].fillna(merged_df['Points_new'])

# Store the filled column and discard the helper column in one pass
merged_df = merged_df.assign(Points=filled_points).drop(columns=['Points_new'])

# From here on the main frame is the filled version
wallets_new_370k = merged_df

# Show the result
print("Updated DataFrame:")
print(wallets_new_370k)

# Rows listed here still have no 'Points' value after the fill
remaining_null_rows = wallets_new_370k[wallets_new_370k['Points'].isnull()]
print('\nNull Values Check: \n', remaining_null_rows)

In [None]:
# Save the updated DataFrame if needed
# merged_df.to_csv("updated_wallets_new_370k.csv", index=False)

### NEXT STEP - Concat datasets

In [None]:
# Distinct wallet identifiers present in each data set
old_wallets, new_wallets = (
    set(wallets_old_10k['Wallet']),
    set(wallets_new_370k['Wallet']),
)

# Wallets that appear in both data sets
common_wallets = old_wallets & new_wallets
print(f"Number of common wallets: {len(common_wallets)}")

In [None]:
# Stack the NEW frame on top of the OLD one, remove fully identical rows,
# and renumber the index from zero
stacked_wallets = pd.concat([wallets_new_370k, wallets_old_10k])
unique_rows = stacked_wallets.drop_duplicates()
joined_data = unique_rows.reset_index(drop=True)

print(joined_data)

# Rows of the combined frame that have no 'Points' value
joined_null_rows = joined_data[joined_data['Points'].isnull()]
print('\nNull Values Check: \n', joined_null_rows)

In [None]:
# Collapse to one row per wallet, keeping the highest 'Points' value
# (handles wallets that are duplicated with different points)
max_points_per_wallet = joined_data.groupby('Wallet')['Points'].max()
grouped_data = max_points_per_wallet.reset_index()

print(grouped_data)

unique_wallet_count = grouped_data['Wallet'].nunique()
print("\nUnique wallets in grouped_data:", unique_wallet_count)

# Rows of the grouped frame that have no 'Points' value
grouped_null_rows = grouped_data[grouped_data['Points'].isnull()]
print('\nNull Values Check: \n', grouped_null_rows)

In [8]:
# Write the final table (distinct wallets with their points) to CSV, without the index column
final_output_csv = "updated_wallets_new_370k.csv"
grouped_data.to_csv(final_output_csv, index=False)