In [73]:
import pandas as pd
import numpy
import matplotlib
import seaborn
import datetime as dt
import warnings
# NOTE(review): with no category/module argument this silences every warning
# raised after this point, including pandas FutureWarning/DeprecationWarning
# messages — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [74]:
# Read the GoFundMe workbook (first sheet, pandas defaults) into a DataFrame.
fund_df = pd.read_excel(io='dataset_gofundme_2.xlsx')

In [75]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
fund_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7052 entries, 0 to 7051
Data columns (total 57 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_to_goal         6553 non-null   float64
 1   balance                7052 non-null   int64  
 2   bene_name              1706 non-null   object 
 3   cat_name               7052 non-null   object 
 4   category_id            7052 non-null   int64  
 5   cdn_pic                6927 non-null   object 
 6   cdn_thumbnail          6958 non-null   float64
 7   charity_id             205 non-null    float64
 8   charity_name           205 non-null    object 
 9   city                   7052 non-null   object 
 10  comment_count_full     7052 non-null   int64  
 11  country                7052 non-null   object 
 12  created_at             7052 non-null   object 
 13  currencycode           7052 non-null   object 
 14  custom_complete        7052 non-null   bool   
 15  dona

In [76]:
# Plain-text preview of the first five rows.
print(fund_df.head(n=5))

   amount_to_goal  balance      bene_name                    cat_name  \
0         66484.0   803516            NaN              Animals & Pets   
1             NaN    69589  Eric Lagstein  Medical, Illness & Healing   
2          8066.0    26934            NaN        Funerals & Memorials   
3         34112.0    15888            NaN              Animals & Pets   
4             NaN    34322            NaN        Funerals & Memorials   

   category_id                          cdn_pic  cdn_thumbnail  charity_id  \
0            3  89979711_1743449371491567_r.png            1.0    142704.0   
1           11  90174593_1744226355687585_r.png            1.0         NaN   
2            9  90213769_1744299031956318_r.png            1.0         NaN   
3            3  90202307_1744243930897535_r.png            1.0         NaN   
4            9   90139729_174404865371667_r.png            1.0         NaN   

     charity_name            city  ...  \
0  happyy cat Inc  Port Jefferson  ...   
1       

In [77]:
# Per-column dtypes (index = column name, value = dtype)
data_types = fund_df.dtypes

# Bucket column names under the string form of their dtype; buckets appear in
# first-seen dtype order and names keep the frame's column order.
dtype_groups = {}
for column_name, column_dtype in data_types.items():
    dtype_groups.setdefault(str(column_dtype), []).append(column_name)

# Print every bucket as a bulleted list
for dtype_label, members in dtype_groups.items():
    print(f"\nColumns with data type '{dtype_label}':")
    print("\n".join(f" - {member}" for member in members))



Columns with data type 'float64':
 - amount_to_goal
 - cdn_thumbnail
 - charity_id
 - donation_count
 - facebook_id
 - geoloc/lat
 - geoloc/lng
 - goal_progress
 - good_to_share
 - has_recent_donations
 - heart_count
 - high_balance_score
 - is_popular_2
 - mediatype
 - popularity_1
 - popularity_2
 - popularity_2_fixed
 - recent_donation_count
 - recently_created
 - social_share_total
 - user_language_id

Columns with data type 'int64':
 - balance
 - category_id
 - comment_count_full
 - donation_count_full
 - goalamount
 - id
 - projecttype
 - realbalance
 - status
 - zip

Columns with data type 'object':
 - bene_name
 - cat_name
 - cdn_pic
 - charity_name
 - city
 - country
 - created_at
 - currencycode
 - funddescription
 - fundname
 - last_donation_at
 - locationtext
 - main_img_url
 - partner_codes/0
 - popularity_updated_at
 - state
 - thumb_img_url
 - timeout_start
 - updated_at
 - url
 - user_language_locale
 - user_language_name
 - username
 - youtube_url

Columns with data t

## Data cleaning


In [78]:
# Media/link and identifier columns removed from the working frame.
columns_to_drop = [
    'main_img_url', 'thumb_img_url', 'cdn_pic', 'url', 'youtube_url',
    'mediatype', 'cdn_thumbnail', 'facebook_id', 'charity_id',
]
# errors='ignore' skips labels that are already gone, so the cell can be re-run.
fund_df = fund_df.drop(columns=columns_to_drop, errors='ignore')

In [79]:
# Replace the '/' in the geolocation column names with '_'
geoloc_renames = {'geoloc/lat': 'geoloc_lat', 'geoloc/lng': 'geoloc_lng'}
fund_df = fund_df.rename(columns=geoloc_renames)

### Data Type Standardization

In [80]:
# Timestamp columns to parse
date_columns = [
    'created_at',
    'last_donation_at',
    'popularity_updated_at',
    'timeout_start',
    'updated_at',
]

# Parse every listed column in one step; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
fund_df = fund_df.assign(**{
    date_col: pd.to_datetime(fund_df[date_col], errors='coerce')
    for date_col in date_columns
})

print(fund_df[date_columns].dtypes)


created_at               datetime64[ns, UTC]
last_donation_at         datetime64[ns, UTC]
popularity_updated_at    datetime64[ns, UTC]
timeout_start            datetime64[ns, UTC]
updated_at               datetime64[ns, UTC]
dtype: object


In [81]:
# Count missing (NaT) entries per date column in one vectorised pass.
# The counts are cast to plain ints so the printed dict reads {'created_at': 0, ...}
# rather than showing NumPy scalar reprs such as np.int64(0).
missing_dates = {
    col: int(n_missing)
    for col, n_missing in fund_df[date_columns].isna().sum().items()
}
print(missing_dates)

{'created_at': np.int64(0), 'last_donation_at': np.int64(0), 'popularity_updated_at': np.int64(0), 'timeout_start': np.int64(0), 'updated_at': np.int64(0)}


In [82]:
# Optional duration features — uncomment if required for analysis.
#fund_df['days_active'] = (fund_df['last_donation_at'] - fund_df['created_at']).dt.days
#fund_df['days_since_last_update'] = (fund_df['updated_at'] - fund_df['created_at']).dt.days

In [None]:
# Flag columns to cast to boolean
bool_columns = ['has_recent_donations', 'good_to_share', 'recently_created', 'is_popular_2']

# Missing flags become 0 first, so the cast maps NaN -> False and any non-zero -> True.
fund_df = (
    fund_df
    .fillna(dict.fromkeys(bool_columns, 0))
    .astype(dict.fromkeys(bool_columns, bool))
)

# Report the resulting dtype and distinct values of each flag
for col in bool_columns:
    print(f"{col} - dtype: {fund_df[col].dtype}, unique: {fund_df[col].unique()}")



has_recent_donations - dtype: bool, unique: [ True False]
good_to_share - dtype: bool, unique: [ True False]
recently_created - dtype: bool, unique: [ True False]
is_popular_2 - dtype: bool, unique: [ True False]


In [84]:
# Store zip as text, left-padded with zeros to at least five characters
fund_df = fund_df.assign(zip=lambda frame: frame['zip'].astype(str).str.zfill(5))

In [85]:
# Add log(1 + x) versions of the count columns for EDA
fund_df = fund_df.assign(**{
    f"{count_col}_log": numpy.log1p(fund_df[count_col])
    for count_col in ('donation_count_full', 'comment_count_full')
})

In [86]:
# Null tally for every column, largest first
null_counts = fund_df.isna().sum()
print("\nMissing values per column:")
print(null_counts.sort_values(ascending=False))

# Rows that are exact copies of an earlier row
print(f"\nNumber of duplicate rows: {fund_df.duplicated().sum()}")


Missing values per column:
social_share_total         7045
partner_codes/0            7026
high_balance_score         7005
charity_name               6847
bene_name                  5346
recent_donation_count      2936
donation_count             1612
amount_to_goal              499
user_language_locale          8
user_language_id              8
user_language_name            8
heart_count                   6
funddescription               2
username                      1
city                          0
comment_count_full            0
cat_name                      0
balance                       0
geoloc_lng                    0
geoloc_lat                    0
fundname                      0
donation_count_full           0
currencycode                  0
custom_complete               0
country                       0
created_at                    0
category_id                   0
is_popular_2                  0
id                            0
goal_progress                 0
has_donation

### Handle Missing Values

In [87]:
# Impute on a copy so `fund_df` keeps its original missing values.
fund_df_cleaned = fund_df.copy()

# Each fill is written back with `df[col] = df[col].fillna(...)`.
# The previous `df[col].fillna(..., inplace=True)` form is chained in-place
# assignment: it acts on a temporary Series, raises a FutureWarning in
# pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write (pandas 3).

# Text columns: fill with a fixed placeholder label.
text_fill_values = {
    'username': "Unknown",
    'funddescription': "No description provided",
    'charity_name': "Independent",
    'partner_codes/0': "None",
    'bene_name': "Unknown",
}
for col, placeholder in text_fill_values.items():
    fund_df_cleaned[col] = fund_df_cleaned[col].fillna(placeholder)

# Numeric columns: fill with the column median.
# NOTE(review): the null report above shows social_share_total and
# high_balance_score are missing in nearly every row, so median imputation
# makes them almost constant — confirm they should be kept.
median_fill_cols = ['heart_count', 'amount_to_goal', 'donation_count', 'recent_donation_count', 'high_balance_score', 'social_share_total']
for col in median_fill_cols:
    median_val = fund_df_cleaned[col].median()
    fund_df_cleaned[col] = fund_df_cleaned[col].fillna(median_val)

# Language columns: fill with the most frequent value (first mode on ties).
mode_fill_cols = ['user_language_id', 'user_language_locale', 'user_language_name']
for col in mode_fill_cols:
    mode_val = fund_df_cleaned[col].mode()[0]
    fund_df_cleaned[col] = fund_df_cleaned[col].fillna(mode_val)

In [88]:
# Remaining nulls per column after imputation, largest first
print(fund_df_cleaned.isna().sum().sort_values(ascending=False))

amount_to_goal             0
balance                    0
bene_name                  0
cat_name                   0
category_id                0
charity_name               0
city                       0
comment_count_full         0
country                    0
created_at                 0
currencycode               0
custom_complete            0
donation_count             0
donation_count_full        0
funddescription            0
fundname                   0
geoloc_lat                 0
geoloc_lng                 0
goal_progress              0
goalamount                 0
good_to_share              0
has_donations              0
has_recent_donations       0
heart_count                0
high_balance_score         0
id                         0
is_popular_2               0
last_donation_at           0
locationtext               0
partner_codes/0            0
popularity_1               0
popularity_2               0
popularity_2_fixed         0
popularity_updated_at      0
projecttype   

In [89]:
# Write the cleaned frame to CSV without the index column.
fund_df_cleaned.to_csv(path_or_buf='cleaned_fundme_data.csv', index=False)