In [3]:
# Import necessary libraries
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Notebook-wide pandas display options:
#   - max_columns = None removes the cap on how many columns are shown
#   - width = 1000 sets the character width used for text output
pd.options.display.max_columns = None
pd.options.display.width = 1000


In [15]:
# Load your dataset
# The CSV location can be overridden with the CROWDFUNDING_CSV environment
# variable, so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original local path is used unchanged.
DATA_PATH = Path(os.environ.get(
    "CROWDFUNDING_CSV",
    "C:/Users/Ninad/Desktop/Project_RAG/data/crowdfunding.csv",
))
df = pd.read_csv(DATA_PATH)

# Display first few rows (kept as the last expression so the notebook renders it)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [16]:
# Dimensions of the loaded frame as a (rows, columns) tuple
frame_shape = df.shape
print("Dataset shape (rows, columns):", frame_shape)

# Every column label, collected into a plain Python list
column_names = list(df.columns)
print("\nColumns in the dataset:")
print(column_names)

# Per-column non-null counts and dtypes; info() writes its own report
print("\nData info:")
df.info()


Dataset shape (rows, columns): (378661, 15)

Columns in the dataset:
['ID', 'name', 'category', 'main_category', 'currency', 'deadline', 'goal', 'launched', 'pledged', 'state', 'backers', 'country', 'usd pledged', 'usd_pledged_real', 'usd_goal_real']

Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                378661 non-null  int64  
 1   name              378657 non-null  object 
 2   category          378661 non-null  object 
 3   main_category     378661 non-null  object 
 4   currency          378661 non-null  object 
 5   deadline          378661 non-null  object 
 6   goal              378661 non-null  float64
 7   launched          378661 non-null  object 
 8   pledged           378661 non-null  float64
 9   state             378661 non-null  object 
 10  backers           378661 non-null  int64  
 1

In [17]:
# Tally of null entries, column by column
null_counts = df.isna().sum()
print("\nMissing values per column:")
print(null_counts)

# Number of rows flagged by duplicated(), i.e. repeats of an earlier row
duplicate_count = df.duplicated().sum()
print("\nDuplicated rows:", duplicate_count)




Missing values per column:
ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

Duplicated rows: 0


In [18]:
# Summary table produced by describe() with its default arguments
numeric_summary = df.describe()
print("\nDescriptive statistics for numeric columns:")
print(numeric_summary)

# How many distinct main categories exist, and the ten most frequent ones
main_categories = df['main_category']
print("\nUnique main categories:", main_categories.nunique())
print("Top main categories:")
print(main_categories.value_counts().iloc[:10])



Descriptive statistics for numeric columns:
                 ID          goal       pledged        backers   usd pledged  usd_pledged_real  usd_goal_real
count  3.786610e+05  3.786610e+05  3.786610e+05  378661.000000  3.748640e+05      3.786610e+05   3.786610e+05
mean   1.074731e+09  4.908079e+04  9.682979e+03     105.617476  7.036729e+03      9.058924e+03   4.545440e+04
std    6.190862e+08  1.183391e+06  9.563601e+04     907.185035  7.863975e+04      9.097334e+04   1.152950e+06
min    5.971000e+03  1.000000e-02  0.000000e+00       0.000000  0.000000e+00      0.000000e+00   1.000000e-02
25%    5.382635e+08  2.000000e+03  3.000000e+01       2.000000  1.698000e+01      3.100000e+01   2.000000e+03
50%    1.075276e+09  5.200000e+03  6.200000e+02      12.000000  3.947200e+02      6.243300e+02   5.500000e+03
75%    1.610149e+09  1.600000e+04  4.076000e+03      56.000000  3.034090e+03      4.050000e+03   1.550000e+04
max    2.147476e+09  1.000000e+08  2.033899e+07  219382.000000  2.033899e+0

In [19]:
# Number of projects in each state
state_counts = df['state'].value_counts()
print("\nProject state distribution:")
print(state_counts)

# Percentage of ALL rows whose state is 'successful' (case-insensitive).
# Every state is part of the denominator; nothing is filtered out first.
is_successful = df['state'].str.lower().eq('successful')
success_rate = 100 * is_successful.mean()
print(f"\nOverall success rate: {success_rate:.2f}%")



Project state distribution:
state
failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: count, dtype: int64

Overall success rate: 35.38%


In [20]:
# Project counts per country, most frequent first; show the leading five
country_counts = df['country'].value_counts()
print("\nTop 5 countries:")
print(country_counts.iloc[:5])



Top 5 countries:
country
US    292627
GB     33672
CA     14756
AU      7839
DE      4171
Name: count, dtype: int64


In [21]:
# Check date format
print("\nSample launch and deadline values:")
print(df[['launched', 'deadline']].head())

# Convert to datetime to check durations (preview).
# errors='coerce' turns unparseable values into NaT without raising, so count
# the values that were present before parsing but are NaT afterwards and
# report them. Nothing extra is printed when every value parses.
for date_col in ('launched', 'deadline'):
    parsed = pd.to_datetime(df[date_col], errors='coerce')
    n_unparsed = int((parsed.isna() & df[date_col].notna()).sum())
    if n_unparsed:
        print(f"Warning: {n_unparsed} '{date_col}' value(s) could not be parsed and were set to NaT")
    df[date_col] = parsed

# Whole days between launch and deadline; .dt.days drops the sub-day remainder
df['duration_days'] = (df['deadline'] - df['launched']).dt.days

print("\nAverage campaign duration:", df['duration_days'].mean())



Sample launch and deadline values:
              launched    deadline
0  2015-08-11 12:12:28  2015-10-09
1  2017-09-02 04:43:57  2017-11-01
2  2013-01-12 00:20:50  2013-02-26
3  2012-03-17 03:24:11  2012-04-16
4  2015-07-04 08:35:03  2015-08-29

Average campaign duration: 33.48112163650336


In [22]:
# Correlation (Series.corr with its default method) between the amount
# pledged and the number of backers.
# NOTE(review): 'pledged' is presumably in each project's own currency, while
# a 'usd_pledged_real' column also exists. Confirm which one is intended.
pledged_amounts = df['pledged']
backer_counts = df['backers']
correlation = pledged_amounts.corr(backer_counts)
print(f"\nCorrelation between pledged and backers: {correlation:.2f}")



Correlation between pledged and backers: 0.72


In [23]:
# Final recap. Relies on `success_rate` and the 'duration_days' column that
# earlier cells create, so those cells must have run first.
n_rows, n_cols = df.shape
summary_lines = [
    "\n===== DATASET SUMMARY =====",
    f"Rows: {n_rows}, Columns: {n_cols}",
    f"Unique Categories: {df['main_category'].nunique()}",
    f"Unique Countries: {df['country'].nunique()}",
    f"Success Rate: {success_rate:.2f}%",
    f"Average Duration: {df['duration_days'].mean():.1f} days",
]
print("\n".join(summary_lines))



===== DATASET SUMMARY =====
Rows: 378661, Columns: 16
Unique Categories: 15
Unique Countries: 23
Success Rate: 35.38%
Average Duration: 33.5 days
