### Load and Inspect the Data

In [39]:
import os

import pandas as pd

# Load the dataset.
# The CSV location can be overridden through the SUICIDE_DATA_PATH environment
# variable so the notebook also runs on machines where the original local path
# does not exist; when the variable is unset the original path is used.
DATA_PATH = os.environ.get(
    "SUICIDE_DATA_PATH",
    r"C:\Users\HP\global_suicide_analysis\data\master.csv",
)
suicide_df = pd.read_csv(DATA_PATH)

# Show the first rows as a quick sanity check of the load.
suicide_df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [None]:
# Report the dimensions of the dataset.
n_rows, n_cols = suicide_df.shape
print(f"Rows: {n_rows}, Columns: {n_cols}")

# Print the column dtypes and non-null counts.
suicide_df.info()

# Summary statistics of the numeric columns (shown as the cell output).
suicide_df.describe()

In [24]:
# Count the missing values per column (isna is the same check as isnull).
suicide_df.isna().sum()

country                   0
year                      0
sex                       0
age                       0
suicides_no               0
population                0
suicides/100k pop         0
country-year              0
HDI for year          19456
 gdp_for_year ($)         0
gdp_per_capita ($)        0
generation                0
dtype: int64

In [40]:
# Countries: number of distinct values plus the first ten as a sample.
country_col = suicide_df['country']
print("Countries:", country_col.nunique())
print(country_col.unique()[:10])  # First 10 countries

# Years, sexes and age groups: print every distinct value of each column.
for label, column in (("Years:", 'year'), ("Sex:", 'sex'), ("Age groups:", 'age')):
    print(label, suicide_df[column].unique())


Countries: 101
['Albania' 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia'
 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain']
Years: [1987 1988 1989 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002
 2003 2004 2005 2006 2007 2008 2009 2010 1985 1986 1990 1991 2012 2013
 2014 2015 2011 2016]
Sex: ['male' 'female']
Age groups: ['15-24 years' '35-54 years' '75+ years' '25-34 years' '55-74 years'
 '5-14 years']


### Data Cleaning

Handle missing values, standardize column names, and convert data types (e.g., `year` to datetime if needed).


In [41]:
# Count the missing values per column before cleaning starts.
suicide_df.isna().sum()

country                   0
year                      0
sex                       0
age                       0
suicides_no               0
population                0
suicides/100k pop         0
country-year              0
HDI for year          19456
 gdp_for_year ($)         0
gdp_per_capita ($)        0
generation                0
dtype: int64

In [42]:
# Normalize the column names: trim surrounding whitespace, lower-case, turn
# spaces and slashes into underscores, and remove parentheses.
# regex=False makes every replacement a literal substitution regardless of the
# installed pandas version (older releases defaulted to regex=True, where '('
# and ')' are regex metacharacters).
# '$' and '-' are intentionally left in place: later cells refer to
# 'country-year' and 'gdp_for_year_$' by these exact names.
suicide_df.columns = (
    suicide_df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Print the cleaned names so the following cells can reference them exactly.
print(suicide_df.columns.tolist())


['country', 'year', 'sex', 'age', 'suicides_no', 'population', 'suicides_100k_pop', 'country-year', 'hdi_for_year', 'gdp_for_year_$', 'gdp_per_capita_$', 'generation']


In [43]:
# Drop two columns, addressed by their cleaned (lower-cased) names:
# 'country-year' repeats the country and year columns, and 'hdi_for_year'
# is the only column with missing values in the null check.
suicide_df = suicide_df.drop(['country-year', 'hdi_for_year'], axis=1)

In [44]:
# Display the DataFrame (pandas truncates the middle rows) to confirm that
# 'country-year' and 'hdi_for_year' are no longer present.
suicide_df

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides_100k_pop,gdp_for_year_$,gdp_per_capita_$,generation
0,Albania,1987,male,15-24 years,21,312900,6.71,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,2156624900,796,Boomers
...,...,...,...,...,...,...,...,...,...,...
27815,Uzbekistan,2014,female,35-54 years,107,3620833,2.96,63067077179,2309,Generation X
27816,Uzbekistan,2014,female,75+ years,9,348465,2.58,63067077179,2309,Silent
27817,Uzbekistan,2014,male,5-14 years,60,2762158,2.17,63067077179,2309,Generation Z
27818,Uzbekistan,2014,female,5-14 years,44,2631600,1.67,63067077179,2309,Generation Z


In [45]:
# Ensure year is treated as an integer.
# It is kept as a plain year number; convert it with
# pd.to_datetime(suicide_df['year'], format='%Y') only if a datetime axis is
# needed for time-series plots.
suicide_df['year'] = suicide_df['year'].astype(int)

# Clean the yearly GDP column: remove thousands separators and convert to integer.
# - astype(str) first, so the cell can be re-run after the column has already
#   been converted (the .str accessor raises on a numeric column).
# - regex=False makes the comma replacement a literal substitution.
# - 'int64' is requested explicitly because the values exceed the 32-bit range
#   and a bare `int` can resolve to a 32-bit integer on some platforms.
suicide_df['gdp_for_year_$'] = (
    suicide_df['gdp_for_year_$']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('int64')
)

# GDP per capita only needs an integer cast.
suicide_df['gdp_per_capita_$'] = suicide_df['gdp_per_capita_$'].astype(int)

In [46]:
# Display the DataFrame again to inspect it after the dtype conversions above.
suicide_df

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides_100k_pop,gdp_for_year_$,gdp_per_capita_$,generation
0,Albania,1987,male,15-24 years,21,312900,6.71,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,2156624900,796,Boomers
...,...,...,...,...,...,...,...,...,...,...
27815,Uzbekistan,2014,female,35-54 years,107,3620833,2.96,63067077179,2309,Generation X
27816,Uzbekistan,2014,female,75+ years,9,348465,2.58,63067077179,2309,Silent
27817,Uzbekistan,2014,male,5-14 years,60,2762158,2.17,63067077179,2309,Generation Z
27818,Uzbekistan,2014,female,5-14 years,44,2631600,1.67,63067077179,2309,Generation Z


The line below adds a new column called `suicide_rate` to the DataFrame. It calculates the number of suicides per 100,000 people for each row (each row represents a specific country, year, sex, and age group).

In [47]:
# Suicides per 100,000 people, computed for every row from the raw
# suicide count and the population of that row.
per_capita = suicide_df['suicides_no'] / suicide_df['population']
suicide_df['suicide_rate'] = per_capita * 100000


In [48]:
# Display the DataFrame to check the newly added 'suicide_rate' column.
suicide_df

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides_100k_pop,gdp_for_year_$,gdp_per_capita_$,generation,suicide_rate
0,Albania,1987,male,15-24 years,21,312900,6.71,2156624900,796,Generation X,6.711409
1,Albania,1987,male,35-54 years,16,308000,5.19,2156624900,796,Silent,5.194805
2,Albania,1987,female,15-24 years,14,289700,4.83,2156624900,796,Generation X,4.832585
3,Albania,1987,male,75+ years,1,21800,4.59,2156624900,796,G.I. Generation,4.587156
4,Albania,1987,male,25-34 years,9,274300,3.28,2156624900,796,Boomers,3.281079
...,...,...,...,...,...,...,...,...,...,...,...
27815,Uzbekistan,2014,female,35-54 years,107,3620833,2.96,63067077179,2309,Generation X,2.955121
27816,Uzbekistan,2014,female,75+ years,9,348465,2.58,63067077179,2309,Silent,2.582756
27817,Uzbekistan,2014,male,5-14 years,60,2762158,2.17,63067077179,2309,Generation Z,2.172215
27818,Uzbekistan,2014,female,5-14 years,44,2631600,1.67,63067077179,2309,Generation Z,1.671987
