In [None]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install matplotlib
!pip install seaborn

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
sns.set(style="whitegrid")

In [12]:
df = pd.read_csv('World Marriage Dataset.csv')            #Loading csv file
df.head(12)

Unnamed: 0,Sr.No.,Country,AgeGroup,Sex,MaritalStatus,DataProcess,Data Collection (Start Year),Data Collection (End Year),Data Source
0,1,Afghanistan,[15-19],Man,Divorced,Survey,1972,1974,National statistics
1,2,Afghanistan,[20-24],Man,Divorced,Survey,1972,1974,National statistics
2,3,Afghanistan,[25-29],Man,Divorced,Survey,1972,1974,National statistics
3,4,Afghanistan,[30-34],Man,Divorced,Survey,1972,1974,National statistics
4,5,Afghanistan,[35-39],Man,Divorced,Survey,1972,1974,National statistics
5,6,Afghanistan,[40-44],Man,Divorced,Survey,1972,1974,National statistics
6,7,Afghanistan,[45-49],Man,Divorced,Survey,1972,1974,National statistics
7,8,Afghanistan,[50-54],Man,Divorced,Survey,1972,1974,National statistics
8,9,Afghanistan,[55-59],Man,Divorced,Survey,1972,1974,National statistics
9,10,Afghanistan,[60-64],Man,Divorced,Survey,1972,1974,National statistics


In [13]:
df.shape              # Checking the dimentions of the file

(271604, 9)

In [14]:
df.info()             # Checking basic information of the file

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271604 entries, 0 to 271603
Data columns (total 9 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Sr.No.                        271604 non-null  int64 
 1   Country                       271604 non-null  object
 2   AgeGroup                      271604 non-null  object
 3   Sex                           271604 non-null  object
 4   MaritalStatus                 271604 non-null  object
 5   DataProcess                   271604 non-null  object
 6   Data Collection (Start Year)  271604 non-null  int64 
 7   Data Collection (End Year)    271604 non-null  int64 
 8   Data Source                   271604 non-null  object
dtypes: int64(3), object(6)
memory usage: 18.6+ MB


# Country

In [16]:
df['Country'].nunique()                       # No. of unique countries

235

In [17]:
countries = df['Country'].unique()            # Printing Unique countries
print(countries)

['Afghanistan' 'Albania' 'Algeria' 'American Samoa' 'Angola' 'Anguilla'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan'
 'Bolivia (Plurinational State of)' 'Bosnia and Herzegovina' 'Botswana'
 'Brazil' 'British Virgin Islands' 'Brunei Darussalam' 'Bulgaria'
 'Burkina Faso' 'Burundi' 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada'
 'Cayman Islands' 'Central African Republic' 'Chad' 'Channel Islands'
 'Chile' 'China' 'China, Hong Kong SAR' 'China, Macao SAR'
 'China, Taiwan Province of China' 'Colombia' 'Comoros' 'Congo'
 'Cook Islands' 'Costa Rica' "Côte d'Ivoire" 'Croatia' 'Cuba' 'Curaçao'
 'Cyprus' 'Czechia' "Dem. People's Rep. of Korea"
 'Democratic Republic of the Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Ethiopia' 'Faeroe Islands'
 'Falkland Islands (Malv

In [None]:
df['Country'].replace('The former Yugoslav Republic of Macedonia', 'TFYR Macedonia', inplace = True)
df['Country'].replace('Netherlands Antilles', 'Netherlands', inplace = True)
df['Country'].replace("Lao People's Dem. Republic", 'Lao People’s Democratic Republic', inplace = True)
df['Country'].replace("Lao People's Democratic Republic", 'Lao People’s Democratic Republic', inplace = True)
df['Country'].replace("Dem. People's Rep. of Korea", 'Republic of Korea', inplace = True)
# Corrceting same countries with different denotations

In [None]:
df['Country'].nunique()

In [None]:
df['Country'].value_counts().head(10)                # Top 10 Countries

In [None]:
top_country = df['Country'].value_counts().head(15).index          # Saving the top 10 countries for further use in analysis
top_countries = df[df['Country'].isin(top_country)]
top_countries.head()

In [None]:
fig, ax =plt.subplots(figsize= (20,8))
ax = sns.countplot(x = 'Country', hue = 'Sex' ,data = top_countries, order = top_countries['Country'].value_counts().index)
ax.tick_params(axis = 'x', rotation = 60)
ax.tick_params(axis = 'both', labelsize = 15)
ax.set_xlabel('Countries', fontsize = 17)
ax.set_ylabel('Counts', fontsize = 17)
ax.set_title('Top 10 Countries gender ratio')
plt.show()
fig.figure.savefig('one.png')

The above plot shows the gender distribution of the top 10 countries in the data.
It is clear that Men and Women are almost equal in numbers in the survey.

# Age Group

In [None]:
df['AgeGroup'].nunique()                     # Unique values in the column

In [None]:
df['AgeGroup'].unique()                      # Printing unique values in the column

In [None]:
df['AgeStart'] = df['AgeGroup'].str.extract('(\d+)').astype(int)
df.sort_values(by = 'AgeStart', ascending = True, inplace = True)
df.drop('AgeStart', axis= 1, inplace = True)
df.head()                                                               # Arranging Age Groups in ascending order

In [None]:
ax, fig = plt.subplots(figsize = (20,8))
ax = sns.histplot(x= 'AgeGroup', data= df)
ax.tick_params(axis = 'x', rotation = 90)
ax.tick_params(axis = 'both', labelsize = 15)
ax.set_xlabel('Age Groups', fontsize = 17)
ax.set_ylabel('Count', fontsize = 17)
ax.set_title('Distribution of Age Group')
plt.show()
fig.figure.savefig('two.png')

The above graph shows the distribution plot of the age groups in the dataset.
The distribution is slightly left skewed or negatively skewed. 

# Sex

In [None]:
df['Sex'].unique()

In [None]:
fig, ax = plt.subplots(figsize = (10,4))
ax = sns.countplot(x = 'Sex', data = df)
ax.tick_params(axis = 'both', labelsize = 10)
ax.set_xlabel('Gender', fontsize = 12)
ax.set_ylabel('Count', fontsize = 12)
ax.set_title('Distribution between Genders')
ax.bar_label(ax.containers[0])
plt.show()
fig.figure.savefig('three.png')

The above figure is the countplot of the column 'Sex'.
It shows the number of men and women in the dataset.
The number is very close.
So we can say that both the genders are equally represented here.

# Marital Status

In [18]:
df['MaritalStatus'].nunique()

35

In [19]:
df['MaritalStatus'].value_counts()

Widowed                                          57002
Married                                          53955
Divorced                                         51269
Single                                           48472
Separated                                        17766
Consensual union                                 11279
Never married                                    10429
Divorced or Separated                             4886
Living together                                   4536
Not living together                               4335
Married or in consensual union                    2314
Married or married but separated                  1520
Married or Living together                         892
Currently not married                              607
Ever married                                       509
Widowed or divorced                                290
Consensual union, not living together              265
Not in union                                       249
Married, i

In [20]:
marital_mapping = {
    'Married': 'Married',
    'Married or in consensual union': 'Married',
    'Married or Living together': 'Married',
    'Married monogamous': 'Married',
    'Married polygamous': 'Married',
    'Married spouse absent': 'Married',
    'Married spouse present': 'Married',
    'Marriage contract': 'Married',
    'Married gaunna not performed': 'Married',
    'Registred partnership': 'Married',
    'Married, in consensual unions or separated': 'Married',
    'Single': 'Single',
    'Never married': 'Single',
    'Currently not married': 'Single',
    'Not in union': 'Single',
    'Currently not married nor in consensual union': 'Single',
    'Single or in consensual unions': 'Single',
    'Ever married': 'Single',  # Assuming typo for "Never married"
    'Divorced': 'Divorced',
    'Divorced or Separated': 'Divorced',
    'Divorced or Widowed': 'Divorced',
    'Divorced or Separated or Widowed': 'Divorced',
    'Widowed': 'Widowed',
    'Widowed or divorced': 'Widowed',
    'Widowed, divorced or separated': 'Widowed',
    'Widowed or separated': 'Widowed',
    'Separated': 'Separated',
    'Married or married but separated': 'Separated',
    'Separated from consensual union': 'Separated',
    'Separated from marriage': 'Separated',
    'Consensual union': 'Consensual Union',
    'Living together': 'Consensual Union',
    'Not living together': 'Consensual Union',
    'Consensual union, not living together': 'Consensual Union',
    'Visiting partner': 'Consensual Union'
}
df['MaritalStatus'] = df['MaritalStatus'].replace(marital_mapping)
# Correcting some typos and assigning them the correct values

In [21]:
df['MaritalStatus'].value_counts()

Single              60340
Married             57713
Widowed             57363
Divorced            56319
Consensual Union    20519
Separated           19350
Name: MaritalStatus, dtype: int64

In [None]:
marital_trends = df.groupby(['Data Collection (Start Year)', 'MaritalStatus', 'Sex']).size().reset_index(name='Count')

fig, ax = plt.subplots(figsize = (50, 20))
ax = sns.lineplot(x='Data Collection (Start Year)', y='Count', hue='MaritalStatus', data= marital_trends)
ax.set_title('Marital status trends over time', fontsize = 45)
ax.tick_params(axis = 'both', labelsize = 25)
ax.set_xlabel('Start year of data collection', fontsize = 45)
ax.set_ylabel('Count', fontsize = 45)
legend = plt.legend(title='Marital Status', loc='upper left', borderpad=1, labelspacing=1, edgecolor='black')
plt.setp(legend.get_texts(), fontsize= 15) 
plt.show()
fig.figure.savefig('four.png')

The above figure is the lineplot of marital status over the period of time.
Married status remains consistently the most common throughout the years.
Single status also shows a steady presence but is significantly lower than married.
Divorced and Widowed statuses remain relatively stable but at much lower counts.
There may be fluctuations in data collection volumes over time, possibly due to survey frequency or country participation.

In [None]:
gender_trends = df.groupby(['Data Collection (Start Year)', 'MaritalStatus', 'Sex']).size().reset_index(name='Count')

fig, ax = plt.subplots(figsize = (50, 20))
ax = sns.lineplot(x='Data Collection (Start Year)', y='Count', hue='MaritalStatus',style = 'Sex', data=gender_trends)
ax.set_title('Gender-wise Marital Status trends over time', fontsize = 45)
ax.tick_params(axis = 'both', labelsize = 25)
ax.set_xlabel('Start year of data collection', fontsize = 45)
ax.set_ylabel('Count', fontsize = 45)
legend = plt.legend(title='Marital status and gender', loc='upper left', borderpad=1, labelspacing=1, edgecolor='black')
plt.setp(legend.get_texts(), fontsize=15) 
plt.show()
fig.figure.savefig('five.png')

The above figure is the lineplot of marital status and gender over the period of time.
Married status is consistently the most common status for both men and women, with slightly higher counts for women over time.
Single status has similar patterns for both genders, but slightly more prevalent among men.
Widowed status is significantly higher among women, reflecting longer life expectancy or cultural patterns.
Divorced status is fairly balanced between genders with no dramatic differences over time.

In [None]:
fig, ax = plt.subplots(figsize = (40, 15))
sns.countplot(x= 'AgeGroup', hue= 'MaritalStatus', data = df, order = sorted(df['AgeGroup'].unique()))
ax.set_title('Marital status distribution across age groups', fontsize = 35)
ax.set_xlabel('Age Group', fontsize = 35)
ax.set_ylabel('Count', fontsize = 35)
legend = plt.legend(title='Marital Status')
ax.tick_params(axis = 'x' ,rotation=90)
ax.tick_params(axis = 'both', labelsize = 18)
plt.show()
fig.figure.savefig('six.png')

The above figure shows the marital status across the age groups.
Younger Age Groups ([15-19], [20-24]) are Dominated by "Single" status.
Middle Age Groups ([25-39]) : "Married" becomes the most prevalent status.
Older Age Groups (40+): Increase in "Widowed" and "Divorced" statuses.

There's a clear life-cycle pattern:
Young people are predominantly single.
Middle-aged people are mostly married.
Older people have higher proportion of widowed individuals.

# Data Source

In [None]:
df['Data Source'].nunique()

In [None]:
df['Data Source'].value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(15, 6))
ax = sns.countplot(x= 'Data Source', data = df, order = df['Data Source'].value_counts().index)
ax.bar_label(ax.containers[0])
ax.tick_params(axis='x', rotation=60)
ax.set_xlabel('Data Source', fontsize = 13)
ax.set_ylabel('Count', fontsize = 13)
plt.show()
fig.figure.savefig('seven.png')

The above figure is the countplot of Data Source column.
It shows the different data sources the the number of data collected from each source.
It is clear that UNSD is the primary source of information for this data set.
The other sources have significantly less data collected from them.

# Data Process

In [None]:
df['DataProcess'].nunique()

In [None]:
df['DataProcess'].value_counts()

In [None]:
fig, ax = plt.subplots(figsize = (10,5))
ax = sns.countplot(x = 'DataProcess', data = df, order = df['DataProcess'].value_counts().index)
ax.bar_label(ax.containers[0])
ax.tick_params(axis = 'both', labelsize = 10)
ax.set_xlabel('Process through which data was collected', fontsize = 14) 
ax.set_ylabel('Count', fontsize = 14)
ax.set_title('Data Process')
plt.show()
fig.figure.savefig('eight.png')

The above plot is the countplot of Data Processes.
It shows the different processes through which data was collected.
The figure shows that the data was mostly collected by 3 methods -- Census , Estimate and Survey.
Census being the highest.
Significantly less data is collected through the remaining three methods or processes.