In [None]:
# Importing the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import calendar
import plotly.express as px

import warnings
# Suppress every warning for the rest of the session so library notices do not
# clutter the cell outputs.
# NOTE(review): this blanket filter also hides warnings that can matter for
# correctness (e.g. parsing or deprecation notices) - consider narrowing it.
warnings.filterwarnings("ignore")

**Data Loading, Exploration & Wrangling**

In [None]:
# Location of the raw CSV. The path sits under /content/drive (presumably a
# mounted Google Drive in Colab) - edit it here to run the notebook elsewhere.
DATA_PATH = '/content/drive/MyDrive/cognorise/Data analytics/1_Unemployment in India.csv'

# Read the raw data (pandas is already imported as `pd` in the first cell).
df = pd.read_csv(DATA_PATH)

# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Count the missing values in each column of the raw data
df.isna().sum()

In [None]:
# Show the dtype pandas assigned to each column
df.dtypes

In [None]:
# Check the column labels as read from the CSV (before they are renamed below)

df.columns

In [None]:
# Renaming the columns for easier access.
# Header labels are stripped of surrounding whitespace first, so the mapping
# below does not depend on the exact leading spaces in the CSV header
# (DataFrame.rename silently skips keys that do not match any column).
# After stripping, 'Date', 'Frequency' and 'Area' already have their final names.

column_aliases = {
    'Region': 'States',
    'Estimated Unemployment Rate (%)': 'Est_Unemp_Rate',
    'Estimated Employed': 'Est_Emp',
    'Estimated Labour Participation Rate (%)': 'Est_Labour_Rate',
}

df = (
    df.rename(columns=lambda label: label.strip())
      .rename(columns=column_aliases)
      .reset_index(drop=True)
)


In [None]:
# Express the estimated employed count in millions, rounded to two decimals,
# so that tables and chart axes are easier to read.
# NOTE: this overwrites the column, so re-running the cell rescales it again.

df['Est_Emp'] = df['Est_Emp'].div(1_000_000).round(2)

df.head(2)

In [None]:
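# Not run here: 'Frequency' is dropped further below, together with the dtype conversions.
# (Avoid `df = df.drop(['Frequency'], axis=1, inplace=True)`: with inplace=True, drop returns None.)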

In [None]:
# Detailed information about the dataset: column names, non-null counts and dtypes

df.info()

In [None]:
# Re-check the column dtypes now that the columns have been renamed
df.dtypes

In [None]:
# Rows in which every field is missing carry no information. Filling them with
# column modes would invent whole observations (state, date, area), so they are
# removed before any imputation takes place.
df = df.dropna(how='all').reset_index(drop=True)

# Fill any remaining gaps in the label-like columns with the column's most
# frequent value.
cols = ['States', 'Date', 'Frequency', 'Area']
for col in cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Replace missing values in the two rate columns with that column's mean.
# fillna with a Series of per-column means fills each column with its own mean.
cols = ['Est_Unemp_Rate', 'Est_Labour_Rate']
df[cols] = df[cols].fillna(df[cols].mean())

In [None]:
# Fill missing employment figures with the most frequent observed value.
# NOTE(review): for a continuous column the mode is a questionable fill value;
# the median may be a better choice - confirm the intent.
most_common_emp = df['Est_Emp'].mode().iloc[0]
df['Est_Emp'] = df['Est_Emp'].fillna(most_common_emp)

In [None]:
# prompt: delete the last 5 rows, which contain only empty values

# df.drop(df.tail(5).index, inplace=True)


In [None]:
# Re-count missing values per column after the imputation steps above
df.isna().sum()

In [None]:
# Percentage of missing entries in each column, rounded to two decimals
# (the mean of the boolean mask is the missing fraction).

df.isna().mean().mul(100).round(2)

In [None]:
# Changing the dtype of the 'Date' column.
# dayfirst=True states the day-month order explicitly instead of leaving it to
# format inference.
# NOTE(review): assumes the CSV stores dates as DD-MM-YYYY - confirm against the raw file.

df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

# Converting the 'Area' column to a categorical data type

df['Area'] = df['Area'].astype('category')

# Dropping the 'Frequency' column, which no later cell uses.
# errors='ignore' keeps this cell re-runnable once the column is already gone.

df = df.drop(columns=['Frequency'], errors='ignore')

In [None]:
# Derive calendar parts from the parsed 'Date' column

date_parts = df['Date'].dt
df['Year'], df['Month'] = date_parts.year, date_parts.month

# Translate each month number into its abbreviated name via calendar.month_abbr
# (index 1 is January's abbreviation, index 12 is December's)

df['Month_Name'] = [calendar.month_abbr[month_number] for month_number in df['Month']]

In [None]:
# Preview the first three rows, including the new Year / Month / Month_Name columns
df.head(3)

In [None]:
# Summarise columns, non-null counts and dtypes after the type conversions
df.info()

**Exploratory Analysis and Visualizations**



In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the three
# numeric measures, rounded to two decimals

numeric_cols = ['Est_Unemp_Rate', 'Est_Emp', 'Est_Labour_Rate']
df_stat = df[numeric_cols]
df_stat.describe().round(2)

* Mean Unemployment Rate: 11.79%

* Median Unemployment Rate: 8.35%

* Standard Deviation of Unemployment Rate: 10.72%

In [None]:
# Average the unemployment rate across all rows that share a date, giving one
# value per reporting date

mean_unemployment_over_time = df.groupby('Date')['Est_Unemp_Rate'].mean()

# Line plot of that average over time, drawn on an explicit Axes object

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    mean_unemployment_over_time.index,
    mean_unemployment_over_time.values,
    marker='+',
    linestyle='--',
    color='red',
)
ax.set_xlabel('Date')
ax.set_ylabel('Estimated Unemployment Rate (%)')
ax.set_title('Unemployment Rate Over Time')
ax.grid(True)

plt.show()

The plot reveals periods of particularly high unemployment rates, exceeding 20%, in May and June of 2020.

In [None]:
# Distribution of the estimated unemployment rate per year (2019 vs 2020), as box plots

plt.figure(figsize=(5, 5))
# plt.style.use changes the global style, so it also affects later matplotlib figures
plt.style.use('ggplot')
ax = sns.boxplot(data=df, x='Year', y='Est_Unemp_Rate', color='green')

font = {'family': 'serif', 'color': 'blue', 'weight': 'normal', 'size': 12}
ax.set_xlabel('Year', fontdict=font)
ax.set_ylabel('Estimated Unemployment Rate (%)', fontdict=font)
ax.set_title('Unemployment Rate in 2019 vs 2020', fontdict=font)
plt.show()

Due to the lockdown imposed in the wake of the Covid-19 pandemic, there was a sudden increase in the Unemployment Rate in 2020.

In [None]:
# Average unemployment rate per state, sorted from lowest to highest, as a bar chart

df_unemp = (
    df.groupby('States')[['Est_Unemp_Rate']]
      .mean()
      .reset_index()
      .sort_values('Est_Unemp_Rate')
)

axis_labels = {'States' : 'States', 'Est_Unemp_Rate' : 'Average Unemployment Rate (%)'}
fig = px.bar(
    df_unemp,
    x='States',
    y='Est_Unemp_Rate',
    color='States',
    labels=axis_labels,
    title='Average Unemployment Rate in each state',
    template='plotly_white',
)
fig.show()


Overall unemployment rate was high for

* Tripura
* Haryana
* Jharkhand
* Bihar

Overall Lowest unemployment rate was for

* Meghalaya
* Odisha
* Assam
* Uttarakhand

In [None]:
# One box per state showing the spread of its unemployment-rate observations

axis_labels = {'States' : 'States', 'Est_Unemp_Rate' : 'Unemployment Rate (%)'}
fig = px.box(
    df,
    x='States',
    y='Est_Unemp_Rate',
    color='States',
    labels=axis_labels,
    title='Unemployment Rate Per States',
    template='plotly',
)

# Order the state categories on the x-axis by their total, largest first
fig.update_xaxes(categoryorder='total descending')
fig.show()

These box plots show extreme variability in the unemployment rate for all the states, especially on one side (skewed).

In [None]:
# Sunburst chart showing the unemployment rate in each area and state.
# px.sunburst sums `values` over all rows in a sector, so feeding it the raw
# monthly rows would size each state by the number of observations it has, not
# by its rate. Aggregate to one mean rate per (Area, State) first.
# observed=True skips Area/State combinations that never occur in the data
# (relevant when 'Area' is a categorical column).

area_state_unemp = (
    df.groupby(['Area', 'States'], observed=True)['Est_Unemp_Rate']
      .mean()
      .reset_index()
)

# Outer ring: mean rate per state within an area.
# Inner ring: the sum of those state means for the area.
fig = px.sunburst(area_state_unemp, path=['Area', 'States'], values='Est_Unemp_Rate',
                  title='Sunburst Plot of Estimated Unemployment Rate by States and Area',
                  height=650, template='ggplot2')
fig.show()


In [None]:
# Distribution of the labour participation rate per year, as violin plots

plt.figure(figsize=(5, 5))
# plt.style.use changes the global style, so it also affects later matplotlib figures
plt.style.use('ggplot')
ax = sns.violinplot(data=df, x='Year', y='Est_Labour_Rate', color='cyan')

font = {'family': 'serif', 'color': 'green', 'weight': 'normal', 'size': 12}
ax.set_xlabel('Year', fontdict=font)
ax.set_ylabel('Labour Rate (%)', fontdict=font)
ax.set_title('Labour Rate 2019 vs 2020', fontdict=font)
plt.show()

Labour Rate shows high variability in 2020 due to the lockdown in the wake of Covid-19.

In [None]:
# Labour participation rate observations per state, coloured by year

plt.figure(figsize=(8, 5))
plt.style.use('ggplot')
# Two palette colours: one for each year present in the 'Year' column
year_colours = ['green', 'red']
ax = sns.scatterplot(data=df, x='Est_Labour_Rate', y='States', hue='Year', palette=year_colours)

font = {'family': 'serif', 'color': 'black', 'weight': 'normal', 'size': 12}
ax.set_xlabel('Labour Rate (%)', fontdict=font)
ax.set_ylabel('States', fontdict=font)
ax.set_title('Labour Rate v/s States', fontdict=font)
plt.show()


The Labour Rate shows a decreasing tendency during 2020 for all the states.

In [None]:
# Average labour participation rate per state, sorted from lowest to highest, as a bar chart

df_lr = (
    df.groupby('States')[['Est_Labour_Rate']]
      .mean()
      .reset_index()
      .sort_values('Est_Labour_Rate')
)

axis_labels = {'States' : 'States', 'Est_Labour_Rate' : 'Labour Rate (%)'}
fig = px.bar(
    df_lr,
    x='States',
    y='Est_Labour_Rate',
    color='States',
    labels=axis_labels,
    title='Average Labour Rate in each States',
    template='gridon',
)
fig.show()

In [None]:
# Share of rows belonging to each Area value, in percent

df['Area'].value_counts(normalize=True).mul(100)

In [None]:
# Summary statistics of the three numeric measures, computed separately per Area.
# The result is transposed so that each Area becomes a column.

measure_cols = ['Est_Unemp_Rate', 'Est_Emp', 'Est_Labour_Rate']
area_stats = df.groupby(['Area'])[measure_cols]
area_stats.describe().round(2).T

The Mean and Median Unemployment rate were higher for Urban areas

In [None]:
# Pie chart of how many rows fall into each Area value

area_counts = df['Area'].value_counts()

plt.figure(figsize=[6, 5])
plt.pie(area_counts, labels=area_counts.index, autopct='%1.1f%%')
plt.title('Percentage share: Rural vs Urban')
plt.show()

This shows the dataset was taken almost equally from both urban and rural areas


In [None]:
# Average unemployment rate per Area, sorted from lowest to highest, as a bar chart

df_ar_unemp = (
    df.groupby('Area')[['Est_Unemp_Rate']]
      .mean()
      .reset_index()
      .sort_values('Est_Unemp_Rate')
)

axis_labels = {'Area' : 'Area', 'Est_Unemp_Rate' : 'Average Unemployment Rate (%)'}
fig = px.bar(
    df_ar_unemp,
    x='Area',
    y='Est_Unemp_Rate',
    color='Area',
    labels=axis_labels,
    title='Average Unemployment Rate in each Area',
    template='gridon',
)
fig.show()

The Mean Unemployment Rate was higher for Urban areas

In [None]:
# Mean unemployment rate in each area for 2019 vs 2020 (grouped bars, no error bars)

plt.figure(figsize = [6,5])
palette = {2019: "black", 2020: "cyan"}  # one fixed colour per year
ax = sns.barplot(
    data=df,
    x='Area',
    y="Est_Unemp_Rate",
    hue='Year',
    estimator='mean',
    errorbar=None,
    palette=palette,
)

font = {'family': 'serif', 'color': 'darkred', 'weight': 'normal', 'size': 12}
ax.set_xlabel('Area', fontdict=font)
ax.set_ylabel('Estimated Unemployment Rate (%)', fontdict=font)
ax.set_title('Average Unemployment Rate: Rural vs Urban', fontdict=font)
plt.show()

Both Rural and Urban areas had higher unemployment rate in 2020 than 2019.

Urban areas had higher unemployment rate than rural areas.

In [None]:
# Pairwise correlation between the numeric measures and the calendar columns,
# drawn as an annotated heatmap

corr_cols = ['Est_Unemp_Rate', 'Est_Emp', 'Est_Labour_Rate', 'Year', 'Month']
df_htmp = df[corr_cols].corr()

plt.figure(figsize=(6, 4))
sns.heatmap(df_htmp, annot=True, fmt='.2f', cmap='plasma')
plt.title('Correlation Heatmap')
plt.show()

When unemployment rates go up, the number of employed people tends to go down, showing an opposite relationship.

The effect of year on unemployment in this dataset is clear from the correlation coefficient.

In [None]:
# Scatter matrix of the unemployment rate, employed count and labour rate,
# with points coloured by state

matrix_dimensions = ['Est_Unemp_Rate', 'Est_Emp', 'Est_Labour_Rate']
fig = px.scatter_matrix(
    df,
    dimensions=matrix_dimensions,
    color='States',
    template='plotly',
)
fig.show()

**Lockdown Impact**

In [None]:
# Compare the same months (April to June, inclusive) across the two years:
# 2019 serves as the pre-lockdown baseline, 2020 as the lockdown period

in_apr_to_jun = df['Month'].between(4, 6)

before_lockdown = df[in_apr_to_jun & (df['Year'] == 2019)]
after_lockdown = df[in_apr_to_jun & (df['Year'] == 2020)]

In [None]:
# Mean unemployment rate per state in each of the two periods

rate_col = 'Est_Unemp_Rate'
bf_lockdown = before_lockdown.groupby('States', as_index=False)[rate_col].mean()
af_lockdown = after_lockdown.groupby('States', as_index=False)[rate_col].mean()

# Join the two per-state means on 'States'. The default inner join keeps only
# states present in both periods; the columns are then relabelled by position.

combined_df = bf_lockdown.merge(af_lockdown, on='States')
combined_df.columns = [
    'States',
    'Unemployment Rate Before Lockdown',
    'Unemployment Rate After Lockdown',
]
combined_df.head()

In [None]:
# Percentage change in unemployment rate between the two periods:
#     (after - before) / before * 100
# The subtraction must be parenthesised: `after - before / before` evaluates the
# division first and yields `after - 1`, which is not a relative change.
# A state whose baseline rate is 0 would produce inf/NaN here.

rate_before = combined_df['Unemployment Rate Before Lockdown']
rate_after = combined_df['Unemployment Rate After Lockdown']

combined_df['Rate Change in Unemployment'] = ((rate_after - rate_before) / rate_before * 100).round(2)

# States ordered from the smallest to the largest percentage change
percent_change_df = combined_df.sort_values('Rate Change in Unemployment')

In [None]:
# Bar chart of the per-state change in unemployment, with bars coloured by the
# size of the change

change_col = 'Rate Change in Unemployment'
fig = px.bar(
    percent_change_df,
    x='States',
    y=change_col,
    color=change_col,
    labels={'States' : 'States', 'Rate Change in Unemployment' : '% Change in Unemployment'},
    title='% Change in Unemployment Rate from 2019 to 2020 for Apr-Jun ',
    color_continuous_scale='viridis',
    width=900,
    height=500,
)
fig.show()

Most impacted States/Union Territories

* Puducherry
* Jharkhand
* Bihar
* Haryana
* Tamil Nadu

**Observations**

>Unemployment Rate

* Mean Unemployment Rate: 11.79%
* Median Unemployment Rate: 8.35%
* Standard Deviation of Unemployment Rate: 10.72%
* Due to the lockdown imposed in the wake of the Covid-19 pandemic, there was a sudden increase in the Unemployment Rate in 2020.
* There were periods of particularly high unemployment rates, exceeding 20%, in May and June of 2020.
* Overall unemployment rate was high for

    * Tripura
    * Haryana
    * Jharkhand
    * Bihar
* Overall Lowest unemployment rate was for

    * Meghalaya
    * Odisha
    * Assam
    * Uttarakhand
* There was extreme variability in the unemployment rate for all the states, especially on one side (skewed).

**Labour Rate**

* Labour Rate shows high variability in 2020 due to the lockdown in the wake of Covid-19.
* The Labour Rate shows a decreasing tendency during 2020 for all the states.

**Rural vs Urban**
* The Mean and Median Unemployment rate were higher for Urban areas
* The dataset was taken almost equally from both urban and rural areas
* The Mean Unemployment Rate was higher for Urban areas
* Both Rural and Urban areas had higher unemployment rate in 2020 than 2019.
* Urban areas had higher unemployment rate than rural areas.

**Miscellaneous**
* When unemployment rates go up, the number of employed people tends to go down, showing an opposite relationship.
* The effect of year on unemployment in this dataset is clear from the correlation coefficient.

* Most impacted States/Union Territories

  *  Puducherry
  *  Jharkhand
  *  Bihar
  *  Haryana
  *  Tamil Nadu