In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px

In [None]:
# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

In [None]:
# Read the unemployment CSV; the path is resolved against the working directory.
data = pd.read_csv(filepath_or_buffer="Unemployment in India.csv")

In [None]:
# Summarise the frame (columns, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data.info()

In [5]:
# Count the missing entries in each column of the freshly loaded frame.
null_counts = data.isnull().sum()
print("Null values in each column:", null_counts, sep="\n")

In [6]:
# Strip leading/trailing whitespace from every column label.
data.columns = data.columns.map(str.strip)

In [None]:
# Same per-column null count, now reported under the cleaned column labels.
nulls_after_rename = data.isnull().sum()
print("Null values after renaming columns:", nulls_after_rename, sep="\n")

In [None]:
# Column labels (whitespace-stripped form) that are inspected for missing values.
columns_to_check = [
    'Region',
    'Date',
    'Frequency',
    'Estimated Unemployment Rate (%)',
    'Estimated Employed',
    'Estimated Labour Participation Rate (%)',
    'Area',
]

In [9]:
# Flag each row that has a missing value in at least one checked column,
# then keep just those rows.
row_has_null = data[columns_to_check].isnull().any(axis=1)
null_rows = data.loc[row_has_null]

In [None]:
# Show the rows that were flagged as containing missing values.
print("Rows with null values:", null_rows, sep="\n")

In [None]:
# Remove, in place, every row that has a missing value in any column.
data.dropna(axis=0, how='any', inplace=True)

In [12]:
# Re-count missing entries per column after the incomplete rows were dropped.
nulls_after_drop = data.isnull().sum()
print("Null values after dropping:", nulls_after_drop, sep="\n")

In [None]:
# Report the dtype pandas assigned to each column.
print("Data types:", data.dtypes, sep="\n")

In [None]:
# Convert 'Date' to datetime format and extract additional time features.
# NOTE(review): the dates are assumed to be day-first (DD-MM-YYYY) strings,
# possibly padded with whitespace — confirm against the CSV. dayfirst=True makes
# that reading explicit instead of depending on pandas' format inference, whose
# ambiguity warning is hidden by the blanket warnings filter set earlier.
if not pd.api.types.is_datetime64_any_dtype(data['Date']):
    # Guarded so the cell can be re-run once 'Date' is already datetime
    # (the .str accessor is only valid on string data).
    data['Date'] = pd.to_datetime(data['Date'].str.strip(), dayfirst=True)

# Calendar components derived from the parsed dates.
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
# Month-year label formatted as 'MM-YYYY'.
data['MM YYYY'] = data['Date'].dt.strftime('%m-%Y')

In [None]:
# Preview the first five rows, including the newly added date columns.
preview = data.head(n=5)
print(preview)

In [None]:
# Summary statistics as produced by DataFrame.describe().
summary_stats = data.describe()
print("Descriptive statistics:", summary_stats, sep="\n")

In [None]:
# Frequency of each distinct value in the Area, Region and Frequency columns.
print("Area value counts:", data['Area'].value_counts(), sep="\n")
print("Region value counts:", data['Region'].value_counts(), sep="\n")
print("Frequency value counts:", data['Frequency'].value_counts(), sep="\n")

In [18]:
# Strip surrounding whitespace from the Frequency labels so variants that
# differ only by padding collapse into one category.
frequency_clean = data['Frequency'].str.strip()
data['Frequency'] = frequency_clean

In [None]:
# Confirm the effect of the whitespace clean-up on the Frequency categories.
frequency_counts = data['Frequency'].value_counts()
print("Updated Frequency value counts:", frequency_counts, sep="\n")

In [None]:
# List the distinct Region values and how many there are.
regions = data['Region']
print("Unique Regions:", regions.unique(), sep="\n")
print("Number of unique Regions:", regions.nunique())

In [None]:
# Bar chart of the mean 'Estimated Employed' per region, sorted ascending.
plot_emp = data.loc[:, ['Estimated Employed', 'Region']]
data_emp = (
    plot_emp
    .groupby('Region', as_index=False)
    .mean()
    .sort_values(by='Estimated Employed')
)
fig = px.bar(
    data_emp,
    x='Region',
    y='Estimated Employed',
    color='Region',
    title='Average Estimated Employed in Each Region',
    template='plotly',
)
fig.show()

In [None]:
# Per-area means of the three numeric indicators; only the labour
# participation rate is charted below.
area_metrics = [
    'Estimated Unemployment Rate (%)',
    'Estimated Employed',
    'Estimated Labour Participation Rate (%)',
]
area = data.groupby('Area')[area_metrics].mean().reset_index()

fig = px.bar(
    area,
    x='Area',
    y='Estimated Labour Participation Rate (%)',
    color='Area',
    title='Average Estimated Labour Participation Rate (%) by Area',
)
# Order the bars from the highest total to the lowest.
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

In [None]:
# Sunburst of the unemployment rate, nested as Area -> Region.
unemployment = data[['Region', 'Area', 'Estimated Unemployment Rate (%)']]

# px.sunburst adds up `values` over all rows that share a path, so feeding it
# the raw rows would size each sector by a sum of percentages that grows with
# the number of records a region has. Aggregate to the mean rate per
# (Area, Region) pair first so sectors are comparable.
unemployment_mean = unemployment.groupby(['Area', 'Region'], as_index=False).mean()

# `color` must be supplied for `color_continuous_scale` to have any effect;
# without it the requested colour scale is never applied.
figure = px.sunburst(unemployment_mean, path=['Area', 'Region'],
                     values='Estimated Unemployment Rate (%)',
                     color='Estimated Unemployment Rate (%)',
                     width=700, height=600, color_continuous_scale='RdYlGn',
                     title="Unemployment Rate (%) in India")
figure.show()

In [None]:
# Bar chart of the mean unemployment rate per region, sorted ascending.
plot_ump = data.loc[:, ['Estimated Unemployment Rate (%)', 'Region']]
data_unemp = (
    plot_ump
    .groupby('Region', as_index=False)
    .mean()
    .sort_values(by='Estimated Unemployment Rate (%)')
)

fig = px.bar(
    data_unemp,
    x='Region',
    y='Estimated Unemployment Rate (%)',
    color='Region',
    title='Average Unemployment Rate (%) in Each Region',
    template='plotly',
)
fig.show()