In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib.ticker import FuncFormatter
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Load the dataset; the path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('Unemployment in India.csv')

In [None]:
# Print the loaded frame as plain text for a first look.
# NOTE(review): df.head() in the next cell gives a shorter, richly rendered preview.
print(df)

In [None]:
# First five rows of the dataset (all columns)
df.head()

In [None]:
# Last five rows of the dataset (all columns)
df.tail()

In [None]:
# (number of rows, number of columns) of the dataset
df.shape

In [None]:
# Count of missing values in each column
df.isnull().sum()

In [None]:
# Drop every row that has at least one missing value.
# Rebinding the name (instead of inplace=True) keeps the step a plain expression
# and leaves the previously loaded object untouched.
df = df.dropna()

In [None]:
# Re-count missing values per column to confirm the rows with gaps are gone
df.isnull().sum()

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Column names of the dataset (shown before any whitespace clean-up)
df.columns

In [None]:
# Trim leading and trailing whitespace from every column name, then show the frame
# so the cleaned headers can be checked.
df.columns = [column_name.strip() for column_name in df.columns]
df

In [None]:
# Number of rows that are exact duplicates of an earlier row
print(df.duplicated().sum())

In [None]:
# Summary of the dataset: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

In [None]:
# Convert 'Date' to datetimes and add a 'MM YYYY' text label (zero-padded month,
# space, four-digit year) that the later charts use as their x axis.
# NOTE(review): no format/dayfirst is passed, so pandas infers the date layout -
# confirm the inferred layout matches the CSV.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates
df['MM YYYY'] = parsed_dates.dt.strftime('%m %Y')

In [None]:
# Display the frame to check the converted 'Date' column and the new 'MM YYYY' column
df

In [None]:
# Number of rows for each distinct 'Frequency' value
df.value_counts('Frequency')

In [None]:
# Merge the ' Monthly' spelling (leading space) into 'Monthly' so both count as one category
df['Frequency'] = df['Frequency'].replace({' Monthly': 'Monthly'})

In [None]:
# Re-count 'Frequency' values to confirm the ' Monthly' variant has been merged
df.value_counts('Frequency')

In [None]:
# Number of rows for each distinct 'Region' value
df.value_counts('Region')

In [None]:
# Number of distinct 'Region' values
df['Region'].nunique()

In [None]:
# Number of rows for each distinct 'Area' value
df.value_counts('Area')

In [None]:
# Donut chart of how many rows fall under each 'Area' value.
area_count = df['Area'].value_counts()
fig = px.pie(
    area_count,
    names=area_count.index,
    values=area_count.values,
    hole=0.3,
    title='Area Distribution',
)
fig.show()

In [None]:
# Donut chart of how many rows fall under each 'Region' value.
region_count = df['Region'].value_counts()
fig = px.pie(
    region_count,
    names=region_count.index,
    values=region_count.values,
    hole=0.3,
    title='Region Distribution',
)
fig.show()

In [None]:
# Split the dataset by 'Area' so rural and urban rows can be charted separately.
rural = df.loc[df['Area'] == 'Rural']
urban = df.loc[df['Area'] == 'Urban']

In [None]:
# Mean estimated unemployment rate of the rural rows for each 'MM YYYY' label.
# The aggregation is named by the string 'mean': recent pandas versions emit a
# FutureWarning when a NumPy callable such as np.mean is passed as aggfunc.
rural.pivot_table(
    index='MM YYYY',
    values='Estimated Unemployment Rate (%)',
    aggfunc='mean',
)

In [None]:
# Bar chart of the rural unemployment rate per 'MM YYYY' label.
# errorbar=('ci', 0) requests a zero-level confidence interval, i.e. effectively no error bars.
plt.figure(figsize=(14, 7))
sns.barplot(
    data=rural,
    x='MM YYYY',
    y='Estimated Unemployment Rate (%)',
    errorbar=('ci', 0),
    palette='pastel',
).set(
    xlabel='MM YYYY',
    ylabel='Unemployment Rate (%)',
    title="Rural - Unemployment Rate (%)",
)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bar chart of the urban unemployment rate per 'MM YYYY' label.
# errorbar=('ci', 0) requests a zero-level confidence interval, i.e. effectively no error bars.
plt.figure(figsize=(14, 7))
sns.barplot(x='MM YYYY', y='Estimated Unemployment Rate (%)', data=urban, errorbar=('ci', 0))
plt.xlabel('Month-Year')
plt.ylabel('Unemployment Rate (%)')
plt.title("Urban - Unemployment Rate (%)")
# Rotate the tick labels and render explicitly, matching the rural chart.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Sunburst with 'MM YYYY' as the inner ring and 'Area' as the outer ring.
# NOTE(review): sector sizes come from the 'Estimated Unemployment Rate (%)' values;
# if plotly sums them per sector, the sizes are summed percentages - confirm that is intended.
fig = px.sunburst(
    df,
    path=['MM YYYY', 'Area'],
    values='Estimated Unemployment Rate (%)',
    title='Sunburst Plot of Unemployment Rate Comparison',
)
fig.show()

In [None]:
def millions_formatter(x, pos):
    """Format a raw axis value in millions with one decimal place.

    Args:
        x: Tick value in raw units (e.g. 2_500_000).
        pos: Tick position supplied by matplotlib's FuncFormatter; unused.

    Returns:
        The value scaled by 1e-6 and suffixed with 'M', e.g. '2.5M'.
    """
    return f'{x*1e-6:.1f}M'

# Bar chart of estimated employed per 'MM YYYY' label for the rural rows only.
# `rural` is the Area == 'Rural' subset built earlier in the notebook; it must not
# be re-bound to the full frame here, otherwise the chart would mix rural and urban rows.
plt.figure(figsize=(10, 5))
sns.barplot(x='MM YYYY', y='Estimated Employed', data=rural, errorbar=('ci', 0), palette='Set2')
plt.xlabel('Month-Year')
plt.ylabel('Estimated Employed')
plt.xticks(rotation=45)
plt.title("Rural - Estimated Employed")
# Show the y axis in millions instead of raw head-counts.
plt.gca().yaxis.set_major_formatter(FuncFormatter(millions_formatter))
plt.show()

In [None]:
# Bar chart of estimated employed per 'MM YYYY' label for the urban rows.
# This cell only reads `urban`; it deliberately does not reassign `rural`, so the
# rural subset stays available to later cells. `millions_formatter` is the tick
# formatter defined with the rural employed chart above.
plt.figure(figsize=(10, 5))
sns.barplot(x='MM YYYY', y='Estimated Employed', data=urban, errorbar=('ci', 0), palette='Set2')
plt.xlabel('Month-Year')
plt.ylabel('Estimated Employed')
plt.xticks(rotation=45)
plt.title("Urban - Estimated Employed")
# Show the y axis in millions instead of raw head-counts.
plt.gca().yaxis.set_major_formatter(FuncFormatter(millions_formatter))
plt.show()

In [None]:
# Sunburst with 'MM YYYY' as the inner ring and 'Area' as the outer ring,
# sized by the 'Estimated Employed' values.
fig = px.sunburst(
    df,
    path=['MM YYYY', 'Area'],
    values='Estimated Employed',
    title='Sunburst Plot of Employment Rate Comparison',
)
fig.show()

In [None]:
# Bar chart of the rural labour participation rate per 'MM YYYY' label.
# The rural rows are filtered from df right here so the chart shows rural data
# regardless of what the notebook-level name `rural` currently refers to.
plt.figure(figsize=(12, 6))
sns.barplot(
    x='MM YYYY',
    y='Estimated Labour Participation Rate (%)',
    data=df[df['Area'] == 'Rural'],
    errorbar=('ci', 0),
    palette='Set2',
)
plt.xlabel('Month-Year')
plt.ylabel('Labour Participation Rate')
plt.title("Rural - Labour Participation Rate");

In [None]:
# Bar chart of the urban labour participation rate per 'MM YYYY' label.
# errorbar=('ci', 0) requests a zero-level confidence interval, i.e. effectively no error bars.
plt.figure(figsize=(12, 6))
sns.barplot(
    data=urban,
    x='MM YYYY',
    y='Estimated Labour Participation Rate (%)',
    errorbar=('ci', 0),
    palette='Set2',
)
plt.title("Urban - Labour Participation Rate")
plt.xlabel('Month-Year')
plt.ylabel('Labour Participation Rate (%)');

In [None]:
# Scatter of labour participation rate against the 'MM YYYY' label, coloured by 'Area'.
px.scatter(
    df,
    x='MM YYYY',
    y='Estimated Labour Participation Rate (%)',
    color='Area',
)

In [None]:
# Estimated unemployment rate over time.
# The x axis uses the datetime 'Date' column rather than the 'MM YYYY' strings:
# lineplot sorts observations by x, and text labels sort lexically (e.g. '01 2020'
# comes before '05 2019'), which would put the months out of chronological order.
plt.figure(figsize=(12, 6))
sns.lineplot(x='Date', y='Estimated Unemployment Rate (%)', data=df, marker='o')
plt.title('Estimated Unemployment Rate (%) Over Time')
plt.xlabel('Date')
plt.ylabel('Estimated Unemployment Rate (%)')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

In [None]:
# Bars of labour participation rate per 'Date', coloured by 'Date'.
figure = px.bar(
    df,
    x='Date',
    y='Estimated Labour Participation Rate (%)',
    color='Date',
    title='Estimated Labour Participation Rate (%)',
)
figure.show()

In [None]:
# Mean unemployment rate per 'Region', plus the regions at both ends of that ranking.
average_unemployment_rate = (
    df.groupby('Region')['Estimated Unemployment Rate (%)'].mean()
)

# Region labels holding the largest and smallest per-region mean
state_with_highest_unemployment_rate = average_unemployment_rate.idxmax()
state_with_lowest_unemployment_rate = average_unemployment_rate.idxmin()

# The corresponding largest and smallest mean values
highest_unemployment_rate = average_unemployment_rate.max()
lowest_unemployment_rate = average_unemployment_rate.min()

print("State with highest unemployment rate:", state_with_highest_unemployment_rate)
print("Highest unemployment rate:", highest_unemployment_rate)
print("State with lowest unemployment rate:", state_with_lowest_unemployment_rate)
print("Lowest unemployment rate:", lowest_unemployment_rate)

In [None]:
# Per-region mean unemployment rate, tallest bar first.
# set_palette changes seaborn's default colours for this and later plots.
sns.set_palette("Set1")
plt.figure(figsize=(12, 6))
ranked_unemployment = average_unemployment_rate.sort_values(ascending=False)
ranked_unemployment.plot.bar()
plt.xlabel("Region")
plt.ylabel("Average Unemployment Rate (%)")
plt.title("Average Unemployment Rate by State")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Bars of estimated employed per 'Date', coloured by 'Date'.
figure = px.bar(
    df,
    x='Date',
    y='Estimated Employed',
    color='Date',
    title='Estimated Employed People',
)
figure.show()

In [None]:
# Average, highest and lowest estimated employed by region.
# 'Estimated Employed' is a head-count, not a percentage. The variable names keep
# their "_rate" suffix because later cells read them, but the printed labels
# describe the quantity as an average number of employed people.
average_employment_rate = df.groupby('Region')['Estimated Employed'].mean()

# Region with the highest average estimated employed
state_with_highest_employment_rate = average_employment_rate.idxmax()
highest_employment_rate = average_employment_rate.max()

# Region with the lowest average estimated employed
state_with_lowest_employment_rate = average_employment_rate.idxmin()
lowest_employment_rate = average_employment_rate.min()

print("State with highest average estimated employed:", state_with_highest_employment_rate)
print("Highest average estimated employed:", highest_employment_rate)
print("State with lowest average estimated employed:", state_with_lowest_employment_rate)
print("Lowest average estimated employed:", lowest_employment_rate)

In [None]:
# Per-region average of 'Estimated Employed', tallest bar first.
# The values are head-counts, so the axis is labelled as a count and formatted in
# millions with `millions_formatter` (defined with the rural employed chart above).
# This cell does not reassign `rural`; that name stays the rural subset.
sns.set_palette('Set2')
plt.figure(figsize=(10, 5))
average_employment_rate.sort_values(ascending=False).plot(kind='bar')
plt.xlabel('Region')
plt.ylabel('Average Estimated Employed')
plt.xticks(rotation=90)
plt.title("Average Estimated Employed by State")
plt.gca().yaxis.set_major_formatter(FuncFormatter(millions_formatter))
plt.show()

In [None]:
# Mean labour participation rate per 'Region', plus the regions at both ends of that ranking.
average_labour_participation_rate = (
    df.groupby('Region')['Estimated Labour Participation Rate (%)'].mean()
)

# Region labels holding the largest and smallest per-region mean
state_with_highest_labour_participation_rate = average_labour_participation_rate.idxmax()
state_with_lowest_labour_participation_rate = average_labour_participation_rate.idxmin()

# The corresponding largest and smallest mean values
highest_labour_participation_rate = average_labour_participation_rate.max()
lowest_labour_participation_rate = average_labour_participation_rate.min()

print("State with highest labour participation rate:", state_with_highest_labour_participation_rate)
print("Highest labour participation rate:", highest_labour_participation_rate)
print("State with lowest labour participation rate:", state_with_lowest_labour_participation_rate)
print("Lowest labour participation rate:", lowest_labour_participation_rate)

In [None]:
# Per-region mean labour participation rate, tallest bar first.
# set_palette changes seaborn's default colours for this and later plots.
sns.set_palette("viridis")
plt.figure(figsize=(12, 6))
ranked_participation = average_labour_participation_rate.sort_values(ascending=False)
ranked_participation.plot.bar()
plt.xlabel("Region")
plt.ylabel("Average labour participation Rate (%)")
plt.title("Average labour participation Rate by State")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Sunburst with 'Area' as the inner ring and 'Region' as the outer ring,
# sized by the 'Estimated Unemployment Rate (%)' values.
fig = px.sunburst(
    df,
    path=['Area', 'Region'],
    values='Estimated Unemployment Rate (%)',
    title='Sunburst Plot of Estimated Unemployment Rate by Region and Area',
)
fig.show()

In [None]:
# Pairwise correlation between the three numeric measures, drawn as an annotated heatmap.
correlation = df[['Estimated Unemployment Rate (%)', 'Estimated Employed', 'Estimated Labour Participation Rate (%)']].corr()
plt.figure(figsize=(10,8))
# annot/fmt let seaborn write each coefficient (two decimals) into its cell and pick
# a text colour that contrasts with the cell, instead of hand-placing fixed white text
# that is hard to read on the light mid-range colours of 'coolwarm'.
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Conclusion:

1. The unemployment rate in rural areas exceeded that in urban areas from May 2019 to May 2020.
2. However, by June 2020, the unemployment rates in both urban and rural areas had nearly equalized. 
3. Notably, in April and May 2020, the unemployment rate surged significantly, coinciding with the onset of the COVID-19 pandemic's economic impact.
4. The rural areas have a higher employed population compared to urban areas. 
5. However, there was a sudden decrease in the employed population during April and May 2020.
6. The labour participation rate is higher in rural areas than in urban areas.
7. However, there was a sudden decrease in the labour participation rate in April 2020.
8. The state with the highest average unemployment rate is Tripura, while the state with the lowest average unemployment rate is Meghalaya.
9. The state with the highest average number of employed people is Uttar Pradesh, while the state with the lowest is Sikkim.
10. The state with the highest average labour participation rate is Tripura, while the state with the lowest average labour participation rate is Uttarakhand.
11. The largest number of people were employed on January 31, 2020.
12. From the given data set, the labour participation rate was high in October and November 2019.