## Library imports

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Read Data


In [None]:
# Path of the input CSV, named once so it is easy to spot and change.
csv_path = "/content/india-state-wise-data-analysis.csv"
data = pd.read_csv(filepath_or_buffer=csv_path)

# Show the first 40 rows as the cell output.
data.head(n=40)

## Visualize Data

In [None]:

# Data preprocessing: derive 'State' and 'District' from the combined
# 'State & District' column. Each value is split on "," and, for the 2nd and
# 3rd pieces, the text after the ":" is kept (assumes every piece has a
# "label: value" layout -- TODO confirm against the CSV).
# The vectorised .str accessor replaces the per-row lambda; entries that are
# missing or too short become NaN instead of raising, and .str.strip() trims
# any padding so plot labels and group keys stay clean.
location_parts = data['State & District'].str.split(",")
data['State'] = location_parts.str[1].str.split(":").str[1].str.strip()
data['District'] = location_parts.str[2].str.split(":").str[1].str.strip()

# Top 10 districts by population, with the literacy rate (literate share of
# the population, in %) attached through .assign(). Building a new frame here
# avoids writing a column into the result of nlargest(), which can raise
# pandas' SettingWithCopyWarning.
top_10_districts = data.nlargest(10, 'Population').assign(
    **{'Literacy Rate': lambda top: top['Literate'] / top['Population'] * 100}
)

# Setting up the plotting area: two stacked panels sharing one figure
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 15))

# Plot 1: Total Population
sns.barplot(x='Population', y='District', data=top_10_districts, ax=axes[0], palette="Blues_d")
axes[0].set_title('Top 10 Districts by Population')
axes[0].set_xlabel('Population')
axes[0].set_ylabel('District')

# Plot 2: Literacy Rate
sns.barplot(x='Literacy Rate', y='District', data=top_10_districts, ax=axes[1], palette="Greens_d")
axes[1].set_title('Literacy Rate in Top 10 Districts')
axes[1].set_xlabel('Literacy Rate (%)')
# The y label is left blank on the second panel.
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()


In [None]:
# Sum the district populations within each state.
population_by_state = data.groupby(['State'])['Population'].sum()
state_population = population_by_state.reset_index()

# Order the states from most to least populous so the bars read top-down.
state_population_sorted = state_population.sort_values('Population', ascending=False)

# Horizontal bar chart: one bar per state.
plt.figure(figsize=(10, 10))
ax = sns.barplot(data=state_population_sorted, x='Population', y='State', palette='viridis')
ax.set(
    title='Population Distribution by State',
    xlabel='Population',
    ylabel='State',
)
plt.xticks(rotation=45)
plt.show()


In [None]:
# Question 1: literacy rate per state = literate total / population total, in %.
state_literacy = (
    data.groupby("State")[['Literate', 'Population']]
    .sum()
    .assign(**{'Literacy Rate': lambda totals: totals['Literate'] / totals['Population'] * 100})
    .sort_values('Literacy Rate', ascending=False)
    .reset_index()
)

# Question 2: gender ratio per state = males per 100 females.
state_gender_ratio = (
    data.groupby('State')[['Male', 'Female']]
    .sum()
    .assign(**{'Gender Ratio': lambda totals: totals['Male'] / totals['Female'] * 100})
    .sort_values('Gender Ratio', ascending=False)
    .reset_index()
)

# Two stacked panels; each entry describes one panel as
# (frame, value column, palette, title, x-axis label).
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 15))
panel_specs = [
    (state_literacy, 'Literacy Rate', "coolwarm",
     'Literacy Rate Across Different States', 'Literacy Rate (%)'),
    (state_gender_ratio, 'Gender Ratio', "viridis",
     'Gender Ratio (Number of Males per 100 Females) Across States', 'Gender Ratio'),
]
for ax, (frame, value_column, colours, title, x_label) in zip(axes, panel_specs):
    sns.barplot(x=value_column, y='State', data=frame, ax=ax, palette=colours)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('State')

plt.tight_layout()
plt.show()


In [None]:
# How is the Scheduled Caste (SC) population distributed across states?
# Total the 'SC' column per state and rank the states from largest to smallest.
sc_totals = data.groupby('State')['SC'].sum()
state_sc_population = sc_totals.sort_values(ascending=False).reset_index()

# Keep only the ten largest totals so the chart stays readable.
top_states_by_sc_population = state_sc_population.head(10)

# Horizontal bar chart of the ten selected states.
plt.figure(figsize=(8, 7))
ax = sns.barplot(data=top_states_by_sc_population, x='SC', y='State', palette='magma')
ax.set(
    title='Top 10 States by SC Population Distribution in India',
    xlabel='SC Population',
    ylabel='State',
)
plt.show()

In [None]:
# How do state literacy rates relate to the share of graduate-or-higher education?
# Per-state totals, then the 'Graduate_Education' total as a % of population.
graduate_totals = data.groupby('State')[['Graduate_Education', 'Population']].sum()
state_education = graduate_totals.assign(
    **{'Graduate Education Rate': graduate_totals['Graduate_Education'] / graduate_totals['Population'] * 100}
)

# Pair each state's literacy rate (from the earlier state_literacy frame) with
# its graduate education rate; 'State' is a column on the left frame and the
# index name on the right frame.
literacy_columns = state_literacy[['State', 'Literacy Rate']]
graduate_columns = state_education[['Graduate Education Rate']]
state_comparison = literacy_columns.merge(graduate_columns, on='State')

# Scatter plot: one point per state.
plt.figure(figsize=(6, 4))
ax = sns.scatterplot(
    data=state_comparison,
    x='Literacy Rate',
    y='Graduate Education Rate',
    s=100,
    color="blue",
    alpha=0.6,
)
ax.set(
    title='Comparison of Literacy Rates and Graduate Education Rates Across States',
    xlabel='Literacy Rate (%)',
    ylabel='Graduate Education Rate (%)',
)
ax.grid(True)
plt.show()