# Importing the required libraries.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# **Dataset-1:** Analysis of budget proposal and allocation for all the states and UTs, from fiscal year 2015-16 to 2022-23.

In [None]:
# Load the first dataset from its CSV file (absolute path on the author's machine)
df1 = pd.read_csv(r"C:\Users\yashg\OneDrive\Desktop\Group2\nhm---july-2022.csv")

In [None]:
# Displaying top 5 rows
df1.head()

# Dropping the columns with NaN values.

In [None]:
# Remove the placeholder columns 'Unnamed: 11' through 'Unnamed: 23', which hold only NULL values
unnamed_columns = [f'Unnamed: {position}' for position in range(11, 24)]
df1 = df1.drop(columns=unnamed_columns)

In [None]:
# Displaying number of rows and columns
df1.shape

In [None]:
# Discard the rows in which every column is null
df1 = df1.dropna(how='all', axis='index')

In [None]:
df1.shape

In [None]:
df1.info()

In [None]:
df1.isnull().sum()

In [None]:
# Null entries per column, expressed as a percentage of the number of rows
df1.isna().sum().mul(100).div(len(df1))

In [None]:
# Displaying all column names:
df1.columns

In [None]:
df1.dtypes

In [None]:
# Treat the literal string 'NA' as a missing value
df1.replace(to_replace='NA', value=np.nan, inplace=True)

# Heatmap of the null mask: one row per record, one column per field
null_mask = df1.isnull()
plt.figure(figsize=(10, 6))
sns.heatmap(null_mask, cbar=False, cmap='viridis')
plt.title('Null Values in Dataset')
plt.show()


# Interpreting the Heatmap:
# In the heatmap:
# Yellow (or light color) cells represent missing (null) values.
# Dark cells represent non-null values.
# The heatmap allows you to quickly identify columns with missing data (yellow cells).
# This visualization helps in understanding the completeness of your dataset and
# deciding how to handle missing values during data preprocessing.

In [None]:
# Histogram (10 bins) of the budget approved for the States/UTs
approved_budget = df1['Budget Approved for the States/UTs  ']
approved_budget.plot(kind='hist', bins=10, title='Budget Approved for the States/UTs  ')

# Hide the top and right borders of the current axes
current_axes = plt.gca()
current_axes.spines[['top', 'right']].set_visible(False)

In [None]:
# Mean approved budget: fiscal years as rows, one column per state/UT
pivot_df = df1.pivot_table(
    values='Budget Approved for the States/UTs  ',
    index='Fiscal Year',
    columns='State_UT',
    aggfunc='mean',
)

plt.figure(figsize=(12, 8))

# Draw one labelled line for every state/UT column
for state in pivot_df.columns:
    state_series = pivot_df[state]
    sns.lineplot(x=state_series.index, y=state_series, label=state)

# Titles and axis labels
plt.title('Budget Approved for States/UTs Over Financial Years')
plt.xlabel('Fiscal Year')
plt.ylabel('Budget Approved (in crore)')
# Tilt the year labels so they stay readable
plt.xticks(rotation=45)
# Keep the legend outside the plotting area
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Tighten the layout so labels do not overlap, then render
plt.tight_layout()
plt.show()

# Line plot to visualize variation of budget approved for States/UTs over financial Year

In [None]:
# One bar chart per state/UT: approved budget for each fiscal year
states = df1['State_UT'].unique()

for state in states:
    # Rows that belong to the current state/UT
    state_df = df1.loc[df1['State_UT'] == state]

    plt.figure(figsize=(10, 6))
    sns.barplot(data=state_df, x='Fiscal Year', y='Budget Approved for the States/UTs  ')

    plt.title(f'Budget Approved Over Fiscal Years for {state}')
    plt.xlabel('Fiscal Year')
    plt.ylabel('Budget Approved (in crore)')
    # Tilt the year labels so they stay readable
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

# Ladakh shows no budget allocation before 2019 because it became a Union Territory only in October 2019.

In [None]:
# Get the list of unique states/UTs in the DataFrame
states = df1['State_UT'].unique()

# Legend label for each of the two budget columns being compared
budget_labels = {
    'Budget Proposed by the States/UTs': 'Proposed',
    'Budget Approved for the States/UTs  ': 'Approved',
}
# Bar colour for each legend label
budget_colors = {'Proposed': 'skyblue', 'Approved': 'orange'}

# Plot one proposed-vs-approved comparison per state
for state in states:
    # Filter the DataFrame for the specific state
    state_df = df1[df1['State_UT'] == state]

    # Reshape to long form (one row per fiscal year and budget type). With
    # `hue`, seaborn places the two bars of a year next to each other; two
    # separate barplot calls would draw them on top of each other instead.
    long_df = state_df.melt(
        id_vars=['Fiscal Year'],
        value_vars=list(budget_labels),
        var_name='Budget Type',
        value_name='Budget',
    )
    long_df['Budget Type'] = long_df['Budget Type'].map(budget_labels)

    # Set up the plot for the current state
    plt.figure(figsize=(10, 6))

    # Side-by-side bars: proposed vs. approved budget for every fiscal year
    sns.barplot(x='Fiscal Year', y='Budget', hue='Budget Type', data=long_df, palette=budget_colors)

    plt.title(f'Comparison of Proposed vs. Approved Budgets for {state}')
    plt.xlabel('Fiscal Year')
    plt.ylabel('Budget (in crore)')
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
    plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Draw a separate line chart of the approved budget for every state/UT
states = df1['State_UT'].unique()

for state in states:
    plt.figure(figsize=(10, 6))

    # Rows of the state/UT currently being plotted
    belongs_to_state = df1['State_UT'].eq(state)
    state_data = df1[belongs_to_state]

    # Approved budget against fiscal year, with a marker on every point
    sns.lineplot(data=state_data, x='Fiscal Year', y='Budget Approved for the States/UTs  ', marker='o')

    plt.title(f'Budget Approved for {state} Over Fiscal Years')
    plt.xlabel('Fiscal Year')
    plt.ylabel('Budget Approved (in crore)')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Approved budget summed over all states/UTs, per fiscal year
total_budget_approved_per_year = (
    df1.groupby('Fiscal Year')['Budget Approved for the States/UTs  ']
    .sum()
)

# Print the yearly totals
print("Total Budget Approved for Each Financial Year:")
print(total_budget_approved_per_year)

# Bar chart of the yearly totals
plt.figure(figsize=(10, 6))
sns.barplot(
    x=total_budget_approved_per_year.index,
    y=total_budget_approved_per_year.values,
    palette='muted',
)
plt.title('Total Budget Approved per Financial Year')
plt.xlabel('Financial Year')
plt.ylabel('Total Budget Approved (in crore)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of a state's opening balance, coloured by the sign of each value
def plot_opening_balance(state_data):
    """Plot the opening balance per fiscal year for a single state/UT.

    Bars with a negative balance are drawn in red, all other bars in green.

    Args:
        state_data: DataFrame holding the rows of one state/UT. The columns
            'Fiscal Year', 'Opening Balance with the States/UTs' and
            'State_UT' are read; the first 'State_UT' value is used in the
            chart title.
    """
    plt.figure(figsize=(8, 6))

    balances = state_data['Opening Balance with the States/UTs']

    # Pick the colour of every bar from the sign of its balance
    bar_colors = []
    for balance in balances:
        bar_colors.append('red' if balance < 0 else 'green')
    plt.bar(state_data['Fiscal Year'], balances, color=bar_colors)

    # Title and axis labels
    plt.title(f'Opening Balance for {state_data["State_UT"].iloc[0]} Over Fiscal Years')
    plt.xlabel('Fiscal Year')
    plt.ylabel('Opening Balance (in crore)')
    plt.xticks(rotation=45)

    plt.show()

# Group the rows by state/UT and draw one opening-balance chart per group;
# groupby yields (state name, rows of that state) pairs.
for state, state_data in df1.groupby('State_UT'):
    plot_opening_balance(state_data)

In [None]:
# Keep only the two columns being compared
data = df1[['Opening Balance with the States/UTs', 'Budget Approved for the States/UTs  ']]

# Pairwise correlation matrix of those two columns
correlation_matrix = data.corr()

# Off-diagonal entry: Opening Balance vs. Approved Budget
correlation_coefficient = correlation_matrix.at[
    'Opening Balance with the States/UTs',
    'Budget Approved for the States/UTs  ',
]

print("Correlation Coefficient between Opening Balance and Approved Budget:")
print(correlation_coefficient)

# Scatter plot of the same two columns
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df1,
    x='Opening Balance with the States/UTs',
    y='Budget Approved for the States/UTs  ',
)
plt.title('Relationship between Opening Balance and Approved Budget')
plt.xlabel('Opening Balance (in crore)')
plt.ylabel('Approved Budget (in crore)')
plt.show()

In [None]:
# List, for every fiscal year, the states/UTs whose approved budget exceeded the budget they proposed.

# Row-wise flag: Approved Budget > Proposed Budget
exceeds_proposal = df1['Budget Approved for the States/UTs  '].gt(df1['Budget Proposed by the States/UTs'])
approved_greater_than_proposed = df1[exceeds_proposal]

# Collect the matching state/UT names of each fiscal year into a list
states_by_year = (
    approved_greater_than_proposed
    .groupby('Fiscal Year')['State_UT']
    .apply(list)
)

# Print one heading, one list and one blank line per fiscal year
for year, states_list in states_by_year.items():
    print(f"For Fiscal Year {year}:")
    print(states_list)
    print()

In [None]:
# Average extent of budget approved against budget proposed, across all states/UTs, per fiscal year

# Mean of the approval-extent column within each fiscal year
average_approval_ratio_by_year = (
    df1.groupby('Fiscal Year')['Extent of Budget Approved Against Budget Proposed ']
    .mean()
)

# Turn the grouped Series into a two-column DataFrame for plotting
average_approval_ratio_df = average_approval_ratio_by_year.reset_index()

# Bar chart: one bar per fiscal year
year_labels = average_approval_ratio_df['Fiscal Year']
mean_ratios = average_approval_ratio_df['Extent of Budget Approved Against Budget Proposed ']
plt.figure(figsize=(10, 6))
plt.bar(year_labels, mean_ratios, color='skyblue')
plt.title('Average Extent of Budget Approved Against Budget Proposed by Fiscal Year')
plt.xlabel('Fiscal Year')
plt.ylabel('Average Approval Ratio (%)')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()

In [None]:
# For every state/UT draw one grouped bar chart with four bars per fiscal year

# (column, legend label) of each bar within a group, in left-to-right order
plotted_series = [
    ('Budget Proposed by the States/UTs', 'Budget Proposed'),
    ('Budget Approved for the States/UTs  ', 'Budget Approved'),
    ("Release of Government of India's Fund ", 'Release of Funds'),
    ("Total Expenditure Reported (Including States' Share)", 'Total Expenditure'),
]

for state in df1['State_UT'].unique():
    state_data = df1[df1['State_UT'] == state]
    fiscal_years = state_data['Fiscal Year'].unique()

    fig, ax = plt.subplots(figsize=(12, 8))

    # Width of a single bar and the base x position of every fiscal year
    bar_width = 0.2
    index = range(len(fiscal_years))

    # Shift each series one bar width further to the right than the previous one
    for slot, (column, legend_label) in enumerate(plotted_series):
        positions = [p + slot * bar_width for p in index]
        ax.bar(positions, state_data[column], width=bar_width, label=legend_label)

    # Labels, title, and ticks centred under each group of four bars
    ax.set_xlabel('Fiscal Year')
    ax.set_ylabel('Amount (in crore)')
    ax.set_title(f'Variation of Parameters Over Fiscal Years - {state}')
    ax.set_xticks([p + 1.5*bar_width for p in index])
    ax.set_xticklabels(fiscal_years)
    ax.legend()

    plt.show()

In [None]:
# Show how the total release of Government of India's fund varies from one fiscal year to the next

# Total fund released in each fiscal year
total_fund_release_by_year = df1.groupby('Fiscal Year')["Release of Government of India's Fund "].sum()

# Two-column DataFrame (fiscal year, total) for plotting
total_fund_release_df = total_fund_release_by_year.reset_index()

# Line chart of the yearly totals
release_years = total_fund_release_df['Fiscal Year']
release_totals = total_fund_release_df["Release of Government of India's Fund "]
plt.figure(figsize=(10, 6))
plt.plot(release_years, release_totals, marker='o', color='green', linestyle='-', linewidth=2)
plt.title("Variation of Government of India's Fund Release Across Fiscal Years")
plt.xlabel('Fiscal Year')
plt.ylabel('Total Fund Release (in crore)')
plt.xticks(rotation=45)
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

In [None]:
# Visualize, for every fiscal year, how the proposed budget is distributed
# across states/UTs: one pie chart per year with the largest proposers.

# Total budget proposed per (fiscal year, state/UT); absent combinations become 0
budget_by_state = df1.groupby(['Fiscal Year', 'State_UT'])['Budget Proposed by the States/UTs'].sum().unstack().fillna(0)

# Set the number of top states/UTs to display in each pie chart
top_n = 10
# Define custom colors for pie chart slices (top_n states plus the optional "Others" slice)
custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf','#DFFF00']

# Iterate over the fiscal years that are actually present in budget_by_state.
# Labels read straight from df1 may include values the groupby discarded
# (for example NaN), and looking those up with .loc would raise KeyError.
for fiscal_year in budget_by_state.index:
    year_data = budget_by_state.loc[fiscal_year]

    # Filter out states with zero budget proposed for the current fiscal year
    year_data = year_data[year_data > 0]
    if year_data.empty:
        # Nothing was proposed in this year, so there is nothing to draw
        continue

    # Sort states by budget proposed (descending order) for better visualization
    year_data_sorted = year_data.sort_values(ascending=False)

    # Get top N states and the remaining states
    top_states = year_data_sorted.head(top_n)
    other_states = year_data_sorted.iloc[top_n:]

    # Summing up budget proposed for other states
    other_budget = other_states.sum()

    # Add an "Others" slice only when some states fall outside the top N;
    # an empty slice would only add a 0.0% label to the chart.
    if other_states.empty:
        pie_data = top_states
    else:
        pie_data = pd.concat([top_states, pd.Series({'Others': other_budget})])

    # Plotting the pie chart with percentages and legend
    plt.figure(figsize=(8, 5))
    plt.pie(pie_data, labels=pie_data.index, autopct='%1.1f%%', startangle=140, colors=custom_colors)
    plt.title(f'States/UTs - Budget Proposed Distribution ({fiscal_year})')
    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle

    # Add legend to the plot
    plt.legend(title='States/UTs', loc='best', bbox_to_anchor=(1, 0.5))

    # Show the pie chart for the current fiscal year
    plt.show()

# Data cleaning:

In [None]:
# In the four amount columns a value of 0 is treated as missing: swap it for NaN
columns_to_replace = [
    'Budget Proposed by the States/UTs',
    'Budget Approved for the States/UTs  ',
    "Release of Government of India's Fund ",
    "Total Expenditure Reported (Including States' Share)",
]
amounts = df1[columns_to_replace]
df1[columns_to_replace] = amounts.replace(0, np.nan)

In [None]:
df1[df1['State_UT'] == 'Ladakh']

In [None]:
# Remove the Ladakh rows whose fiscal-year label sorts before '2020-21'
# (the comparison is a plain string comparison on the 'Fiscal Year' labels)
is_ladakh = df1['State_UT'] == 'Ladakh'
is_early_year = df1['Fiscal Year'] < '2020-21'
mask = is_ladakh & is_early_year

# Drop the matching rows by index label
df1.drop(index=df1.loc[mask].index, inplace=True)

# Renumber the rows from 0 after the removal
df1.reset_index(drop=True, inplace=True)



In [None]:
df1[df1['State_UT'] == 'Ladakh']

In [None]:
df1.isnull().sum()

In [None]:
# Null values in 'Budget Proposed by the States/UTs' are replaced with the mean
# budget proposed by the same state/UT over all of its fiscal years.
# Mean proposed budget per state/UT (Series indexed by 'State_UT')
state_mean_budget = df1.groupby('State_UT')['Budget Proposed by the States/UTs'].mean()

# Row-wise helper: substitute the state-wise mean for a missing proposed budget
def fill_missing_budget(row):
    """Return the row's proposed budget, or its state's mean when it is null.

    Args:
        row: Mapping-like row with the keys 'State_UT' and
            'Budget Proposed by the States/UTs'.

    Returns:
        The existing value when it is not null, otherwise the entry of the
        module-level ``state_mean_budget`` for the row's state/UT.
    """
    state_name = row['State_UT']
    current_value = row['Budget Proposed by the States/UTs']
    return state_mean_budget[state_name] if pd.isnull(current_value) else current_value

# Fill the missing values of 'Budget Proposed by the States/UTs';
# axis=1 makes apply pass one row at a time to fill_missing_budget.
df1['Budget Proposed by the States/UTs'] = df1.apply(fill_missing_budget, axis=1)

In [None]:
df1.isnull().sum()

In [None]:
# Apply the same state-wise mean imputation to the budget approved for the States/UTs
# (the fund-release and total-expenditure columns are handled in the cells that follow).
# Mean approved budget per state/UT (Series indexed by 'State_UT')
state_mean_budget = df1.groupby('State_UT')['Budget Approved for the States/UTs  '].mean()

# Row-wise helper: substitute the state-wise mean for a missing approved budget
def fill_missing_budget(row):
    """Return the row's approved budget, or its state's mean when it is null.

    Args:
        row: Mapping-like row with the keys 'State_UT' and
            'Budget Approved for the States/UTs  '.

    Returns:
        The existing value when it is not null, otherwise the entry of the
        module-level ``state_mean_budget`` for the row's state/UT.
    """
    state_name = row['State_UT']
    approved_value = row['Budget Approved for the States/UTs  ']
    if not pd.isnull(approved_value):
        return approved_value
    return state_mean_budget[state_name]

# Fill the missing values of the budget-approved column;
# axis=1 makes apply pass one row at a time to fill_missing_budget.
df1['Budget Approved for the States/UTs  '] = df1.apply(fill_missing_budget, axis=1)


In [None]:
# Mean Government of India fund release per state/UT (Series indexed by 'State_UT')
state_mean_budget = df1.groupby('State_UT')["Release of Government of India's Fund "].mean()

# Row-wise helper: substitute the state-wise mean for a missing fund release
def fill_missing_budget(row):
    """Return the row's fund release, or its state's mean when it is null.

    Args:
        row: Mapping-like row with the keys 'State_UT' and
            "Release of Government of India's Fund ".

    Returns:
        The existing value when it is not null, otherwise the entry of the
        module-level ``state_mean_budget`` for the row's state/UT.
    """
    state_name = row['State_UT']
    released_amount = row["Release of Government of India's Fund "]
    value_is_missing = pd.isnull(released_amount)
    return state_mean_budget[state_name] if value_is_missing else released_amount

# Fill the missing values of the fund-release column;
# axis=1 makes apply pass one row at a time to fill_missing_budget.
df1["Release of Government of India's Fund "] = df1.apply(fill_missing_budget, axis=1)

In [None]:
# Mean reported total expenditure per state/UT (Series indexed by 'State_UT')
state_mean_budget = df1.groupby('State_UT')["Total Expenditure Reported (Including States' Share)"].mean()

# Row-wise helper: substitute the state-wise mean for a missing total expenditure
def fill_missing_budget(row):
    """Return the row's total expenditure, or its state's mean when it is null.

    Args:
        row: Mapping-like row with the keys 'State_UT' and
            "Total Expenditure Reported (Including States' Share)".

    Returns:
        The existing value when it is not null, otherwise the entry of the
        module-level ``state_mean_budget`` for the row's state/UT.
    """
    state_name = row['State_UT']
    reported_expenditure = row["Total Expenditure Reported (Including States' Share)"]
    if pd.isnull(reported_expenditure):
        reported_expenditure = state_mean_budget[state_name]
    return reported_expenditure

# Fill the missing values of the total-expenditure column;
# axis=1 makes apply pass one row at a time to fill_missing_budget.
df1["Total Expenditure Reported (Including States' Share)"] = df1.apply(fill_missing_budget, axis=1)

In [None]:
df1.isnull().sum()

In [None]:
# Row-wise helper for 'Extent of Budget Approved Against Budget Proposed '
def fill_extent_approved_vs_proposed(row):
    """Return the approved-vs-proposed extent of a row, computing it when null.

    Args:
        row: Mapping-like row holding the proposed budget, the approved
            budget and the existing extent value.

    Returns:
        The existing extent when it is not null. Otherwise
        ``approved / proposed * 100``, or ``None`` when the proposed budget
        equals 0.
    """
    budget_proposed = row['Budget Proposed by the States/UTs']
    budget_approved = row['Budget Approved for the States/UTs  ']
    existing_extent = row['Extent of Budget Approved Against Budget Proposed ']

    if not pd.isnull(existing_extent):
        return existing_extent
    if budget_proposed == 0:
        # The ratio is undefined for a zero proposal
        return None
    return (budget_approved / budget_proposed) * 100

# Row-wise helper for 'Extent of Funds Utilised Against Budget Approved'
def fill_extent_utilised_vs_approved(row):
    """Return the utilised-vs-approved extent of a row, computing it when null.

    Args:
        row: Mapping-like row holding the approved budget, the reported
            total expenditure and the existing extent value.

    Returns:
        The existing extent when it is not null. Otherwise
        ``expenditure / approved * 100``, or ``None`` when the approved
        budget equals 0.
    """
    budget_approved = row['Budget Approved for the States/UTs  ']
    expenditure = row["Total Expenditure Reported (Including States' Share)"]
    recorded = row['Extent of Funds Utilised Against Budget Approved']

    if pd.isnull(recorded):
        # Recompute the percentage unless the denominator is zero
        return (expenditure / budget_approved) * 100 if budget_approved != 0 else None
    return recorded

# Row-wise helper for 'Extent of Funds Utilised Against Budget Proposed '
def fill_extent_utilised_vs_proposed(row):
    """Return the utilised-vs-proposed extent of a row, computing it when null.

    Args:
        row: Mapping-like row holding the proposed budget, the reported
            total expenditure and the existing extent value.

    Returns:
        The existing extent when it is not null. Otherwise
        ``expenditure / proposed * 100``, or ``None`` when the proposed
        budget equals 0.
    """
    budget_proposed = row['Budget Proposed by the States/UTs']
    expenditure = row["Total Expenditure Reported (Including States' Share)"]
    stored_extent = row['Extent of Funds Utilised Against Budget Proposed ']

    extent_is_missing = pd.isnull(stored_extent)
    if extent_is_missing and budget_proposed != 0:
        return (expenditure / budget_proposed) * 100
    if extent_is_missing:
        # Zero proposal: the percentage cannot be computed
        return None
    return stored_extent

# Fill the remaining nulls of each "extent" column with its matching row-wise helper
extent_fillers = {
    'Extent of Budget Approved Against Budget Proposed ': fill_extent_approved_vs_proposed,
    'Extent of Funds Utilised Against Budget Approved': fill_extent_utilised_vs_approved,
    'Extent of Funds Utilised Against Budget Proposed ': fill_extent_utilised_vs_proposed,
}
for extent_column, filler in extent_fillers.items():
    df1[extent_column] = df1.apply(filler, axis=1)

In [None]:
df1.isnull().sum()

In [None]:
# Remove the opening-balance column from df1 in place
column_to_drop = 'Opening Balance with the States/UTs'
df1.drop(columns=column_to_drop, inplace=True)

In [None]:
# Heatmap of the null mask of df1: one row per record, one column per field
remaining_nulls = df1.isnull()
plt.figure(figsize=(10, 6))
sns.heatmap(remaining_nulls, cbar=False, cmap='viridis')
plt.title('Null Values in Dataset')
plt.show()

In [None]:
# Now our data is completely clean

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Work on an independent (deep) copy so that df1 itself stays untouched
df = df1.copy(deep=True)

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
# One-hot encode the categorical columns
categorical_columns = ['State_UT', 'Fiscal Year']

# pd.get_dummies adds the indicator columns; drop_first=True leaves out the first level of each column
df_encoded = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

# Show the encoded DataFrame
print("Encoded DataFrame:")
print(df_encoded)

In [None]:
df_encoded.dtypes

In [None]:
# Performing Standardization:

from sklearn.preprocessing import StandardScaler

# The four amount columns that get standardised
numerical_columns = [
    'Budget Proposed by the States/UTs',
    'Budget Approved for the States/UTs  ',
    "Release of Government of India's Fund ",
    "Total Expenditure Reported (Including States' Share)",
]

# Fit the scaler on those columns of the whole frame, then overwrite them with
# the scaled values.
# NOTE(review): the list includes the column later used as the regression
# target, and the fit happens before the train/test split.
scaler = StandardScaler()
scaler.fit(df_encoded[numerical_columns])
df_encoded[numerical_columns] = scaler.transform(df_encoded[numerical_columns])


In [None]:
df_encoded.head()

In [None]:
df_encoded.columns

In [None]:
# Columns that are removed from df_encoded before modelling
columns_to_drop = [
    'State_UT_Code',
    'Extent of Budget Approved Against Budget Proposed ',
    'Extent of Funds Utilised Against Budget Approved',
    'Extent of Funds Utilised Against Budget Proposed ',
]

# Rebind df_encoded to a frame without those columns
df_encoded = df_encoded.drop(columns_to_drop, axis='columns')

In [None]:
df_encoded.columns

# Feature Engineering:

In [None]:
# Turn the proposed budget into a categorical feature with three bins,
# labelled 'Low', 'Medium' and 'High'
df1['Budget_Category'] = pd.cut(
    x=df1['Budget Proposed by the States/UTs'],
    bins=3,
    labels=['Low', 'Medium', 'High'],
)


In [None]:
from scipy.stats import skew

# Column of df1 whose distribution is examined
column_name = 'Budget Proposed by the States/UTs'

# Skewness coefficient of the column
skewness = skew(df1[column_name])
print(f"Skewness of '{column_name}': {skewness:.2f}")

# Histogram (30 bins) with a density curve on top
plt.figure(figsize=(10, 6))
sns.histplot(df1[column_name], kde=True, color='blue', bins=30)
plt.title(f'Distribution of "{column_name}" (Skewness: {skewness:.2f})')
plt.xlabel(column_name)
plt.ylabel('Frequency')
plt.show()

# Describe the direction of the skew from the sign of the coefficient
if skewness > 0:
    verdict = f"'{column_name}' has a right-skewed distribution (positive skewness)"
elif skewness < 0:
    verdict = f"'{column_name}' has a left-skewed distribution (negative skewness)"
else:
    verdict = f"'{column_name}' has an approximately symmetric distribution (skewness close to 0)"
print(verdict)

In [None]:
# Log-transform the proposed budget; np.log1p computes log(1 + x) element-wise
df1['Budget_proposed_Log_Transformed'] = np.log1p(df1['Budget Proposed by the States/UTs'])

In [None]:
# Repeat the skewness report for the log-transformed column
column_name = 'Budget_proposed_Log_Transformed'
column_values = df1[column_name]

skewness = skew(column_values)
print(f"Skewness of '{column_name}': {skewness:.2f}")

# Histogram (30 bins) with a density curve on top
plt.figure(figsize=(10, 6))
sns.histplot(column_values, kde=True, color='blue', bins=30)
plt.title(f'Distribution of "{column_name}" (Skewness: {skewness:.2f})')
plt.xlabel(column_name)
plt.ylabel('Frequency')
plt.show()

# Positive coefficient: right skew; negative: left skew; otherwise symmetric
print(
    f"'{column_name}' has a right-skewed distribution (positive skewness)"
    if skewness > 0
    else f"'{column_name}' has a left-skewed distribution (negative skewness)"
    if skewness < 0
    else f"'{column_name}' has an approximately symmetric distribution (skewness close to 0)"
)

In [None]:
# Square-root transform of the proposed budget. The values are read from df1
# itself (as the log transform does) rather than from the separate copy `df`,
# so the result does not depend on the two frames sharing the same index.
df1['Budget_proposed_SquareRoot_Transformed'] = np.sqrt(df1['Budget Proposed by the States/UTs'])

In [None]:
# Repeat the skewness report for the square-root-transformed column
column_name = 'Budget_proposed_SquareRoot_Transformed'

skewness = skew(df1[column_name])
print(f"Skewness of '{column_name}': {skewness:.2f}")

# Histogram (30 bins) with a density curve on top
plt.figure(figsize=(10, 6))
sns.histplot(data=df1[column_name], bins=30, kde=True, color='blue')
plt.title(f'Distribution of "{column_name}" (Skewness: {skewness:.2f})')
plt.xlabel(column_name)
plt.ylabel('Frequency')
plt.show()

# Classify the distribution by the sign of the coefficient
is_right_skewed = skewness > 0
is_left_skewed = skewness < 0
if is_right_skewed:
    print(f"'{column_name}' has a right-skewed distribution (positive skewness)")
elif is_left_skewed:
    print(f"'{column_name}' has a left-skewed distribution (negative skewness)")
else:
    print(f"'{column_name}' has an approximately symmetric distribution (skewness close to 0)")

# Feature Selection:

In [None]:
# Feature Selection:

# Pairwise correlations between the numeric columns of df
correlation_matrix = df.corr(numeric_only=True)

# Annotated heatmap of the matrix on a fixed -1 .. 1 colour scale
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Identify features with highest absolute correlation with the target variable
target_column = "Total Expenditure Reported (Including States' Share)"
target_correlation = correlation_matrix[target_column].abs().sort_values(ascending=False)

# The target always correlates perfectly with itself, so it is left out of the
# candidates; otherwise it would be reported as its own "important feature".
feature_correlation = target_correlation.drop(labels=target_column)
important_features = feature_correlation[feature_correlation >= 0.5].index.tolist()
print("Important Features based on Correlation:")
print(important_features)

# Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Target (y) is the reported total expenditure; every other column of df_encoded is a feature (X)
target_name = "Total Expenditure Reported (Including States' Share)"
y = df_encoded[target_name]
X = df_encoded.drop(columns=target_name)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear regression on the training rows
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Report the mean squared error on the test set
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.4f}')

In [None]:
# Scatter plot of predicted against actual test values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.5)

# Dashed red reference line y = x across the range of the actual values
actual_range = [min(y_test), max(y_test)]
plt.plot(actual_range, actual_range, linestyle='--', color='red', linewidth=2)

plt.title('Predicted vs. Actual Values (Linear Regression)')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Split the dataset into training and testing sets (80% train, 20% test),
# using the same X, y, test_size and random_state as the split made earlier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Initialize Linear Regression model
lr_model = LinearRegression()

# Train the model
lr_model.fit(X_train, y_train)

# Initialize Random Forest Regressor model (100 trees, fixed seed).
# It is built and fitted once: a second fit with identical arguments and seed
# would only repeat the same work.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Initialize SVR model
svr_model = SVR(kernel='rbf')

# Scale features for SVR (optional, but recommended for SVM models).
# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVR model
svr_model.fit(X_train_scaled, y_train)


def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True target values for ``X_test``.

    Returns:
        Tuple ``(r_squared, mse)``: the R² score and the mean squared error
        of the predictions against ``y_test``.
    """
    predictions = model.predict(X_test)
    return r2_score(y_test, predictions), mean_squared_error(y_test, predictions)

# Score each fitted model on the test set; SVR is scored on the scaled test features
lr_r_squared, lr_mse = evaluate_model(model=lr_model, X_test=X_test, y_test=y_test)
rf_r_squared, rf_mse = evaluate_model(model=rf_model, X_test=X_test, y_test=y_test)
svr_r_squared, svr_mse = evaluate_model(model=svr_model, X_test=X_test_scaled, y_test=y_test)

# Report R-squared and MSE for every model
print(f"Linear Regression - R-squared: {lr_r_squared:.4f}, MSE: {lr_mse:.4f}")
print(f"Random Forest Regressor - R-squared: {rf_r_squared:.4f}, MSE: {rf_mse:.4f}")
print(f"SVR - R-squared: {svr_r_squared:.4f}, MSE: {svr_mse:.4f}")


In [None]:
def plot_predictions(model, X_test, y_test):
    """Scatter the predictions of a fitted model against the actual values.

    A dashed red line from (min, min) to (max, max) of ``y_test`` marks where
    prediction and actual value are equal. The class name of the model is
    shown in the title.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True target values for ``X_test``.
    """
    predicted = model.predict(X_test)

    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, predicted, color='blue', alpha=0.5)

    # Reference line y = x spanning the range of the actual values
    diagonal = [min(y_test), max(y_test)]
    plt.plot(diagonal, diagonal, linestyle='--', color='red', linewidth=2)

    plt.title(f'Predicted vs. Actual Values ({type(model).__name__})')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.show()

# Plot predictions for the Linear Regression, Random Forest and SVR models in
# turn; SVR is given the scaled test features
models_and_features = (
    (lr_model, X_test),
    (rf_model, X_test),
    (svr_model, X_test_scaled),
)
for fitted_model, test_features in models_and_features:
    plot_predictions(fitted_model, test_features, y_test)


# DataSet_2:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reading the dataset:
# NOTE(review): this is an absolute, machine-specific Windows path; the CSV must
# exist at exactly this location for the cell to run.
df2 = pd.read_csv("C:\\Users\\yashg\\OneDrive\\Desktop\\Group2\\INFRASTRUCTURE_FACILITIES.csv")

In [None]:
# Displaying top 10 rows:
df2.head(10)

In [None]:
# Displaying last 10 rows (tail() returns only 5 rows unless the count is given)
df2.tail(10)

In [None]:
# Number of Rows and Columns:
df2.shape

In [None]:
# prints information about the DataFrame
df2.info()

In [None]:
# Inspect the rows recorded for the UT 'Chandigarh'
df2[df2['State/UT'] == 'Chandigarh']

In [None]:
# For the UT Chandigarh there is no data available for any year, so keep only
# the rows that belong to the other States/UTs.
is_chandigarh = df2['State/UT'] == 'Chandigarh'
df2 = df2[~is_chandigarh]

In [None]:
# data types of each column
df2.dtypes

In [None]:
# Several count columns (for example 'PHCs functoning on 24X7 basis',
# 'With Labour Room', 'With OT', 'With at least 4 beds') should be numerical
# but are stored as object, so cast them all to int in one step.
count_columns = [
    'PHCs functoning on 24X7 basis',
    'With Labour Room',
    'With OT',
    'With at least 4 beds',
    'Without Electric Supply',
    'Without Regular Water Supply',
    'With Telephone',
]
df2[count_columns] = df2[count_columns].astype(int)
df2.info()

In [None]:
# Derive the complementary counts: PHCs with electricity / regular water supply
# are the functioning PHCs minus those reported without that supply.
functioning_phcs = df2['Number of PHCs Functioning']
df2['With Electric Supply'] = functioning_phcs - df2['Without Electric Supply']
df2['With Regular Water Supply'] = functioning_phcs - df2['Without Regular Water Supply']


In [None]:
df2.head()

In [None]:
# Checking Null values:
df2.isnull().sum()

In [None]:
# Heatmap of the null mask of df2 (row labels hidden, no colour bar)
plt.figure(figsize=(10, 6))
null_mask = df2.isnull()
heatmap_ax = sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)
heatmap_ax.set_title('Visualization of Null Values in Dataset')
plt.show()

In [None]:
# No null values in the dataset

In [None]:
# Displaying name of all fields
df2.columns

In [None]:
# The serial-number column ('S.No.') is of no use for the analysis, so remove it
df2 = df2.drop(columns=['S.No.'])

In [None]:
df2.head()

In [None]:
# Group the records by fiscal year
grouped_data = df2.groupby('Fiscal Year')

# Descriptive statistics (describe()) computed separately for each fiscal year
fiscal_year_stats = grouped_data.describe()

# Print the results
print(fiscal_year_stats)


In [None]:
# Total number of functioning PHCs per fiscal year, drawn as a bar chart.

# Sum the functioning PHCs of all States/UTs within each fiscal year
total_phcs_by_year = df2.groupby('Fiscal Year')['Number of PHCs Functioning'].sum()

fig, ax = plt.subplots(figsize=(10, 6))
total_phcs_by_year.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Total Number of PHCs Functioning Over Fiscal Years')
ax.set_xlabel('Fiscal Year')
ax.set_ylabel('Total Number of PHCs Functioning')
plt.xticks(rotation=45)
# Horizontal dashed guide lines only
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Visualize the change in infrastructure facilities over each fiscal year.

# Facility count columns to plot; each one gets its own figure
facilities = ['PHCs functoning on 24X7 basis', 'With Labour Room', 'With OT',
              'With at least 4 beds', 'Without Electric Supply',
              'Without Regular Water Supply', 'With Telephone']

for facility in facilities:
    # Total of this facility over all States/UTs, per fiscal year
    total_facility_by_year = df2.groupby('Fiscal Year')[facility].sum()

    plt.figure(figsize=(10, 6))
    total_facility_by_year.plot(kind='bar', color='skyblue')
    plt.title(f'Total {facility} Over Fiscal Years')
    plt.xlabel('Fiscal Year')
    plt.ylabel(f'Total {facility}')
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # tight_layout() acts on the current figure only, so it has to run inside
    # the loop for every figure to get its layout adjusted before being shown.
    plt.tight_layout()
    plt.show()

In [None]:
# Visualizing the number of PHCs functioning over fiscal years, one figure per State/UT.
states = df2['State/UT'].unique()

for state in states:
    state_data = df2[df2['State/UT'] == state]
    # Order the rows by fiscal year so the line runs left to right in year order
    state_data_sorted = state_data.sort_values(by='Fiscal Year')

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(
        state_data_sorted['Fiscal Year'],
        state_data_sorted['Number of PHCs Functioning'],
        marker='o', linestyle='-', color='b',
    )
    ax.set_title(f'Number of PHCs Functioning Over Fiscal Years - {state}')
    ax.set_xlabel('Fiscal Year')
    ax.set_ylabel('Number of PHCs Functioning')
    plt.xticks(rotation=45)
    plt.show()

# Data cleaning

In [None]:
# The visualizations above show that, for some states, data is missing for
# particular years and has been filled in with 0. Those zeros are turned into
# NaN here so that they can later be imputed with the mean of the same state's
# values over the fiscal years.

# Columns in which a 0 is treated as a missing value
columns_of_interest = ['Number of PHCs Functioning', 'PHCs functoning on 24X7 basis',
                       'With Labour Room', 'With OT', 'With at least 4 beds',
                       'With Electric Supply', 'With Regular Water Supply','With Telephone']

# Blank out (NaN) every cell of those columns that holds a 0
zero_cells = df2[columns_of_interest] == 0
df2[columns_of_interest] = df2[columns_of_interest].mask(zero_cells)





In [None]:
# Percentage of null values in each column of df2
df2.isnull().sum()*100/len(df2)

In [None]:
# Heatmap of df2's null mask (row labels hidden, no colour bar)
plt.figure(figsize=(10, 6))
missing_cells = df2.isnull()
null_ax = sns.heatmap(missing_cells, cmap='viridis', cbar=False, yticklabels=False)
null_ax.set_title('Visualization of Null Values in Dataset')
plt.show()

In [None]:
# Impute each missing value with the mean of the same column within the same
# State/UT (i.e. that state's average over its fiscal years).
state_groups = df2.groupby('State/UT')[columns_of_interest]
df2[columns_of_interest] = state_groups.transform(lambda column: column.fillna(column.mean()))

In [None]:
df2.isnull().sum()

In [None]:
# Percentage of functioning PHCs that have a labour room.
# The counts are used as they are, without an integer cast: after the mean
# imputation they can be fractional, and casting to int would truncate them
# (and raise if any NaN were still present).
df2['Percentage with Labour Room'] = (df2['With Labour Room'] * 100) / df2['Number of PHCs Functioning']

# Display the new column
print(df2['Percentage with Labour Room'])

In [None]:
# Rows where fewer than 50% of the functioning PHCs have a labour room
filtered_df = df2[df2['Percentage with Labour Room'] < 50]

# Mean percentage per (fiscal year, State/UT) pair, flattened back to columns
result = (
    filtered_df
    .groupby(['Fiscal Year', 'State/UT'])['Percentage with Labour Room']
    .mean()
    .reset_index()
)

# List the affected States/UTs year by year
for year, group_df in result.groupby('Fiscal Year'):
    print(f"For Fiscal Year {year}:")
    for state_name, percentage in zip(group_df['State/UT'], group_df['Percentage with Labour Room']):
        print(f"- State/UT: {state_name}, Percentage with Labour Room: {percentage:.2f}%")
    print()  # blank line between fiscal years

In [None]:
# Filter states with more than 95% of PHCs with labour room for each fiscal year
filtered_df = df2[df2['Percentage with Labour Room'] > 95]

# Group filtered DataFrame by fiscal year and list states with more than 95% labour room
result = (filtered_df.groupby(['Fiscal Year', 'State/UT'])
          .agg({'Percentage with Labour Room': 'mean'})
          .reset_index())

# Display the states with more than 95% labour room for each fiscal year
for year, group_df in result.groupby('Fiscal Year'):
    print(f"For Fiscal Year {year}:")
    for index, row in group_df.iterrows():
        print(f"- State/UT: {row['State/UT']}, Percentage with Labour Room: {row['Percentage with Labour Room']:.2f}%")
    print()  # Print empty line for separation between fiscal years

In [None]:
# One group per (fiscal year, State/UT) pair
grouped_df = df2.groupby(['Fiscal Year', 'State/UT'])

# Per-pair maximum / minimum of the number of functioning PHCs
max_phcs_functioning = grouped_df['Number of PHCs Functioning'].max().reset_index()
min_phcs_functioning = grouped_df['Number of PHCs Functioning'].min().reset_index()

# Per-pair maximum / minimum of the PHCs functioning on a 24X7 basis
max_phcs_24x7 = grouped_df['PHCs functoning on 24X7 basis'].max().reset_index()
min_phcs_24x7 = grouped_df['PHCs functoning on 24X7 basis'].min().reset_index()

# For every fiscal year, pick the row label of the extreme value and print that row
top_functioning_rows = max_phcs_functioning.groupby('Fiscal Year')['Number of PHCs Functioning'].idxmax()
print("States with the highest number of PHCs functioning:")
print(max_phcs_functioning.loc[top_functioning_rows])

bottom_functioning_rows = min_phcs_functioning.groupby('Fiscal Year')['Number of PHCs Functioning'].idxmin()
print("\nStates with the lowest number of PHCs functioning:")
print(min_phcs_functioning.loc[bottom_functioning_rows])

top_24x7_rows = max_phcs_24x7.groupby('Fiscal Year')['PHCs functoning on 24X7 basis'].idxmax()
print("\nStates with the highest number of PHCs functioning on 24X7 basis:")
print(max_phcs_24x7.loc[top_24x7_rows])

bottom_24x7_rows = min_phcs_24x7.groupby('Fiscal Year')['PHCs functoning on 24X7 basis'].idxmin()
print("\nStates with the lowest number of PHCs functioning on 24X7 basis:")
print(min_phcs_24x7.loc[bottom_24x7_rows])


In [None]:
# For each state, a line plot over fiscal years of the count of health centers
# without electric supply and without regular water supply.

for state in df2['State/UT'].unique():
    # Rows of the current state, ordered by fiscal year
    state_df = df2[df2['State/UT'] == state].sort_values(by='Fiscal Year')

    plt.figure(figsize=(10, 6))
    # One line per shortage column; the legend label is the column name itself
    for shortage_column in ('Without Electric Supply', 'Without Regular Water Supply'):
        sns.lineplot(x='Fiscal Year', y=shortage_column, data=state_df, label=shortage_column)

    plt.title(f'Availability of Health Centers without Essential Supplies in {state}')
    plt.xlabel('Fiscal Year')
    plt.ylabel('Count of Health Centers')
    plt.legend()
    plt.xticks(rotation=45)

    plt.show()

In [None]:
# Sum every numeric column within each fiscal year
yearly_totals = df2.groupby('Fiscal Year').sum(numeric_only=True)

# Columns drawn as one line each on a shared figure
trend_columns = [
    'Number of PHCs Functioning',
    "PHCs functoning on 24X7 basis",
    'With Labour Room',
    'With OT',
    'With at least 4 beds',
    'With Electric Supply',
    'With Regular Water Supply',
]

fig, ax = plt.subplots(figsize=(12, 8))
for column in trend_columns:
    ax.plot(yearly_totals.index, yearly_totals[column], label=column)

ax.set_title('Change in Health Center Facilities over Fiscal Years')
ax.set_xlabel('Fiscal Year')
ax.set_ylabel('Total Count')
plt.xticks(rotation=45)

# Legend sits to the right of the axes, outside the plot area
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax.grid(True)
plt.show()

In [None]:
# Changes in the infrastructure facilities over fiscal years, one figure per State/UT.

states = df2['State/UT'].unique()

# Every column except the identifiers and the two "Without ..." counts is
# plotted. The column set is the same for every state, so it is computed once.
parameters = df2.columns.difference(['State/UT', 'Fiscal Year','Without Electric Supply', 'Without Regular Water Supply'])

for state in states:
    # Rows of this State/UT sorted by fiscal year in ascending order
    state_df = df2[df2['State/UT'] == state].sort_values(by='Fiscal Year')
    fiscal_years = state_df['Fiscal Year']

    fig, ax = plt.subplots(figsize=(12, 8))
    for param in parameters:
        ax.plot(fiscal_years, state_df[param], label=param, marker='o', linestyle='-')

    ax.set_title(f'Changes in Infrastructure Facilities Over Fiscal Years - {state}')
    ax.set_xlabel('Fiscal Year')
    ax.set_ylabel('Value')
    plt.xticks(fiscal_years)  # one tick per fiscal year
    # Legend goes outside the plot area, to the right
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()

In [None]:
# Distribution of functioning PHCs among the states, one bar graph per fiscal year.

grouped_data = df2.groupby('Fiscal Year')

for fiscal_year, group_df in grouped_data:
    # Wide figure so that every State/UT label fits on the x-axis
    fig, ax = plt.subplots(figsize=(25, 6))
    sns.barplot(x='State/UT', y='Number of PHCs Functioning', data=group_df, ax=ax)
    ax.set_title(f'Distribution of Functioning PHCs in Fiscal Year {fiscal_year}')
    ax.set_xlabel('State/UT')
    ax.set_ylabel('Number of PHCs Functioning')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Total functioning PHCs per State/UT and fiscal year
state_year_grouped = df2.groupby(['State/UT', 'Fiscal Year'])['Number of PHCs Functioning'].sum().reset_index()

# The 10 states with the largest totals across all years keep their own wedge
top_states = state_year_grouped.groupby('State/UT')['Number of PHCs Functioning'].sum().nlargest(10).index.tolist()

# All remaining states are pooled into a single 'Others' entry
state_year_grouped['State/UT'] = state_year_grouped['State/UT'].apply(lambda x: x if x in top_states else 'Others')
state_year_grouped = state_year_grouped.groupby(['State/UT', 'Fiscal Year'])['Number of PHCs Functioning'].sum().reset_index()

# One distinct colour per wedge (top states plus 'Others'). The 10-colour
# 'tab10' palette would wrap around and repeat a colour for an 11th wedge.
colors = sns.color_palette('tab20', n_colors=len(top_states) + 1)

# One pie chart per fiscal year
for year in df2['Fiscal Year'].unique():
    year_data = state_year_grouped[state_year_grouped['Fiscal Year'] == year]

    labels = year_data['State/UT'].tolist()
    sizes = year_data['Number of PHCs Functioning'].tolist()

    plt.figure(figsize=(5, 8))
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140,colors=colors)
    plt.title(f'Distribution of PHCs Functioning by State in {year}')
    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.show()

In [None]:
# Side-by-side (grouped) bar chart of the main PHC counts for each state.

# (column, legend label) for each bar series, in left-to-right order
bar_series = [
    ('Number of PHCs Functioning', 'PHCs Functioning'),
    ('PHCs functoning on 24X7 basis', 'PHCs Functioning 24*7'),
    ("With Labour Room", 'With Labour Room'),
    ("With OT", 'With OT'),
    ("With at least 4 beds", 'With at least 4 beds'),
]

for state in df2['State/UT'].unique():
    # Sort by fiscal year so the groups appear in year order, as in the other
    # per-state plots.
    state_data = df2[df2['State/UT'] == state].sort_values(by='Fiscal Year')
    # Take the labels row by row (not unique()) so that there is exactly one
    # tick label per plotted group.
    fiscal_years = state_data['Fiscal Year'].tolist()

    fig, ax = plt.subplots(figsize=(10, 5))

    # Width of a single bar; the series are shifted by multiples of it
    bar_width = 0.15
    index = range(len(fiscal_years))

    for offset, (column, label) in enumerate(bar_series):
        ax.bar([p + offset * bar_width for p in index], state_data[column],
               width=bar_width, label=label)

    ax.set_xlabel('Fiscal Year')
    ax.set_ylabel('Number of PHCs')
    ax.set_title(f'Variation of Parameters Over Fiscal Years - {state}')
    # Centre each tick under its group: the bars sit at offsets
    # 0 .. (n - 1) * bar_width, so the middle is (n - 1) * bar_width / 2.
    group_centre = (len(bar_series) - 1) * bar_width / 2
    ax.set_xticks([p + group_centre for p in index])
    ax.set_xticklabels(fiscal_years)
    ax.legend()

    # Show the plot for the current state
    plt.show()

In [None]:
# Columns included in the multivariate analysis
columns_of_interest = [
    'Number of PHCs Functioning',
    'PHCs functoning on 24X7 basis',
    'With Labour Room',
    'With OT',
    'With at least 4 beds',
    'With Electric Supply',
    'With Regular Water Supply',
    'With Telephone',
]

# Scatter plot for every pair of columns, KDE curves on the diagonal
facility_frame = df2[columns_of_interest]
sns.pairplot(facility_frame, kind='scatter', diag_kind='kde')
plt.suptitle('Pairplot of Infrastructure Facilities', y=1.02)
plt.show()



In [None]:
# Pairwise correlations between the columns of interest, shown as an annotated heatmap
correlation_matrix = df2[columns_of_interest].corr()

plt.figure(figsize=(10, 8))
corr_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', square=True)
corr_ax.set_title('Correlation Matrix of Infrastructure Facilities')
plt.show()

In [None]:
# One-hot encode the categorical columns with pd.get_dummies(); drop_first=True
# leaves out the first dummy column of each encoded column.
categorical_columns = ['State/UT', 'Fiscal Year']
df_encoded = pd.get_dummies(df2, columns=categorical_columns, drop_first=True)

# Show the encoded DataFrame under a heading
print("Encoded DataFrame:", df_encoded, sep="\n")

In [None]:
df_encoded.dtypes

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize; the remaining (dummy) columns are left untouched.
# NOTE(review): the scaler is fitted on all rows, and the list includes
# 'With Labour Room'. If this frame is later split into train/test sets, the
# test rows have already influenced the scaling. Confirm this is intended.
numerical_columns = [
    'Number of PHCs Functioning',
    'PHCs functoning on 24X7 basis',
    'With Labour Room',
    'With OT',
    'With at least 4 beds',
    'Without Electric Supply',
    'Without Regular Water Supply',
    'With Telephone',
    'With Electric Supply',
    'With Regular Water Supply',
]

# Learn the scaling parameters, then apply them to the same columns
scaler = StandardScaler()
scaler.fit(df_encoded[numerical_columns])
df_encoded[numerical_columns] = scaler.transform(df_encoded[numerical_columns])
df_encoded.head()


# **Feature Engineering**

In [None]:
# Convert 'Fiscal Year' column to datetime format with explicit date format.
# NOTE(review): the format combines %Y and %y, i.e. it expects labels shaped
# like '2015-16'. Confirm which of the two years the parsed date ends up
# carrying, because the 'Year' feature derived later depends on it.
df2['Fiscal Year'] = pd.to_datetime(df2['Fiscal Year'], format='%Y-%y')

In [None]:
# Step 5: Grouping Features
# States/UTs are grouped into regions by geographical location.
north_states = ['Delhi', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Punjab', 'Rajasthan', 'Uttarakhand', 'Uttar Pradesh']
south_states = ['Andhra Pradesh', 'Karnataka', 'Kerala', 'Tamil Nadu', 'Telangana']
east_states = ['Bihar', 'Jharkhand', 'Odisha', 'West Bengal']
west_states = ['Goa', 'Gujarat', 'Maharashtra']
central_states = ['Chhattisgarh', 'Madhya Pradesh']
northeast_states = ['Arunachal Pradesh', 'Assam', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Sikkim', 'Tripura']

# Region label -> member states, flattened into a state -> region lookup
region_members = {
    'North India': north_states,
    'South India': south_states,
    'East India': east_states,
    'West India': west_states,
    'Central India': central_states,
    'Northeast India': northeast_states,
}
state_to_region = {
    state_name: region_label
    for region_label, member_states in region_members.items()
    for state_name in member_states
}

# New 'Region' column; any State/UT not listed above is labelled 'Other'
df2['Region'] = df2['State/UT'].map(state_to_region).fillna('Other')

# Step 6: Temporal Features

# Make sure 'Fiscal Year' is in datetime format
df2['Fiscal Year'] = pd.to_datetime(df2['Fiscal Year'])

# Inspect the DataFrame after the conversion
print(df2.head())

# Calendar year taken from the datetime value
df2['Year'] = df2['Fiscal Year'].dt.year

# Inspect the DataFrame with the new feature
print(df2.head())

In [None]:
df2.head()

# **Feature Selection**

In [None]:
# Candidate columns: everything except the identifiers, the region label and
# the two "Without ..." counts.
features = df2.drop(['State/UT', 'Fiscal Year','Region','Without Electric Supply', 'Without Regular Water Supply'], axis=1)
corr_matrix = features.corr()

# Absolute correlation of every candidate with the target. The target's own
# entry (always 1.0) is removed so that it cannot select itself as a feature.
target_column = 'With Labour Room'
corr_with_target = corr_matrix[target_column].drop(target_column).abs().sort_values(ascending=False)

# Step 4: Select Features - keep those with an absolute correlation above 0.5
selected_features = corr_with_target[corr_with_target > 0.5].index.tolist()

# Step 5: Visualize Correlation
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

print("Important features based on correlation analysis:")
print(selected_features)

# **Model Training**

# Training Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Selecting features and target variable: the model predicts 'With Labour Room'
X = df_encoded[['Number of PHCs Functioning', 'PHCs functoning on 24X7 basis', 'With OT', 'With at least 4 beds',
          'With Electric Supply', 'With Regular Water Supply', 'With Telephone']]
y = df_encoded['With Labour Room']

# Split the data into training (80%) and testing (20%) sets, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot predicted vs actual; the dashed red diagonal marks exact predictions
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], linestyle='--', color='red', linewidth=2)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
# The title names the actual target ('With Labour Room')
plt.title("Actual vs Predicted PHCs With Labour Room")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Random Forest Regressor model (100 trees, seeded); built and trained once
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Initialize SVR model
svr_model = SVR(kernel='rbf')

# Scale features for SVR (optional, but recommended for SVM models): the scaler
# is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVR model on the scaled training features
svr_model.fit(X_train_scaled, y_train)


def evaluate_model(model, X_test, y_test):
    """Score a fitted regressor on a held-out split.

    Parameters
    ----------
    model : object exposing ``predict``; called as ``model.predict(X_test)``.
    X_test : features handed to ``model.predict``.
    y_test : true target values the predictions are compared against.

    Returns
    -------
    tuple
        ``(r_squared, mse)`` as returned by ``r2_score`` and
        ``mean_squared_error`` for ``y_test`` versus the predictions.
    """
    predictions = model.predict(X_test)
    return r2_score(y_test, predictions), mean_squared_error(y_test, predictions)

# Score every fitted model on the test split; SVR is scored on the scaled
# test features.
lr_r_squared, lr_mse = evaluate_model(lr_model, X_test, y_test)
rf_r_squared, rf_mse = evaluate_model(rf_model, X_test, y_test)
svr_r_squared, svr_mse = evaluate_model(svr_model, X_test_scaled, y_test)

# Report the three models with one shared line format
for model_label, r_squared_value, mse_value in (
    ("Linear Regression", lr_r_squared, lr_mse),
    ("Random Forest Regressor", rf_r_squared, rf_mse),
    ("SVR", svr_r_squared, svr_mse),
):
    print(f"{model_label} - R-squared: {r_squared_value:.4f}, MSE: {mse_value:.4f}")

In [None]:
def plot_predictions(model, X_test, y_test):
    """Show a predicted-vs-actual scatter plot for one model.

    Parameters
    ----------
    model : object exposing ``predict``; its class name is used in the title.
    X_test : features handed to ``model.predict``.
    y_test : true target values, plotted on the x-axis.

    The figure also carries a dashed red reference line running from the
    smallest to the largest value of ``y_test`` on both axes.
    """
    predicted = model.predict(X_test)
    lowest, highest = min(y_test), max(y_test)

    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, predicted, color='blue', alpha=0.5)
    # Points lying on this diagonal are predicted exactly
    plt.plot([lowest, highest], [lowest, highest], linestyle='--', color='red', linewidth=2)
    plt.title(f'Predicted vs. Actual Values ({type(model).__name__})')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.show()

# Draw the predicted-vs-actual plot for each model: Linear Regression and
# Random Forest use the raw test features, SVR uses the scaled ones.
for fitted_model, test_features in (
    (lr_model, X_test),
    (rf_model, X_test),
    (svr_model, X_test_scaled),
):
    plot_predictions(fitted_model, test_features, y_test)