In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [18]:
# Load the dataset.
# The CSV location can be overridden without editing the notebook by setting the
# UNEMPLOYMENT_CSV environment variable; when it is unset, the original
# machine-specific path is used as the default.
DATA_PATH = os.environ.get(
    "UNEMPLOYMENT_CSV",
    r"C:\Users\Urvi\Downloads\archive (4)\Unemployment_Rate_upto_11_2020.csv",
)
df = pd.read_csv(DATA_PATH)


In [19]:

# Normalise the header names in one pass: trim surrounding whitespace, then
# lower-case, so later cells can refer to columns by one predictable spelling.
df.columns = df.columns.str.strip().str.lower()

print(df.columns)  # Show the cleaned column names


Index(['region', 'date', 'frequency', 'estimated unemployment rate (%)',
       'estimated employed', 'estimated labour participation rate (%)',
       'region.1', 'longitude', 'latitude'],
      dtype='object')


In [20]:
# Preview the first five rows to see the columns and some sample values
print(df.head(n=5))

           region         date frequency  estimated unemployment rate (%)  \
0  Andhra Pradesh   31-01-2020         M                             5.48   
1  Andhra Pradesh   29-02-2020         M                             5.83   
2  Andhra Pradesh   31-03-2020         M                             5.79   
3  Andhra Pradesh   30-04-2020         M                            20.51   
4  Andhra Pradesh   31-05-2020         M                            17.43   

   estimated employed  estimated labour participation rate (%) region.1  \
0            16635535                                    41.02    South   
1            16545652                                    40.90    South   
2            15881197                                    39.18    South   
3            11336911                                    33.10    South   
4            12988845                                    36.46    South   

   longitude  latitude  
0    15.9129     79.74  
1    15.9129     79.74  
2    15.912

In [21]:
# Count the missing entries in the labour participation rate column
df['estimated labour participation rate (%)'].isna().sum()


np.int64(0)

In [22]:
# Binary target: 1 when a row's labour participation rate is strictly above the
# column median, otherwise 0 (a NaN compares False and therefore maps to 0).
# The median is computed once and the comparison is vectorised, instead of
# re-evaluating the median inside a per-row lambda.
# NOTE(review): the column is named 'Unemployment_Level' but it is derived from
# the labour participation rate, not 'estimated unemployment rate (%)' —
# confirm this is the intended target.
participation_rate = df['estimated labour participation rate (%)']
df['Unemployment_Level'] = (
    participation_rate > participation_rate.median()
).astype('int64')


In [23]:
# Feature matrix (the two coordinate columns, used as example features) and the
# binary target column created earlier in the notebook.
# NOTE(review): in the preview output, Andhra Pradesh shows longitude ~15.9 and
# latitude ~79.7, which looks swapped in the source data — confirm.
X = df.loc[:, ['longitude', 'latitude']]
y = df.loc[:, 'Unemployment_Level']


In [24]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Put both features on a common scale. The scaler's statistics are learned from
# the training rows only and then reused unchanged on the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
# Fit a logistic-regression classifier (constructor defaults) on the scaled training data
model = LogisticRegression()
model.fit(X=X_train, y=y_train)


In [27]:
# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the test split
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:\n", report_text)

# Confusion matrix of actual vs. predicted labels, stored in `cm` for plotting
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)


Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.64      0.57        28
           1       0.47      0.35      0.40        26

    accuracy                           0.50        54
   macro avg       0.49      0.49      0.49        54
weighted avg       0.49      0.50      0.49        54



In [None]:
# Draw the confusion matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',          # cells hold integer counts
    cmap='Blues',
    xticklabels=['Low', 'High'],
    yticklabels=['Low', 'High'],
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()