<a href="https://colab.research.google.com/github/ParthPatel-DA/ML_Project/blob/master/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing Necessary Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from keras.layers import Dense

# **Importing the Dataset**

In [None]:
# Load the cost-of-living CSV (path is relative to the working directory) into a DataFrame.
data = pd.read_csv('cost_of_living_us.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# List the column labels available in the frame.
data.columns

In [None]:
# Summary statistics for the frame's columns.
data.describe()

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Frequency of each distinct family_member_count value.
data['family_member_count'].value_counts()

# **Data Visualization**

In [None]:
# Scatter of median family income against family member count.
# Title and axis labels now describe the plotted columns (the previous text
# referred to wine quality / fixed acidity, which are not in this dataset).
plt.scatter(data['family_member_count'], data['median_family_income'])
plt.title('Family Member Count vs Median Family Income')
plt.xlabel('Family Member Count')
plt.ylabel('Median Family Income')
plt.show()

In [None]:
# One 15-bin histogram per cost component, drawn on a shared 15x10 figure.
cost_columns = [
    'housing_cost',
    'food_cost',
    'transportation_cost',
    'healthcare_cost',
    'other_necessities_cost',
    'childcare_cost',
    'taxes',
]
data[cost_columns].hist(bins=15, figsize=(15, 10))
plt.suptitle('Distribution of Various Costs')
plt.show()

In [None]:
# Per-state mean of median_family_income, shown as a bar chart.
# (The variable keeps its original name even though it holds income here.)
avg_cost_by_state = data.groupby('state')['median_family_income'].mean()
avg_cost_by_state.plot.bar(figsize=(12, 6))
plt.gca().set(
    title='Average median family income by State',
    ylabel='Average median family income',
    xlabel='State',
)
plt.show()

In [None]:
# Per-state mean of total_cost, shown as a bar chart.
avg_cost_by_state = data.groupby('state')['total_cost'].mean()
avg_cost_by_state.plot.bar(figsize=(12, 6))
plt.gca().set(
    title='Average Total Cost by State',
    ylabel='Average Total Cost',
    xlabel='State',
)
plt.show()

In [None]:
# Box plot of total_cost split by the isMetro flag.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='isMetro', y='total_cost', ax=ax)
ax.set_title('Cost Distribution in Metro vs Non-Metro Areas')
ax.set_xlabel('Is Metro Area')
ax.set_ylabel('Total Cost')
plt.show()

In [None]:
# Scatter of total_cost against family_member_count.
fig, ax = plt.subplots()
ax.scatter(data['family_member_count'], data['total_cost'])
ax.set_title('Family Member Count vs Total Cost')
ax.set_xlabel('Family Member Count')
ax.set_ylabel('Total Cost')
plt.show()

In [None]:
# Pairwise correlation between the cost components, rendered as an annotated heatmap.
corr_columns = [
    'housing_cost',
    'food_cost',
    'transportation_cost',
    'healthcare_cost',
    'other_necessities_cost',
    'childcare_cost',
    'taxes',
]
corr = data[corr_columns].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Grid of pairwise plots over the whole frame.
# NOTE(review): this is run on the full, unsampled frame and may be slow on a wide dataset — confirm it is needed.
sns.pairplot(data)

# **Data pre-processing**

Handle Missing Values

In [None]:
# Fill missing median_family_income values with the column median.
imputer = SimpleImputer(strategy='median')
income_imputed = imputer.fit_transform(data[['median_family_income']])
data['median_family_income'] = income_imputed

Encoding Categorical Variables

In [None]:
# Replace each categorical column with integer codes.
# The same encoder object is refit for every column, so after the loop it only
# holds the mapping of the last column ('family_member_count').
label_encoder = LabelEncoder()
for column in ('state', 'isMetro', 'family_member_count'):
    data[column] = label_encoder.fit_transform(data[column])

Deriving `esi_category` from the Economic Stability Index (binned by mean and standard deviation)

In [None]:
# Multipliers and savings assumption consumed by the ESI formula below
COLA_METRO = 1.1
COLA_NON_METRO = 1.0
EMERGENCY_FUND_RATE = 0.15  # 15% of disposable income for savings/emergency fund

# Disposable income: what is left of income after total cost; DIR is that remainder as a share of income
data['disposable_income'] = data['median_family_income'].sub(data['total_cost'])
data['DIR'] = data['disposable_income'].div(data['median_family_income'])

# CBR: the six listed cost columns (taxes are not included) as a share of income
necessities_total = (
    data['housing_cost']
    + data['food_cost']
    + data['transportation_cost']
    + data['healthcare_cost']
    + data['other_necessities_cost']
    + data['childcare_cost']
)
data['CBR'] = necessities_total / data['median_family_income']

def dependency_ratio(row):
    """Return the dependency multiplier for one row.

    The result is ``1 + (family_member_count - 1) * 0.1``, i.e. it grows by
    0.1 for every unit of ``row['family_member_count']`` above 1.

    NOTE(review): ``family_member_count`` is label-encoded earlier in the
    notebook, so the value read here is an encoder code — confirm it tracks
    household size as intended.
    """
    return 1 + (row['family_member_count'] - 1) * 0.1

def regional_cost_adjustment(row):
    """Return the cost-of-living multiplier for one row.

    Rows whose ``isMetro`` value is truthy get ``COLA_METRO``; every other
    row gets ``COLA_NON_METRO``.
    """
    if row['isMetro']:
        return COLA_METRO
    return COLA_NON_METRO

def tax_burden_adjustment(row):
    """Return the share of income left after taxes for one row.

    Computed as ``1 - taxes / median_family_income``.
    """
    tax_share = row['taxes'] / row['median_family_income']
    return 1 - tax_share

def calculate_esi(row):
    """Compute the Economic Stability Index (ESI) for one row.

    ESI = (DIR * emergency fund * tax adjustment)
          / (CBR * dependency ratio * regional adjustment)

    where the emergency fund is ``disposable_income * EMERGENCY_FUND_RATE``
    and the three adjustments come from ``tax_burden_adjustment``,
    ``dependency_ratio`` and ``regional_cost_adjustment``.
    """
    emergency_fund = row['disposable_income'] * EMERGENCY_FUND_RATE
    numerator = row['DIR'] * emergency_fund * tax_burden_adjustment(row)
    denominator = row['CBR'] * dependency_ratio(row) * regional_cost_adjustment(row)
    return numerator / denominator

# Row-wise ESI, stored as a new column
esi_values = data.apply(calculate_esi, axis=1)
data['economic_stability_index'] = esi_values

# Centre and spread of ESI; these define the category cut-offs
mean_esi = esi_values.mean()
std_dev_esi = esi_values.std()

def categorize_esi(esi):
    """Map an ESI value to a category label using mean/std cut-offs.

    Bands are half-open intervals ``[lower, upper)`` built from the
    module-level ``mean_esi`` and ``std_dev_esi``:

    * below ``mean - std``            -> 'very low'
    * ``mean - std`` to ``mean``      -> 'low'
    * ``mean`` to ``mean + std``      -> 'moderate'
    * ``mean + std`` to ``mean + 2*std`` -> 'high'
    * anything else                   -> 'very high'

    A value that matches no band (including NaN, for which every comparison
    is false) falls through to 'very high'.
    """
    bands = (
        (float('-inf'), mean_esi - std_dev_esi, 'very low'),
        (mean_esi - std_dev_esi, mean_esi, 'low'),
        (mean_esi, mean_esi + std_dev_esi, 'moderate'),
        (mean_esi + std_dev_esi, mean_esi + 2 * std_dev_esi, 'high'),
    )
    for lower, upper, label in bands:
        if lower <= esi < upper:
            return label
    return 'very high'

# Label every row with the category of its ESI value
data['esi_category'] = data['economic_stability_index'].map(categorize_esi)

Standard Scaling for Numerical Features

In [None]:
# Standardise every int64/float64 column in place.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split below, so test rows influence the scaling statistics — confirm this is acceptable.
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
scaler.fit(data[numerical_features])
data[numerical_features] = scaler.transform(data[numerical_features])

Feature Selection

In [None]:
# Identifier columns and the ESI-derived columns are excluded from the features;
# the ESI category is the prediction target.
excluded_columns = [
    'case_id',
    'areaname',
    'county',
    'disposable_income',
    'DIR',
    'CBR',
    'economic_stability_index',
    'esi_category',
]
X = data.drop(columns=excluded_columns)
y = data['esi_category']

# **Splitting the dataset into training set and test set**

In [None]:
# Hold out 25% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# **Modelling**

In [None]:
def show_result_graph(actual, predicted):
  """Plot side-by-side bars of actual vs predicted class counts.

  Args:
    actual: 1-D array-like of true class labels.
    predicted: 1-D array-like of predicted class labels, same length as
      ``actual``.

  Both inputs are reduced to per-class counts over the union of classes seen
  in either one, so every bar is drawn above its own class label even when a
  class never appears in the predictions (or appears only in them).
  """
  # Drop any pandas index so the inputs are treated purely by position.
  actual_counts = pd.Series(np.asarray(actual).ravel()).value_counts()
  predicted_counts = pd.Series(np.asarray(predicted).ravel()).value_counts()

  # Shared class axis; classes missing on one side get an explicit zero count.
  classes = actual_counts.index.union(predicted_counts.index)
  counts = pd.DataFrame({
      'Actual': actual_counts.reindex(classes, fill_value=0),
      'Predicted': predicted_counts.reindex(classes, fill_value=0),
  })

  # One grouped bar chart keeps both series on the same class positions.
  fig, ax = plt.subplots(figsize=(12, 6))
  counts.plot(kind='bar', ax=ax, width=0.8, color=['C0', 'r'], rot=0)

  # Adding titles and labels
  ax.set_title('Actual vs Predicted Values')
  ax.set_xlabel('Classes')
  ax.set_ylabel('Counts')
  ax.legend()
  ax.grid(True)
  plt.show()

def generate_report(actual, predicted):
  """Print the classification report and draw the actual-vs-predicted chart.

  Args:
    actual: true class labels.
    predicted: predicted class labels.
  """
  report_text = classification_report(actual, predicted)
  print(report_text)
  show_result_graph(actual, predicted)

Logistic Regression

In [None]:
# Fit logistic regression (iteration cap set to 1000) and evaluate on the held-out rows.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", log_reg.score(X_test, y_test))
generate_report(y_test, y_pred_log_reg)

SGD Classifier

In [None]:
# Linear model trained with stochastic gradient descent.
# random_state is fixed (same seed as the train/test split) so the shuffling
# of training data, and therefore the reported scores, are reproducible.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)
y_pred_sgd = sgd_clf.predict(X_test)
print("SGD Classifier Accuracy:", sgd_clf.score(X_test, y_test))
generate_report(y_test, y_pred_sgd)

SVC

In [None]:
# Support vector classifier with default settings, evaluated on the held-out rows.
svc = SVC().fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)
print("SVC Accuracy:", svc.score(X_test, y_test))
generate_report(y_test, y_pred_svc)

Decision Tree

In [None]:
# Single decision tree.
# random_state is fixed (same seed as the train/test split) so tie-breaking
# between candidate splits, and therefore the reported scores, are reproducible.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
y_pred_decision_tree = decision_tree.predict(X_test)
print("Decision Tree Accuracy:", decision_tree.score(X_test, y_test))
generate_report(y_test, y_pred_decision_tree)

Random Forest

In [None]:
# Ensemble of 100 trees.
# random_state is fixed (same seed as the train/test split) so bootstrap
# sampling and feature selection, and therefore the reported scores, are reproducible.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)
y_pred_random_forest = random_forest.predict(X_test)
print("Random Forest Accuracy:", random_forest.score(X_test, y_test))
generate_report(y_test, y_pred_random_forest)

MLP Classifier

In [None]:
# Multi-layer perceptron with one hidden layer of 100 units.
# random_state is fixed (same seed as the train/test split) so weight
# initialisation and batch order, and therefore the reported scores, are reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
print("MLP Classifier Accuracy:", mlp.score(X_test, y_test))
generate_report(y_test, y_pred_mlp)

Artificial Neural Networks

In [None]:
from keras.utils import to_categorical

# Encode the string class labels as integers, then one-hot encode them.
# The number of classes is taken from the fitted encoder instead of being
# hard-coded, so the output layer always matches the labels seen in training.
encoder = LabelEncoder()
y_train_codes = encoder.fit_transform(y_train)
num_classes = len(encoder.classes_)
y_train_encoded = to_categorical(y_train_codes, num_classes=num_classes)

y_test_codes = encoder.transform(y_test)
y_test_encoded = to_categorical(y_test_codes, num_classes=num_classes)

# Two hidden layers of 16 ReLU units each.
ann = Sequential()
ann.add(Dense(units=16, activation='relu', input_dim=X_train.shape[1]))
ann.add(Dense(units=16, activation='relu'))
# Softmax gives one probability distribution over the mutually exclusive
# classes, which is what categorical_crossentropy expects (sigmoid would score
# each class independently).
ann.add(Dense(units=num_classes, activation='softmax'))
ann.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
ann.fit(X_train, y_train_encoded, batch_size=32, epochs=100)

# Predicting the Test set results
y_pred_ann = ann.predict(X_test)
y_pred_ann = np.argmax(y_pred_ann, axis=1)  # Convert class probabilities to class indices

# Evaluating the Model
loss, accuracy = ann.evaluate(X_test, y_test_encoded)
print("ANN Accuracy:", accuracy)

# Reverse Transform to Original Class Names
y_pred_ann = encoder.inverse_transform(y_pred_ann)
y_test_original = encoder.inverse_transform(np.argmax(y_test_encoded, axis=1))

generate_report(y_test_original, y_pred_ann)