In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

ModuleNotFoundError: No module named 'pydotplus'

In [None]:
#Import the cleaned dataset 
data_path = ('../data/processed/cleaned_happiness_data.csv')
data_cleaned= pd.read_csv(data_path)

In [None]:
data_cleaned

In [None]:
# Define features and target variables
features = data_cleaned.drop(['HappinessIndicator'], axis=1)
target = data_cleaned['HappinessIndicator']

In [None]:
# Scale the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

In [None]:
# Fit the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = model.predict(X_test)

In [None]:
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)

In [None]:
# Print the accuracy report
print("Accuracy Score:", accuracy)

In [None]:
# Print the classification report
print(classification_report(y_test, y_pred))

In [None]:
# Generate and display the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

## Model Performance Summary

Upon fitting the logistic regression model to the dataset, we observed an exceedingly high accuracy score of approximately 98.9%. The classification report yielded high precision, recall, and F1-scores across both classes, indicating the model's proficient performance in classifying individuals as "happy" or "not happy." The following points detail the model's evaluation metrics:

- **Precision**: Exceptionally high for both classes, suggesting a minimal rate of false positives.
- **Recall**: Perfect for the "not happy" class, indicating no false negatives, and nearly perfect for the "happy" class.
- **F1-Score**: Close to 1 for both classes, denoting an excellent balance between precision and recall.
- **Support**: Reflects the balanced distribution of classes within the test dataset.

The confusion matrix provided additional insights:

- Correctly predicted "not happy" instances: 131
- Correctly predicted "happy" instances: 140
- False negatives: 3
- False positives: 0

### Interpretation and Recommendations

The high accuracy and F1-scores suggest the logistic regression model is highly effective. However, such high performance could potentially be a sign of overfitting. It is crucial to validate the model's reliability through further testing and evaluation. Recommendations to ensure robustness and validity of the model include:

1. **Cross-Validation**: To confirm consistency across various data subsets.
2. **Feature Importance Analysis**: To understand the drivers of the model's decisions.
3. **Testing More Complex Models**: Such as Random Forest or Gradient Boosting for comparative analysis.
4. **Regularization Techniques**: To prevent overfitting and enhance model generalization.
5. **Data Augmentation**: If data availability is limited, augmenting the dataset could improve model generalization.
6. **Anomaly Detection**: To identify and rectify data points that disproportionately influence the model.
7. **Domain Expert Consultation**: To align the model's predictions with expert knowledge.
8. **External Validation**: To assess performance on an unseen dataset and ensure generalizability.

These steps aim not only to potentially enhance model performance but to also affirm the reliability and applicability of the model to real-world scenarios.


### Random Forest Model

In [None]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier()

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [None]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test)

In [None]:
# Calculating the confusion matrix
rf_cm = confusion_matrix(y_test, rf_predictions)
rf_cm_df = pd.DataFrame(
    rf_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, rf_predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(rf_cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

In [None]:
# Generate and display the confusion matrix
sns.heatmap(rf_cm_df, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

### Decision Tree model

In [None]:
# Creating the decision tree classifier instance
decision_tree_model = tree.DecisionTreeClassifier()

In [None]:
# Fitting the model
decision_tree_model = decision_tree_model.fit(X_train, y_train)

In [None]:
# Making predictions using the testing data
decision_tree_predictions = decision_tree_model.predict(X_test)

In [None]:
# Calculating the confusion matrix
dt_cm = confusion_matrix(y_test, decision_tree_predictions)
dt_cm_df = pd.DataFrame(
    dt_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, decision_tree_predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(dt_cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, decision_tree_predictions))

In [None]:
# Generate and display the confusion matrix
sns.heatmap(dt_cm_df, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Create DOT data
dot_data = tree.export_graphviz(
    decision_tree_model, out_file=None, feature_names=features.columns, class_names=["0", "1"], filled=True
)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph
Image(graph.create_png())
