<a href="https://colab.research.google.com/github/Prisha5718/AI-ML-Task/blob/main/Coding_Ninjas_Taks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Extract employee_retention_project.zip into employee_retention_project/.
# The archive itself contains a top-level employee_retention_project/ folder, so
# the files land one level deeper, under
# employee_retention_project/employee_retention_project/ (see the listing below).
!unzip employee_retention_project.zip -d employee_retention_project/

Archive:  employee_retention_project.zip
   creating: employee_retention_project/employee_retention_project/
  inflating: employee_retention_project/employee_retention_project/analysis_notebook.py  
  inflating: employee_retention_project/employee_retention_project/Employee_Performance_Retention.csv  
   creating: employee_retention_project/employee_retention_project/Graphical_Results/
  inflating: employee_retention_project/employee_retention_project/insights.txt  


In [1]:
# Colab-only helper: `files.upload()` opens a file picker in the browser and
# saves the chosen file(s) into the runtime's current working directory.
from google.colab import files
# The analysis cell later reads "Employee_Performance_Retention.csv" from the
# working directory, so that is the file to pick here. `uploaded` keeps the
# return value of the upload call; no later cell shown here reads it.
uploaded = files.upload()



Saving Employee_Performance_Retention.csv to Employee_Performance_Retention.csv


In [4]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

# Make sure the folder that will receive the saved figures exists
# (exist_ok=True keeps re-runs of this cell from failing).
os.makedirs("graphical_results", exist_ok=True)

# Read the raw CSV and report its size before any cleaning is applied.
data = pd.read_csv("Employee_Performance_Retention.csv")
print("Dataset shape:", data.shape)

# Cleaning: rows with any missing value are removed first, then exact
# duplicate rows are dropped from what remains.
data = data.dropna().drop_duplicates()

# Persist the cleaned frame without the index column.
data.to_csv("cleaned_dataset.csv", index=False)
print("Cleaned dataset saved")

# Prepare features: everything except the identifier and the target column.
X = data.drop(['Employee_ID', 'Attrition'], axis=1)
y = data['Attrition']

# Encode categorical variables as integer codes, keeping each fitted encoder so
# the codes can be mapped back to the original labels later.
# NOTE(review): LabelEncoder assigns codes in sorted label order; if
# Job_Satisfaction_Level is ordinal text, confirm that this order is meaningful.
cat_cols = ['Department', 'Job_Satisfaction_Level', 'Promotion_in_Last_2_Years']
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

# Encode the target; target_encoder.classes_ holds the original labels in code order.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

# Train/test split. A single stratified split is used for both the raw and the
# scaled features, so the two views are row-aligned by construction instead of
# relying on two separate calls happening to share a seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features for SVM. The scaler is fitted on the training rows only;
# fitting it on the full dataset would leak test-set means/variances into
# training and make the reported test accuracy optimistic.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Kept for backward compatibility with code that expects a scaled copy of the
# full feature matrix (now expressed in training-set statistics).
X_scaled = scaler.transform(X)

# 1. RANDOM FOREST
# Trained on the unscaled features; fit() returns the estimator itself.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("Random Forest Accuracy:", rf_accuracy)

# 2. SVM MODELS
# One SVC per kernel, trained on the standardized features; test accuracy is
# recorded under the key "SVM_<kernel>".
kernels = ['linear', 'poly', 'rbf']
svm_results = {}

for kernel in kernels:
    # degree=3 is SVC's default and is ignored by non-polynomial kernels, so
    # passing it unconditionally builds the same estimators as branching on 'poly'.
    svm = SVC(kernel=kernel, degree=3, random_state=42)
    svm_pred = svm.fit(X_train_scaled, y_train).predict(X_test_scaled)
    svm_accuracy = accuracy_score(y_test, svm_pred)
    svm_results[f"SVM_{kernel}"] = svm_accuracy
    print(f"SVM {kernel} Accuracy:", svm_accuracy)

# Find best SVM: highest accuracy wins; on a tie the kernel listed first is kept.
best_svm_name, best_svm_accuracy = max(svm_results.items(), key=lambda entry: entry[1])

# GRAPHICAL RESULTS

# 1. Model Comparison Chart
# One bar per model: the random forest first, then the SVM kernels in the
# order they were evaluated.
models = ['Random Forest', *svm_results.keys()]
accuracies = [rf_accuracy, *svm_results.values()]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(models, accuracies, color=['green', 'blue', 'pink', 'yellow'])
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)

# Print each accuracy just above its bar.
for bar_pos, acc in enumerate(accuracies):
    ax.text(bar_pos, acc + 0.01, f'{acc:.3f}', ha='center')

# Tilt the model names so the longer labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('graphical_results/model_comparison.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 2. Feature Importance (Random Forest)
# Sorted ascending so the most important feature is drawn at the top of the
# horizontal bar chart (and sits at the end of the Series).
importance = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values()

fig, ax = plt.subplots(figsize=(10, 8))
importance.plot.barh(ax=ax)
ax.set_title('Random Forest Feature Importance')
ax.set_xlabel('Importance')
fig.tight_layout()
fig.savefig('graphical_results/feature_importance.png', dpi=300, bbox_inches='tight')
plt.close(fig)

def _save_confusion_heatmap(y_true, y_pred, cmap, title, out_path):
    """Draw a labelled confusion-matrix heatmap, save it as a PNG and return the matrix.

    Rows are the true classes and columns the predicted classes; both axes are
    labelled with the original class names from ``target_encoder``.
    """
    matrix = confusion_matrix(y_true, y_pred)
    class_names = target_encoder.classes_

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap,
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    return matrix


# 3. Confusion Matrix - Random Forest
cm_rf = _save_confusion_heatmap(
    y_test, rf_pred, 'Blues',
    'Random Forest Confusion Matrix',
    'graphical_results/confusion_matrix_rf.png',
)

# 4. Best SVM Confusion Matrix
# The kernel name is the part after "SVM_" in the winning key. The model is
# refit here because the comparison loop only kept the accuracy scores.
best_kernel = best_svm_name.split('_')[1]
# degree=3 is SVC's default and is ignored by non-polynomial kernels, so this
# builds the same estimator whether or not the best kernel is 'poly'.
best_svm = SVC(kernel=best_kernel, degree=3, random_state=42)

best_svm.fit(X_train_scaled, y_train)
best_svm_pred = best_svm.predict(X_test_scaled)

cm_svm = _save_confusion_heatmap(
    y_test, best_svm_pred, 'Reds',
    f'Best SVM ({best_kernel}) Confusion Matrix',
    'graphical_results/confusion_matrix_svm.png',
)

# 5. Performance Metrics Comparison
# Precision, recall and F1 are computed for the class encoded as 1
# (scikit-learn's default pos_label for binary targets).
#
# zero_division=0: when a model never predicts the positive class, precision
# (and therefore F1) is undefined. scikit-learn's default reports 0 and emits an
# UndefinedMetricWarning; passing zero_division=0 keeps the same score but makes
# the choice explicit and keeps the cell output free of the warning.
rf_precision = precision_score(y_test, rf_pred, zero_division=0)
rf_recall = recall_score(y_test, rf_pred, zero_division=0)
rf_f1 = f1_score(y_test, rf_pred, zero_division=0)

svm_precision = precision_score(y_test, best_svm_pred, zero_division=0)
svm_recall = recall_score(y_test, best_svm_pred, zero_division=0)
svm_f1 = f1_score(y_test, best_svm_pred, zero_division=0)

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
rf_scores = [rf_accuracy, rf_precision, rf_recall, rf_f1]
svm_scores = [best_svm_accuracy, svm_precision, svm_recall, svm_f1]

# Grouped bars: each metric gets two bars, offset half a bar width to either
# side of the metric's integer position on the x axis.
x = range(len(metrics))
width = 0.35

plt.figure(figsize=(10, 6))
plt.bar([i - width/2 for i in x], rf_scores, width, label='Random Forest', color='green', alpha=0.8)
plt.bar([i + width/2 for i in x], svm_scores, width, label=f'Best SVM ({best_kernel})', color='blue', alpha=0.8)

plt.xlabel('Metrics')
plt.ylabel('Score')
plt.title('Performance Metrics Comparison')
plt.xticks(x, metrics)
plt.legend()
plt.ylim(0, 1)

# Print each score just above its bar.
for i in range(len(metrics)):
    plt.text(i - width/2, rf_scores[i] + 0.01, f'{rf_scores[i]:.3f}', ha='center', fontsize=9)
    plt.text(i + width/2, svm_scores[i] + 0.01, f'{svm_scores[i]:.3f}', ha='center', fontsize=9)

plt.tight_layout()
plt.savefig('graphical_results/performance_metrics.png', dpi=300, bbox_inches='tight')
plt.close()

print("All graphical results saved to 'graphical_results/' folder")

# Generate insights
# Decide the headline result up front. Equal accuracies are reported as a tie;
# a plain `rf_accuracy > best_svm_accuracy` test would silently credit the SVM
# whenever the two scores are identical.
if rf_accuracy > best_svm_accuracy:
    winner_label = 'Random Forest'
    key_finding = 'Random Forest performed better for this dataset'
elif rf_accuracy < best_svm_accuracy:
    winner_label = f'SVM ({best_kernel})'
    key_finding = 'SVM performed better for this dataset'
else:
    winner_label = f'Tie between Random Forest and SVM ({best_kernel})'
    key_finding = 'Random Forest and SVM performed equally well for this dataset'

# `importance` is sorted ascending, so index[-1] is the most important feature
# and index[0] the least important; tail(3) lists the top three.
insights = f"""
EMPLOYEE RETENTION ANALYSIS - KEY INSIGHTS

1. DATASET INFORMATION:
   - Total employees: {len(data)}
   - Attrition rate: {(data['Attrition'] == 'Yes').mean():.1%}
   - Features analyzed: {len(X.columns)}

2. MODEL PERFORMANCE:
   - Random Forest Accuracy: {rf_accuracy:.3f}
   - Best SVM ({best_kernel}) Accuracy: {best_svm_accuracy:.3f}
   - Winner: {winner_label}

3. TOP 3 IMPORTANT FEATURES (Random Forest):
{importance.tail(3).to_string()}

4. KEY FINDINGS:
   - {key_finding}
   - Most important factor: {importance.index[-1]}
   - Least important factor: {importance.index[0]}

5. RECOMMENDATIONS:
   - Focus on improving {importance.index[-1]} to reduce attrition
   - Consider {importance.index[-2]} as secondary priority
   - Monitor {importance.index[-3]} for early warning signs
"""

# Explicit encoding so the report is written identically regardless of the
# platform's default locale.
with open('insights.txt', 'w', encoding='utf-8') as f:
    f.write(insights)

print("Insights saved to 'insights.txt'")
print("Analysis complete!")


Dataset shape: (9000, 10)
Cleaned dataset saved
Random Forest Accuracy: 0.8022222222222222
SVM linear Accuracy: 0.805
SVM poly Accuracy: 0.805
SVM rbf Accuracy: 0.805


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


All graphical results saved to 'graphical_results/' folder
Insights saved to 'insights.txt'
Analysis complete!
