## Three problem statements that explore the data from different perspectives

- Employee Attrition Analysis: To understand and predict attrition rates.
- Employee Satisfaction and Retention: To improve satisfaction and retention.
- Performance and Compensation: To analyze if compensation aligns with performance.


In [96]:
# Collect the labels of every column whose dtype is not numeric
# (object/string columns); these are the ones that need encoding.
non_numeric_cols = combined_df.select_dtypes(exclude='number').columns
print("Non-numeric columns:", non_numeric_cols)

Non-numeric columns: Index(['over18', 'educationfield', 'attrition', 'maritalstatus', 'department',
       'jobrole', 'overtime', 'businesstravel', 'gender'],
      dtype='object')


In [97]:
# List the distinct labels of each non-numeric column: a header line, the
# array of unique values, then a blank separator line.
for col in non_numeric_cols:
    print(f"Unique values in '{col}':", combined_df[col].unique(), "", sep="\n")

Unique values in 'over18':
['Y']

Unique values in 'educationfield':
['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']

Unique values in 'attrition':
['Yes' 'No']

Unique values in 'maritalstatus':
['Single' 'Married' 'Divorced']

Unique values in 'department':
['Sales' 'Research & Development' 'Human Resources']

Unique values in 'jobrole':
['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']

Unique values in 'overtime':
['Yes' 'No']

Unique values in 'businesstravel':
['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']

Unique values in 'gender':
['Female' 'Male']



In [100]:
import matplotlib.pyplot as plt

# Count the distinct (non-missing) values in every column of combined_df;
# a count of 1 marks a constant column.
unique_values_count = combined_df.nunique(axis=0, dropna=True)
unique_values_count


Unnamed: 0,0
over18,1
monthlyincome,1349
educationfield,6
attrition,2
distancefromhome,29
dailyrate,886
joblevel,5
numcompaniesworked,10
monthlyrate,1427
stockoptionlevel,4


In [148]:
# Drop the 'over18' column: it holds the single value 'Y' for every row, so it
# carries no information. errors='ignore' keeps the cell safe to re-run once
# the column is already gone (a plain drop would raise KeyError).
combined_df = combined_df.drop(columns=['over18'], errors='ignore')




In [149]:
# Encode the binary categorical columns as 0/1 integers.
# Each column has its own explicit mapping so that a label belonging to a
# different column cannot be matched by accident.
binary_columns = ['overtime', 'gender','attrition']
binary_mappings = {
    'overtime': {'Yes': 1, 'No': 0},
    'gender': {'Female': 0, 'Male': 1},
    'attrition': {'Yes': 1, 'No': 0},
}
for col in binary_columns:
    mapping = binary_mappings[col]
    # Series.map turns every value missing from the mapping into NaN, so a
    # second run over already-encoded 0/1 data would wipe the column. Skip
    # columns that are already encoded to keep this cell safe to re-run.
    if combined_df[col].isin(list(mapping.values())).all():
        continue
    encoded = combined_df[col].map(mapping)
    # Fail loudly on labels the mapping does not know instead of silently
    # producing NaN; values that were already missing are left untouched.
    unmapped = encoded.isna() & combined_df[col].notna()
    if unmapped.any():
        raise ValueError(
            f"Unexpected values in '{col}': {combined_df.loc[unmapped, col].unique()}"
        )
    combined_df[col] = encoded



In [150]:
from sklearn.preprocessing import LabelEncoder

# Multi-category columns to convert to integer codes
multi_columns = ['educationfield', 'maritalstatus', 'department', 'jobrole', 'businesstravel']

# One fitted encoder per column. Re-fitting a single shared encoder would keep
# only the last column's classes, making it impossible to decode the other
# columns later with inverse_transform.
label_encoders = {}

# Apply label encoding to each column and display the mapping
for col in multi_columns:
    le = LabelEncoder()
    combined_df[col] = le.fit_transform(combined_df[col])
    label_encoders[col] = le
    print(f"Label encoding for '{col}':")
    # classes_ holds the original labels; transforming them yields their codes
    print(dict(zip(le.classes_, le.transform(le.classes_))))
    print()  # Print an empty line for better readability




Label encoding for 'educationfield':
{'Human Resources': 0, 'Life Sciences': 1, 'Marketing': 2, 'Medical': 3, 'Other': 4, 'Technical Degree': 5}

Label encoding for 'maritalstatus':
{'Divorced': 0, 'Married': 1, 'Single': 2}

Label encoding for 'department':
{'Human Resources': 0, 'Research & Development': 1, 'Sales': 2}

Label encoding for 'jobrole':
{'Healthcare Representative': 0, 'Human Resources': 1, 'Laboratory Technician': 2, 'Manager': 3, 'Manufacturing Director': 4, 'Research Director': 5, 'Research Scientist': 6, 'Sales Executive': 7, 'Sales Representative': 8}

Label encoding for 'businesstravel':
{'Non-Travel': 0, 'Travel_Frequently': 1, 'Travel_Rarely': 2}



In [151]:
# Preview the first rows of the updated dataframe. Ending the cell with the
# expression (instead of print) lets Jupyter render it as a table rather than
# as wrapped plain text with elided columns.
combined_df.head()

   monthlyincome  educationfield  attrition  distancefromhome  dailyrate  \
0           5993               1          1                 1       1102   
1           5130               1          0                 8        279   
2           2090               4          1                 2       1373   
3           2909               1          0                 3       1392   
4           3468               3          0                 2        591   

   joblevel  numcompaniesworked  monthlyrate  stockoptionlevel  education  \
0         2                   8        19479                 0          2   
1         2                   1        24907                 1          1   
2         1                   6         2396                 0          2   
3         1                   1        23159                 0          4   
4         1                   9        16632                 1          1   

   ...  businesstravel  gender  age  employeecount  relationshipsatisfaction  \


# Label Dictionary
- Label encoding for 'educationfield':
{'Human Resources': 0, 'Life Sciences': 1, 'Marketing': 2, 'Medical': 3, 'Other': 4, 'Technical Degree': 5}

- Label encoding for 'maritalstatus':
{'Divorced': 0, 'Married': 1, 'Single': 2}

- Label encoding for 'department':
{'Human Resources': 0, 'Research & Development': 1, 'Sales': 2}

- Label encoding for 'jobrole':
{'Healthcare Representative': 0, 'Human Resources': 1, 'Laboratory Technician': 2, 'Manager': 3, 'Manufacturing Director': 4, 'Research Director': 5, 'Research Scientist': 6, 'Sales Executive': 7, 'Sales Representative': 8}

- Label encoding for 'businesstravel':
{'Non-Travel': 0, 'Travel_Frequently': 1, 'Travel_Rarely': 2}

- Label encoding for 'overtime' and 'attrition':
{'Yes': 1, 'No': 0}

- Label encoding for 'gender':
{'Male': 1, 'Female': 0}

In [152]:
# Print a summary of combined_df (column names, non-null counts, dtypes) to
# check the result of the encoding steps.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1470 entries, 0 to 1469
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   monthlyincome             1470 non-null   int64
 1   educationfield            1470 non-null   int64
 2   attrition                 1470 non-null   int64
 3   distancefromhome          1470 non-null   int64
 4   dailyrate                 1470 non-null   int64
 5   joblevel                  1470 non-null   int64
 6   numcompaniesworked        1470 non-null   int64
 7   monthlyrate               1470 non-null   int64
 8   stockoptionlevel          1470 non-null   int64
 9   education                 1470 non-null   int64
 10  percentsalaryhike         1470 non-null   int64
 11  maritalstatus             1470 non-null   int64
 12  worklifebalance           1470 non-null   int64
 13  yearsincurrentrole        1470 non-null   int64
 14  performancerating         1470 non-null   int

- Employee Attrition Prediction aims to identify which employees are likely to leave the organization (attrition) based on various features such as job role, salary, work environment, etc.
- Predicting attrition helps HR departments take preventive actions to retain valuable employees and reduce turnover costs.