In [None]:
# pandas must be imported before its first use so this cell also works on a
# fresh kernel (Restart & Run All).
import pandas as pd

# Quick look at the raw file; the result is displayed, not stored.
pd.read_csv('worker_dataset.csv')

In [None]:
import pandas as pd
# Reading the CSV file into a DataFrame
df = pd.read_csv('worker_dataset.csv')

# Display the first few rows of the DataFrame
print(df.head())


#Exploratory Data Analysis (EDA) stage,


In [None]:
#Basic Statistics: Understand the basic statistics of each column.
summary = df.describe(include='all')
print(summary)

In [None]:
# Get the value counts for Occupation
occupation_counts = df['Occupation'].value_counts()
print("Occupation Distribution:")
print(occupation_counts)
print("\n")  # Just to add a new line for separation

In [None]:
# Get the value counts for ReasonForLeaving
reason_counts = df['ReasonForLeaving'].value_counts()
print("Reason For Leaving Distribution:")
print(reason_counts)

In [None]:
#Histograms: For numerical data like Age, to understand the distribution.
import matplotlib.pyplot as plt

# Generate a histogram for Age
plt.hist(df['Age'], bins=7, edgecolor='black', alpha=0.7)
plt.title('Age Distribution of Workers')
plt.xlabel('Age')
plt.ylabel('Number of Workers')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()


In [None]:
# Bar plot for Occupation
occupation_counts = df['Occupation'].value_counts()
occupation_counts.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Number of Workers by Occupation')
plt.xlabel('Occupation')
plt.ylabel('Number of Workers')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.show()

In [None]:
# Bar plot for ReasonForLeaving
reason_counts = df['ReasonForLeaving'].value_counts()
reason_counts.plot(kind='bar', color='salmon', edgecolor='black')
plt.title('Reasons for Leaving the Job')
plt.xlabel('Reason')
plt.ylabel('Number of Workers')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot for Age by Occupation
plt.figure(figsize=(12,6))
sns.boxplot(x='Occupation', y='Age', data=df, palette='pastel')
plt.title('Age Distribution by Occupation')
plt.xlabel('Occupation')
plt.ylabel('Age')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Box plot for Age by ReasonForLeaving
plt.figure(figsize=(12,6))
sns.boxplot(x='ReasonForLeaving', y='Age', data=df, palette='pastel')
plt.title('Age Distribution by Reason for Leaving')
plt.xlabel('Reason for Leaving')
plt.ylabel('Age')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Calculate the counts of current and former employees
employee_status_counts = df['IsCurrentEmployee'].value_counts()

# value_counts() orders its index by frequency, not by True/False, so each
# wedge's label is derived from the index value it actually belongs to.
status_names = {True: 'Current Employees', False: 'Former Employees'}
labels = [status_names.get(status, str(status)) for status in employee_status_counts.index]

# Colors for the sections
colors = ['#FF9999', '#66B2FF']

# Explode the 'Current Employees' wedge for emphasis, wherever it ends up.
# Building this from `labels` also keeps its length equal to the number of
# wedges when only one status is present in the data.
explode = tuple(0.1 if label == 'Current Employees' else 0 for label in labels)

# Plotting the Pie chart
fig1, ax1 = plt.subplots()
ax1.pie(employee_status_counts, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
        shadow=True, startangle=90)

# Equal aspect ratio ensures that pie is drawn as a circle.
ax1.axis('equal')
plt.title('Proportion of Current vs. Former Employees')
plt.show()


In [None]:
missing_data = df.isnull().sum()

# Display columns with missing data
print(missing_data[missing_data > 0])


In [None]:
import pandas as pd
from datetime import datetime

# Read the CSV file, parsing both date columns as datetimes
df = pd.read_csv('worker_dataset.csv', parse_dates=['DateJoined', 'DateLeft'])

# Tenure ends at DateLeft when it is present and at the current time when it
# is missing. Because of that, YearsWorked depends on the day the cell is run.
now = datetime.now()
tenure_end = df['DateLeft'].fillna(pd.Timestamp(now))

# Vectorised date subtraction (instead of a row-wise apply), converted from
# whole days to fractional years and rounded to 2 decimal places.
df['YearsWorked'] = ((tenure_end - df['DateJoined']).dt.days / 365.25).round(2)

# Save the updated dataframe back to CSV
df.to_csv('worker_dataset_updated.csv', index=False)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the updated dataset
df = pd.read_csv('worker_dataset_updated.csv')

# Histogram
plt.figure(figsize=(10,6))
df['YearsWorked'].hist(bins=10, edgecolor='black', alpha=0.7)
plt.title('Distribution of Years Worked by Workers')
plt.xlabel('Years Worked')
plt.ylabel('Number of Workers')
plt.grid(False)
plt.show()


In [None]:
#Box Plot: Distribution of Years Worked
plt.figure(figsize=(10,6))
plt.boxplot(df['YearsWorked'])
plt.title('Box Plot of Years Worked by Workers')
plt.ylabel('Years Worked')
plt.show()


In [None]:
#Bar Plot: Average Years Worked by Occupation
avg_years_by_occupation = df.groupby('Occupation')['YearsWorked'].mean()

plt.figure(figsize=(12,6))
avg_years_by_occupation.plot(kind='bar', color='skyblue', edgecolor='black', alpha=0.7)
plt.title('Average Years Worked by Occupation')
plt.ylabel('Average Years Worked')
plt.xlabel('Occupation')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box Plot to visualize Age distribution by Reasons for Leaving
plt.figure(figsize=(14,8))
sns.boxplot(x='ReasonForLeaving', y='Age', data=df)
plt.title('Age distribution by Reason for Leaving')
plt.xlabel('Reason for Leaving')
plt.ylabel('Age')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Correlation is only defined for numeric data. Selecting numeric/bool columns
# explicitly avoids the error that DataFrame.corr() raises on string columns
# in recent pandas versions, and behaves the same on older versions.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr()
print(correlation_matrix)


In [None]:
# Grouping by 'ReasonForLeaving' and calculating the mean 'YearsWorked'
avg_years_by_reason = df.groupby('ReasonForLeaving')['YearsWorked'].mean().sort_values()

print(avg_years_by_reason)



In [None]:
average_tenure = df['YearsWorked'].mean()
print(f"The average tenure of an employee is: {average_tenure:.2f} years.")


In [None]:
df['DateLeft'] = pd.to_datetime(df['DateLeft'], errors='coerce')


In [None]:
# Extracting the month when employees left
from datetime import datetime
df['LeaveMonth'] = df['DateLeft'].dt.month
monthly_attrition = df['LeaveMonth'].value_counts().sort_index()

# Plotting the data
monthly_attrition.plot(kind='bar', title='Monthly Attrition Rates')
plt.xlabel('Month')
plt.ylabel('Number of Employees Left')
plt.show()


In [None]:
# Extracting the year when employees left
df['LeaveYear'] = df['DateLeft'].dt.year
yearly_attrition = df['LeaveYear'].value_counts().sort_index()

# Plotting the data
yearly_attrition.plot(kind='bar', title='Yearly Attrition Rates')
plt.xlabel('Year')
plt.ylabel('Number of Employees Left')
plt.show()


In [None]:
# Define the age bins and their labels.
# Bins are left-closed (right=False): [20, 30), [30, 40), ... The last bin is
# open-ended so that the '60+' label really covers every age from 60 upwards
# (with an upper edge of 70, ages >= 70 would silently become NaN).
age_bins = [20, 30, 40, 50, 60, float('inf')]
age_labels = ['20-29', '30-39', '40-49', '50-59', '60+']

# Create a new column in the dataframe for age categories
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.countplot(data=df, x='AgeGroup', order=age_labels)
plt.title('Distribution of Workers in Different Age Groups')
plt.ylabel('Number of Workers')
plt.show()


Analyze Reasons:

What's the most common reason for leaving?
Is there a particular occupation that has a higher rate of a specific reason for leaving?

In [None]:
# 1. Most common reason for leaving
# Tally the reasons once, then read both the top label and its count from it.
reason_frequencies = df['ReasonForLeaving'].value_counts()
most_common_reason = reason_frequencies.idxmax()
count_most_common_reason = reason_frequencies.max()

print(f"The most common reason for leaving is '{most_common_reason}' with {count_most_common_reason} occurrences.")


In [None]:
# Analyzing by Occupation
occupation_reason = pd.crosstab(df['Occupation'], df['ReasonForLeaving'])

print(occupation_reason)


In [None]:
occupation_reason.plot(kind='bar', figsize=(15,7))
plt.title('Reason for Leaving by Occupation')
plt.ylabel('Number of Workers')
plt.show()


1. Average Age of Current vs. Former Employees

In [None]:
average_age = df.groupby('IsCurrentEmployee')['Age'].mean()

print(f"Average age of current employees: {average_age[True]:.2f}")
print(f"Average age of former employees: {average_age[False]:.2f}")


Occupation Distribution for Current vs. Former Employees

In [None]:
occupation_distribution = df.groupby(['Occupation', 'IsCurrentEmployee']).size().unstack()

print(occupation_distribution)


In [None]:
occupation_distribution.plot(kind='bar', stacked=True, figsize=(10,7))
plt.title('Occupation Distribution of Current vs. Former Employees')
plt.ylabel('Number of Workers')
plt.legend(title='Is Current Employee')
plt.show()


1. Detecting Outliers for Age

In [None]:
# Age
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

age_outliers = df[(df['Age'] < lower_bound) | (df['Age'] > upper_bound)]


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.boxplot(x=df['Age'])
plt.title('Boxplot of Age')
plt.show()


Detecting Outliers for DateJoined

In [None]:
# DateJoined is read back from CSV as text, and the x-axis is labelled in
# years, so parse the dates and plot the joining year (no change to df itself).
year_joined = pd.to_datetime(df['DateJoined']).dt.year
year_joined.hist()
plt.title('Histogram of DateJoined')
plt.xlabel('Year Joined')
plt.ylabel('Number of Employees')
plt.show()


In [None]:
def categorize_age(age):
    """Map an age to a coarse age-group label.

    The bands are half-open so that fractional ages (e.g. 39.5) land in a
    band instead of falling through the gaps between integer bounds:

        [25, 40) -> 'Young'
        [40, 55) -> 'Middle-aged'
        [55, 60) -> 'Old'

    Every other value (below 25, 60 and above, or NaN, for which all
    comparisons are False) returns 'Other'. Integer ages get the same label
    as with the previous closed integer bands.
    """
    if 25 <= age < 40:
        return 'Young'
    elif 40 <= age < 55:
        return 'Middle-aged'
    elif 55 <= age < 60:
        return 'Old'
    else:
        return 'Other'

# Apply the function to the Age column
df['AgeGroup'] = df['Age'].apply(categorize_age)


In [None]:
def categorize_years_worked(years):
    """Map a tenure in years to a seniority label.

        years <= 2       -> 'New'
        2 < years <= 5   -> 'Intermediate'
        years > 5        -> 'Veteran'

    Tenure is a fractional number of years, so the middle band has to start
    right after 2; with a lower bound of 3, a tenure such as 2.5 years would
    skip 'Intermediate' and be labelled 'Veteran'. NaN fails every comparison
    and therefore still ends up in the final 'Veteran' branch.
    """
    if years <= 2:
        return 'New'
    elif years <= 5:
        return 'Intermediate'
    else:
        return 'Veteran'

# Apply the function to the YearsWorked column
df['YearsWorkedCategory'] = df['YearsWorked'].apply(categorize_years_worked)


Time of Year Leaving: Since most workers leave between July and August, introduce a feature that indicates if an employee left during these peak months.

In [None]:
# Drop existing encoded columns if they exist
columns_to_drop = [col for col in df.columns if 'Reason_' in col]
df = df.drop(columns=columns_to_drop, errors='ignore')

# One-hot encode the 'ReasonForLeaving' column
encoded_df = pd.get_dummies(df['ReasonForLeaving'], prefix='Reason')

# Concatenate the encoded dataframe with the original dataframe
df = pd.concat([df, encoded_df], axis=1)

# Convert 'DateLeft' to datetime format if it's not already
df['DateLeft'] = pd.to_datetime(df['DateLeft'])

def left_during_peak(date_left):
    """Return 1 when ``date_left`` falls in July or August, otherwise 0.

    ``date_left`` only needs to expose a ``month`` attribute.
    """
    peak_months = (7, 8)  # July, August
    return 1 if date_left.month in peak_months else 0

# Apply the function only to rows where DateLeft is not NaN
mask = df['DateLeft'].notna()
df.loc[mask, 'LeftDuringPeak'] = df.loc[mask, 'DateLeft'].apply(left_during_peak)

# Fill NaN values in the new column with 0
df['LeftDuringPeak'] = df['LeftDuringPeak'].fillna(0).astype(int)



In [None]:
#df.drop('ReasonForLeaving', axis=1, inplace=True)


In [None]:
print(df.columns)


In [None]:
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
print(numeric_columns)


In [None]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
print(categorical_columns)


In [None]:
df.drop('WorkerID', axis=1, inplace=True)

1. Data Preprocessing:


Handle missing values.
Normalize or standardize the data.
One-hot encode categorical variables.

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Assuming df is your dataframe

# Define numeric and categorical features
numeric_features = ['Age', 'YearsWorked', 'LeaveMonth', 'LeaveYear']  # Add other numeric columns here
categorical_features = ['Occupation', 'DateJoined', 'ReasonForLeaving', 'AgeGroup',
       'YearsWorkedCategory']  # Add other categorical columns here

# Create transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine transformers into a preprocessor step
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Preprocessing data
X = preprocessor.fit_transform(df) 


In [None]:
print(df.head())

Choose a Clustering Algorithm:
We start with K-Means.

In [None]:
from sklearn.cluster import KMeans

# Let's assume 3 clusters for now
kmeans = KMeans(n_clusters=3, random_state=0)

# Always cluster on the preprocessed matrix X. Checking for 'X_pca' in
# locals() made the result depend on whether a later cell had already been
# run, so a fresh top-to-bottom run and a re-run produced different clusters.
clusters = kmeans.fit_predict(X)


In [None]:
from sklearn.decomposition import TruncatedSVD

# Use TruncatedSVD instead of PCA
svd = TruncatedSVD(n_components=2)  # for visualization purposes
X_svd = svd.fit_transform(X)


In [None]:
X_dense = X.toarray()


Determine the Number of Clusters:
Use the elbow method to find an optimal number of clusters:

from sklearn.decomposition import PCA

# The preprocessor's one-hot block can make X a scipy sparse matrix, which
# PCA in older scikit-learn releases refuses. Densify only when needed.
X_for_pca = X.toarray() if hasattr(X, 'toarray') else X

pca = PCA(n_components=2)  # for visualization purposes
X_pca = pca.fit_transform(X_for_pca)


In [None]:
# Compute clustering and compute inertia
inertias = []
ks = range(1, 10)

for k in ks:
    # Fixed seed (same value as the 3-cluster KMeans above) so the elbow
    # curve is identical from one run to the next.
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(X_pca if 'X_pca' in locals() else X)
    inertias.append(model.inertia_)

# Plot ks vs inertias
import matplotlib.pyplot as plt

plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()


In [None]:
plt.scatter(X_svd[:, 0], X_svd[:, 1], c=clusters, cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar()
plt.show()


# One-hot encode the 'Occupation' column
occupation_encoded_df = pd.get_dummies(df['Occupation'], prefix='Occupation')

# Concatenate the encoded dataframe with the original dataframe
df = pd.concat([df, occupation_encoded_df], axis=1)



Safety Concerns Binary Feature: Given the prominence of safety concerns as a reason for leaving, especially among engineers, introduce a binary feature that indicates whether an employee left due to safety concerns. Take note of every change, including column headers.

# Binary flag: 1 where the recorded reason is exactly 'Safety Concerns',
# 0 for every other value (including missing reasons).
is_safety_reason = df['ReasonForLeaving'] == 'Safety Concerns'
df['LeftDueToSafetyConcerns'] = is_safety_reason.astype('int64')


Interaction between occupation and reason for leaving:

# Every non-missing leaving reason that occurs in the data
valid_reasons = df['ReasonForLeaving'].dropna().unique()

# One interaction column per (occupation, reason) pair: the product of the
# occupation dummy and the reason dummy.
for occupation in ('Engineer', 'Miner', 'Geologist', 'Surveyor', 'Mechanic'):
    occupation_flag = df[f'Occupation_{occupation}']
    for reason in valid_reasons:
        interaction_col_name = f'Occupation_{occupation}_Reason_{reason}'
        df[interaction_col_name] = occupation_flag * df[f'Reason_{reason}']



Interaction between age and reason for leaving

for reason in valid_reasons:
    interaction_col_name = f'Age_Reason_{reason}'
    df[interaction_col_name] = df['Age'] * df[f'Reason_{reason}']


Normalize Age: Given that age is a continuous variable and might have a wide range, normalize this feature to ensure that it's on the same scale as other features, making it more digestible for machine learning algorithms.

# Normalize the Age column
df['Age_normalized'] = (df['Age'] - df['Age'].min()) / (df['Age'].max() - df['Age'].min())


Date Features: Extract specific date-related features from 'DateJoined' such as:

Month of joining.
Year of joining.
Day of the week of joining.

# Ensure that 'DateJoined' is a datetime column
df['DateJoined'] = pd.to_datetime(df['DateJoined'])

# Extract month of joining
df['MonthJoined'] = df['DateJoined'].dt.month

# Extract year of joining
df['YearJoined'] = df['DateJoined'].dt.year

# Extract day of the week of joining
# The day of the week with Monday=0, Sunday=6
df['DayOfWeekJoined'] = df['DateJoined'].dt.dayofweek

# If you prefer day names instead of numbers for the day of the week
df['DayNameJoined'] = df['DateJoined'].dt.day_name()


Drop Irrelevant Features: Features like 'WorkerID' may not have any predictive power, so it's generally a good idea to drop such features.

# WorkerID has already been dropped earlier in the notebook; errors='ignore'
# makes this a no-op in that case instead of raising KeyError.
df.drop('WorkerID', axis=1, inplace=True, errors='ignore')


Current vs. Former Employees Ratio: Create a feature to capture the ratio of current to former employees within each occupation. This can help identify if specific job roles have high attrition rates.

# Group by 'Occupation' and 'IsCurrentEmployee' and get counts
grouped_counts = df.groupby(['Occupation', 'IsCurrentEmployee']).size().unstack(fill_value=0)

# Calculate the ratio of current to former employees for each occupation
grouped_counts['CurrentToFormerRatio'] = grouped_counts[True] / (grouped_counts[False] + 1)  # +1 to avoid division by zero

# Create a mapping from 'Occupation' to 'CurrentToFormerRatio'
occupation_to_ratio = grouped_counts['CurrentToFormerRatio'].to_dict()

# Map the ratio back to the main dataframe
df['CurrentToFormerRatio'] = df['Occupation'].map(occupation_to_ratio)


modeling process

# One-hot encode the 'Occupation' column and drop the original.
# Occupation_* dummies were already concatenated onto df earlier, so only add
# the dummy columns that are not present yet; encoding unconditionally would
# leave df with duplicated column names.
occupation_dummies = pd.get_dummies(df['Occupation'], prefix='Occupation')
new_dummies = occupation_dummies.loc[:, ~occupation_dummies.columns.isin(df.columns)]
df = pd.concat([df.drop(columns=['Occupation']), new_dummies], axis=1)


df.drop(['DateJoined', 'DateLeft'], axis=1, inplace=True)



print(df.head())


Step 1: Data Preparation
Before feeding data into a machine learning algorithm, ensure the data is prepared:

Split the dataset into training and testing sets.
Ensure all categorical data is encoded (either one-hot or label encoding).
Normalize or standardize numerical features, if necessary.
Handle any missing values.

if 'ReasonForLeaving' in df.columns:
    df.drop('ReasonForLeaving', axis=1, inplace=True)


print(df.dtypes)


df = pd.get_dummies(df, columns=['AgeGroup'], drop_first=True)


df = pd.get_dummies(df, columns=['YearsWorkedCategory'], drop_first=True)


# Build one 0/1 flag column per weekday from 'DayOfWeekJoined'.
# That column holds integer codes (Monday=0 ... Sunday=6), so each flag must
# be compared against the day's position in the list, not against its name
# (comparing integers with name strings yields all zeros).
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

for day_code, day in enumerate(days):
    df[f'Day_{day}'] = (df['DayOfWeekJoined'] == day_code).astype(int)



df.info()



df.drop('DayOfWeekJoined', axis=1, inplace=True)


df = df.drop(columns=['LeaveYear', 'LeaveMonth'])


print(df.head())



if 'DayNameJoined' in df.columns:
    df.drop('DayNameJoined', axis=1, inplace=True)


from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
# NOTE: X_train / X_test do not exist yet at this point of a top-to-bottom
# run; they are first created by train_test_split further down. Fitting the
# imputer here raised NameError on a fresh kernel. The same KNN imputation is
# applied after the split, just before the model is fitted.



# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming `df` is your dataframe


# Split the dataset into training and testing sets
X = df.drop(columns=['IsCurrentEmployee'])  # assuming 'IsCurrentEmployee' is the target variable
y = df['IsCurrentEmployee']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize/Normalize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Step 2: Model Selection & Training
Start with a simpler model and gradually explore more complex ones if needed:

Initialize the model.
Train the model using the training dataset.
Use cross-validation to get a more reliable measure of the model's performance

1. Splitting the Data:

from sklearn.model_selection import train_test_split

X = df.drop('IsCurrentEmployee', axis=1)  # Assuming 'IsCurrentEmployee' is your target variable
y = df['IsCurrentEmployee']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


2. Logistic Regression:

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000, random_state=42)


# DataFrame.mode() returns a DataFrame; passing it to fillna() aligns on the
# row index and fills almost nothing. Take its first row to get one fill
# value per column.
# The fill values are computed once from the training data and reused for the
# test set, so both splits are imputed with the same statistic.
fill_values = X_train.mode().iloc[0]
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)



X_train = X_train.dropna()
y_train = y_train[X_train.index]


from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)


from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000, random_state=42)


model.fit(X_train, y_train)


from sklearn.metrics import classification_report, accuracy_score

y_pred = model.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


from sklearn.impute import SimpleImputer

# NOTE(review): 'numerical_column' / 'categorical_column' look like template
# placeholders rather than real columns of df; confirm and replace them.

# Mean imputation ('mode' is not a valid SimpleImputer strategy; the valid
# ones are 'mean', 'median', 'most_frequent' and 'constant')
imputer = SimpleImputer(strategy='mean')
# fit_transform returns a 2-D (n, 1) array; flatten it before assigning to a
# single column.
df['numerical_column'] = imputer.fit_transform(df[['numerical_column']]).ravel()

# Mode imputation for categorical columns
imputer = SimpleImputer(strategy='most_frequent')
df['categorical_column'] = imputer.fit_transform(df[['categorical_column']]).ravel()



df.ffill(inplace=True)


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Initialize the model
estimator = LogisticRegression()

# Now, use the model in the cross_val_score function
scores = cross_val_score(estimator, X, y, cv=5, error_score='raise')


from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="most_frequent")  # or "median", "most_frequent" for mode
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)



from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=10, random_state=42)
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Assuming df is your dataframe
# Builds a ColumnTransformer (median-impute + scale numeric columns,
# constant-impute + one-hot categorical columns) and applies it to df.
#
# NOTE(review): earlier cells replaced 'Occupation', 'AgeGroup' and
# 'YearsWorkedCategory' with dummy columns via pd.get_dummies and dropped
# 'DayNameJoined', so the categorical list below appears not to match df any
# more in a top-to-bottom run. TODO confirm and rebuild this step from a frame
# that still has the raw columns.

# Define numeric and categorical features
numeric_features = ['Age', 'YearsWorked']  # Add other numeric columns here
categorical_features = ['Occupation', 'DayNameJoined', 'AgeGroup', 'YearsWorkedCategory']  # Add other categorical columns here

# Create transformers
# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine transformers into a preprocessor step
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Preprocessing data
# This overwrites the X defined by the earlier train/test split cells.
X = preprocessor.fit_transform(df)



from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Average Cross-Validation Accuracy: {scores.mean():.2f}")


from sklearn.dummy import DummyClassifier
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
dummy_score = dummy.score(X_test, y_test)
print(dummy_score)
