In [1]:
# Import necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load dataset
# The CSV lives at a different location on every machine, so the path can be
# overridden with the EMPLOYEE_PRODUCTIVITY_CSV environment variable. When the
# variable is not set, the original default path below is used.
DATA_PATH = os.environ.get(
    "EMPLOYEE_PRODUCTIVITY_CSV",
    r"C:\Users\soura\Employee Productivity\data\employee_productivity.csv",
)

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Same exception type as before, but with a message that says how to fix it.
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH!r}. Set the EMPLOYEE_PRODUCTIVITY_CSV "
        "environment variable or edit DATA_PATH so it points at "
        "employee_productivity.csv."
    ) from err

### 1️⃣ Basic Info About Data
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

# Check for missing values
# sep="" stops print() from inserting a space after the heading's trailing
# newline, which shifted the first row of the table out of alignment.
print("\nMissing Values:\n", df.isnull().sum(), sep="")

# Summary statistics
print("\nStatistical Summary:\n", df.describe(), sep="")

# Unique values in categorical columns
print("\nUnique Work Modes:", df["work_mode"].unique())

# Convert categorical 'work_mode' and 'productivity_level' to numerical codes
work_mode_mapping = {'Remote': 1, 'In-Office': 0, 'Hybrid': 2}
productivity_mapping = {'Low': 0, 'Medium': 1, 'High': 2}

for column, mapping in (
    ("work_mode", work_mode_mapping),
    ("productivity_level", productivity_mapping),
):
    encoded = df[column].map(mapping)
    # Series.map() yields NaN for every label that is missing from the mapping
    # (for example a different spelling or stray whitespace). Report those
    # labels, because the dropna() below would otherwise delete their rows
    # without leaving any trace.
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        print(f"Warning: unrecognised {column} labels will be dropped: {list(unmapped)}")
    df[column] = encoded

# Drop missing values if any, and say how many rows that removed
rows_before = len(df)
df.dropna(inplace=True)
rows_dropped = rows_before - len(df)
if rows_dropped:
    print(f"Dropped {rows_dropped} of {rows_before} rows containing missing values.")

### 2️⃣ Data Distribution Visualization
# Distribution of Productivity Levels
level_names = {0: "Low", 1: "Medium", 2: "High"}
# The plot gets one bar per level that actually occurs, placed at positions
# 0..k-1 in ascending order of the numeric code. Deriving the tick labels from
# the levels present keeps every bar correctly named even when a level is
# missing from the cleaned data (fixed labels would shift onto the wrong bars).
levels_present = sorted(df["productivity_level"].dropna().unique())

plt.figure(figsize=(6,4))
sns.countplot(x=df["productivity_level"], palette="viridis")
plt.xticks(
    ticks=list(range(len(levels_present))),
    labels=[level_names[int(code)] for code in levels_present],
)
plt.title("Distribution of Productivity Levels")
plt.xlabel("Productivity Level")
plt.ylabel("Count")
plt.show()

# Histogram for Work Hours
# Drawn through an explicit Axes object: 20 bins with a KDE curve overlaid.
hist_fig, hist_ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["hours_worked"], bins=20, kde=True, color='blue', ax=hist_ax)
hist_ax.set_title("Distribution of Work Hours")
hist_ax.set_xlabel("Hours Worked")
hist_ax.set_ylabel("Frequency")
plt.show()

# Boxplot for Tasks Completed
# Single vertical box summarising the spread of the tasks_completed column.
tasks_fig, tasks_ax = plt.subplots(figsize=(6, 4))
sns.boxplot(y=df["tasks_completed"], color="orange", ax=tasks_ax)
tasks_ax.set_title("Distribution of Tasks Completed")
tasks_ax.set_ylabel("Tasks Completed")
plt.show()

### 3️⃣ Correlation Analysis
# Correlate only the numeric (and boolean) columns. Since pandas 2.0,
# DataFrame.corr() raises on any remaining text column instead of silently
# skipping it, so the frame is restricted explicitly before correlating.
numeric_df = df.select_dtypes(include=["number", "bool"])

plt.figure(figsize=(8,5))
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.show()

### 4️⃣ Outlier Detection
# One boxplot per numerical feature, side by side, to make outliers visible.
# Each entry is (column name, panel title, box colour).
outlier_panels = [
    ("hours_worked", "Hours Worked", "skyblue"),
    ("tasks_completed", "Tasks Completed", "green"),
    ("experience", "Experience (Years)", "red"),
]

fig, axes = plt.subplots(1, 3, figsize=(15,5))

for panel_ax, (column, panel_title, box_colour) in zip(axes, outlier_panels):
    sns.boxplot(y=df[column], ax=panel_ax, color=box_colour)
    panel_ax.set_title(panel_title)

plt.show()

### 5️⃣ Work Mode vs Productivity Analysis
# Numeric work_mode codes back to their names (inverse of the encoding step).
mode_names = {0: "In-Office", 1: "Remote", 2: "Hybrid"}
# The boxplots get one box per work mode that actually occurs, placed at
# positions 0..k-1 in ascending order of the code. Deriving the tick labels
# from the modes present keeps each box correctly named even when a mode is
# missing from the cleaned data (fixed labels would shift onto the wrong boxes).
modes_present = sorted(df["work_mode"].dropna().unique())
mode_ticks = list(range(len(modes_present)))
mode_labels = [mode_names[int(code)] for code in modes_present]

# Work Mode vs Productivity Level
plt.figure(figsize=(6,4))
sns.boxplot(x=df["work_mode"], y=df["productivity_level"], palette="Set2")
plt.xticks(ticks=mode_ticks, labels=mode_labels)
plt.title("Work Mode vs. Productivity Level")
plt.xlabel("Work Mode")
plt.ylabel("Productivity Level")
plt.show()

# Work Hours by Work Mode
plt.figure(figsize=(6,4))
sns.boxplot(x=df["work_mode"], y=df["hours_worked"], palette="coolwarm")
plt.xticks(ticks=mode_ticks, labels=mode_labels)
plt.title("Work Mode vs. Hours Worked")
plt.xlabel("Work Mode")
plt.ylabel("Hours Worked")
plt.show()

### 6️⃣ Pairplot for Feature Relationships
# Grid of pairwise feature plots, coloured by the encoded productivity level.
pair_grid = sns.pairplot(data=df, hue="productivity_level", palette="husl")
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\soura\\Employee Productivity\\data\\employee_productivity.csv'