In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the CSV file
df = pd.read_csv('Employee_Performance-1.csv')

# Check for missing values
print("## Missing Values")
missing_values = df.isnull().sum()
if missing_values.sum() == 0:
    print("No missing values found in the dataset.")
else:
    print(missing_values[missing_values > 0])

# Check for duplicates
print("\n## Duplicates")
duplicates = df.duplicated().sum()
if duplicates == 0:
    print("No duplicate rows found in the dataset.")
else:
    print(f"Found {duplicates} duplicate rows.")

# Check for outliers using IQR method
print("\n## Outliers")
numeric_columns = ['Experience', 'TrainingHours', 'PerformanceRating', 'Salary']

for column in numeric_columns:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    print(f"\nOutliers in {column}:")
    print(outliers[['EmployeeID', 'Department', column]])

# Recommendations
print("\n## Recommendations")
print("1. No missing values or duplicates found, so no action needed for these issues.")
print("2. Investigate the outliers in each numeric column, especially in PerformanceRating and Salary.")
print("3. Consider normalizing PerformanceRating across departments.")
print("4. Review salary structure, particularly for employees with 0 years of experience but higher salaries.")
print("5. Standardize TrainingHours if possible, rounding odd values to nearest standard value.")
print("6. Implement data validation rules for future data entry to ensure consistency.")
print("7. Consider analyzing each department separately due to department-specific patterns.")

FileNotFoundError: [Errno 2] No such file or directory: 'Employee_Performance-1.csv'

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file
df = pd.read_csv('Employee_Performance-1.csv')

# Summary statistics
print("## Summary Statistics")
print(df.describe())

# Correlation matrix
print("\n## Correlation Matrix")
correlation_matrix = df.corr()
print(correlation_matrix)

# Visualizations
plt.figure(figsize=(20, 15))

# Histogram of Performance Ratings
plt.subplot(2, 2, 1)
sns.histplot(df['PerformanceRating'], kde=True)
plt.title('Distribution of Performance Ratings')
plt.xlabel('Performance Rating')
plt.ylabel('Count')

# Box plot of Salary by Department
plt.subplot(2, 2, 2)
sns.boxplot(x='Department', y='Salary', data=df)
plt.title('Salary Distribution by Department')
plt.xlabel('Department')
plt.ylabel('Salary')
plt.xticks(rotation=45)

# Scatter plot of Experience vs Salary
plt.subplot(2, 2, 3)
sns.scatterplot(x='Experience', y='Salary', hue='Department', data=df)
plt.title('Experience vs Salary by Department')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')

# Bar plot of Average Performance Rating by Department
plt.subplot(2, 2, 4)
df.groupby('Department')['PerformanceRating'].mean().plot(kind='bar')
plt.title('Average Performance Rating by Department')
plt.xlabel('Department')
plt.ylabel('Average Performance Rating')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Additional analysis
print("\n## Additional Analysis")
print("Average Salary by Department:")
print(df.groupby('Department')['Salary'].mean())

print("\nAverage Performance Rating by Gender:")
print(df.groupby('Gender')['PerformanceRating'].mean())

print("\nCorrelation between Experience and Salary:")
print(df['Experience'].corr(df['Salary']))

## Summary Statistics
        EmployeeID   Experience  TrainingHours  PerformanceRating  \
count  1468.000000  1468.000000    1468.000000        1468.000000   
mean   1734.500000     2.838556      32.144414           3.561512   
std     423.919411     2.527657      10.106029           1.044987   
min    1001.000000     0.000000       5.000000           1.000000   
25%    1367.750000     1.000000      25.000000           2.840000   
50%    1734.500000     2.000000      31.000000           3.630000   
75%    2101.250000     4.000000      39.000000           4.330000   
max    2468.000000     9.000000      50.000000           5.500000   

             Salary  
count   1468.000000  
mean   16107.623297  
std    12158.438481  
min     6000.000000  
25%     7700.000000  
50%    10100.000000  
75%    20000.000000  
max    53100.000000  

## Correlation Matrix


ValueError: could not convert string to float: 'IT'