In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the jobs-in-data CSV from the Kaggle input directory into a DataFrame.
data=pd.read_csv("/kaggle/input/jobs-in-data/jobs_in_data.csv")
# Bare expression: rendered as the cell output when run in a notebook.
data

In [None]:
# Keep only the rows whose employee_residence is Germany.
ac = data.loc[data['employee_residence'] == 'Germany']
ac

In [None]:
# Keep only the rows where both the employee's residence and the company's
# location are Germany.
ab = data.loc[
    data['employee_residence'].eq("Germany")
    & data['company_location'].eq("Germany")
]
ab

In [None]:
# Preview the first 10 rows.
data.head(10)

In [None]:
# List the column labels of the loaded frame.
data.columns

In [None]:
# Print per-column dtypes, non-null counts and the frame's memory footprint.
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
data.describe()

In [None]:
# Check for duplicates: duplicated() marks every row that exactly repeats an
# earlier row, so the sum is the number of rows drop_duplicates() would remove.
duplicates = data.duplicated().sum()
print("\nNumber of duplicate rows:", duplicates)

In [None]:
# Drop duplicates if any. The result is a new frame; `data` itself is unchanged.
# NOTE(review): of the cells visible here, only the unique-value check reads
# `data_cleaned`; the plots and averages further down still use `data`.
# Confirm which frame the analysis is meant to run on.
data_cleaned = data.drop_duplicates()

In [None]:
# Sanity-check the categorical columns by listing the distinct values each one
# holds in the de-duplicated frame; unexpected spellings would show up here.
categorical_columns = (
    'job_category',
    'experience_level',
    'employment_type',
    'work_setting',
    'company_location',
    'company_size',
)

print()  # blank line ahead of the listing
for column in categorical_columns:
    print(f"Unique values in '{column}' column:", data_cleaned[column].unique())

In [None]:
import pandas as pd

# Work on a copy so the original `data` frame keeps its dtypes.
data_copy = data.copy()

# Columns holding whole numbers: shrink to the smallest integer dtype that fits.
integer_columns = ('work_year', 'salary', 'salary_in_usd')
# Columns holding repeated labels: store them as pandas categoricals.
label_columns = (
    'job_title',
    'job_category',
    'salary_currency',
    'employee_residence',
    'experience_level',
    'employment_type',
    'work_setting',
    'company_location',
    'company_size',
)

for column in integer_columns:
    data_copy[column] = pd.to_numeric(data_copy[column], downcast='integer')

for column in label_columns:
    data_copy[column] = data_copy[column].astype('category')

# Report the deep memory footprint (object contents included) of both frames.
bytes_per_mb = 1024 * 1024

print("Memory usage before optimization:")
print(data.memory_usage(deep=True).sum() / bytes_per_mb, "MB")

print("\nMemory usage after optimization:")
print(data_copy.memory_usage(deep=True).sum() / bytes_per_mb, "MB")



In [None]:
# Print dtypes, non-null counts and memory usage of `data`.
# NOTE(review): this inspects the original frame, not the optimised
# `data_copy` built in the previous cell -- confirm which one was intended.
data.info()

In [None]:
# Count the null entries in every column of the loaded frame.
missing_values = data.isna().sum()
print("\nMissing Values:", missing_values, sep="\n")
# Author's observation from the run: no missing values were reported.

In [None]:
# Visualize the salary distribution.
# The frame carries `salary` alongside `salary_currency`, so raw `salary`
# values are presumably in different currencies and not comparable on one
# axis. Plot the USD-normalised `salary_in_usd` column instead.
plt.figure(figsize=(12, 6))
sns.histplot(data['salary_in_usd'], bins=20, kde=True)
plt.title('Salary Distribution')
plt.xlabel('Salary (USD)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Visualize job categories as horizontal bars, most frequent category first.
# The counts are computed once so that the bar order and the data labels are
# guaranteed to come from the same Series.
category_counts = data['job_category'].value_counts()

plt.figure(figsize=(12, 6))
sns.countplot(y='job_category', data=data, order=category_counts.index)
# Add data labels: bar `index` ends at x == `value`, so the text is placed at
# the tip of each bar and centred vertically on it.
for index, value in enumerate(category_counts):
    plt.text(value, index, str(value), va='center')
plt.title('Job Categories')
plt.xlabel('Count')
plt.ylabel('Job Category')
# No plt.legend() here: nothing in this plot carries a label, so the call only
# produced a "no artists with labels" warning and an empty legend box.
plt.show()


In [None]:
# Average Salary by Job Category, highest first.
# Averages are taken over `salary_in_usd`: the frame also has a
# `salary_currency` column, so raw `salary` values are presumably in mixed
# currencies and their mean would not be meaningful.
avg_salary_by_category = (
    data.groupby('job_category')['salary_in_usd']
    .mean()
    .sort_values(ascending=False)
)
print("\nAverage Salary by Job Category:")
print(avg_salary_by_category)

In [None]:
# Distribution of experience levels, most frequent level first.
# The counts are computed once and reused for both the bar order and the labels.
experience_counts = data['experience_level'].value_counts()

plt.figure(figsize=(8, 6))  # Adjusted figure size to 8x6
sns.countplot(x='experience_level', data=data, order=experience_counts.index)

# Add data labels: centred above bar `index`, whose height is `value`.
for index, value in enumerate(experience_counts):
    plt.text(index, value, str(value), ha='center', va='bottom')

plt.title('Distribution of Experience Levels')
plt.xlabel('Experience Level')
plt.ylabel('Count')
# No plt.legend() here: nothing in this plot carries a label, so the call only
# produced a "no artists with labels" warning and an empty legend box.
plt.show()

In [None]:
# Employment Types Distribution, most frequent type first.
# The counts are computed once and reused for both the bar order and the labels.
employment_counts = data['employment_type'].value_counts()

plt.figure(figsize=(10, 6))
sns.countplot(x='employment_type', data=data, order=employment_counts.index)
# Add data labels. These must come from the employment_type counts: the
# previous version iterated over the experience_level counts, which put
# unrelated numbers at the wrong heights above the bars.
for index, value in enumerate(employment_counts):
    plt.text(index, value, str(value), ha='center', va='bottom')
plt.title('Employment Types Distribution')
plt.xlabel('Employment Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Company Sizes Distribution: one bar per company size, most frequent first,
# each annotated with its row count.
plt.figure(figsize=(8, 6))  # 8x6 canvas
size_axes = sns.countplot(
    x='company_size',
    data=data,
    order=data['company_size'].value_counts().index,
)

# Annotate every bar: text is centred on the bar and sits just above its top.
for index, value in enumerate(data['company_size'].value_counts()):
    size_axes.text(index, value, str(value), ha='center', va='bottom')

size_axes.set_title('Company Sizes Distribution')
size_axes.set_xlabel('Company Size')
size_axes.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()

Summary of the insights derived from the analysis of the job data:

1. **Data Quality Check and Cleaning:**
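   - The dataset was loaded and examined for basic information, including data types, missing values, and duplicates. No missing values were found. Duplicate rows were counted and dropped into a separate `data_cleaned` DataFrame; note that the plots and averages in this notebook were computed on the original `data`, not on `data_cleaned`.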

2. **Memory Optimization:**
   - Data types were optimized to reduce memory usage, improving the efficiency of data storage and processing.

3. **Salary Distribution:**
   - The distribution of salaries was visualized using a histogram, providing insights into the salary ranges across different job positions.

4. **Job Categories Analysis:**
   - The count of job categories was visualized, revealing the popularity of different job roles within the dataset.

5. **Average Salary by Job Category:**
   - The average salary for each job category was calculated, highlighting differences in salary levels across various job roles.

6. **Experience Level Distribution:**
   - The distribution of experience levels among employees was visualized, showing how the workforce in the dataset is spread across experience levels.

7. **Employment Types and Company Sizes:**
   - The distribution of employment types and company sizes was explored, providing insights into the diversity of employment structures and organizational sizes.

Overall, the analysis provided valuable insights into salary distributions, job category preferences, and employment trends within the dataset. These insights can be utilized for workforce planning, salary benchmarking, and recruitment strategies, ultimately contributing to better decision-making processes in human resources and talent management.