# In [None]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Step 2: Load the Dataset
# The path is resolved relative to the current working directory; adjust it
# if the CSV export lives elsewhere.
data = pd.read_csv('data.xlsx - Sheet1.csv')

# Step 3: Data Cleaning
# Parse the date columns with an explicit month/day/year format. Values that
# do not match the format become NaT instead of raising (errors='coerce').
for date_col in ('DOJ', 'DOL', 'DOB'):
    data[date_col] = pd.to_datetime(
        data[date_col], format='%m/%d/%Y', errors='coerce'
    )

# Report the number of missing values per column. This runs after the date
# parsing, so dates coerced to NaT are included in the counts.
print(data.isnull().sum())

# Drop rows with a missing salary. The explicit copy makes `data` an
# independent frame, so the column assignment below cannot be mistaken for a
# write into a slice of the original (SettingWithCopyWarning).
data = data.dropna(subset=['Salary']).copy()

# Fill missing specializations with a placeholder category
data['Specialization'] = data['Specialization'].fillna('Unknown')

# Drop the 'Unnamed: 0' column, which is not useful for analysis (presumably
# a leftover index column from the export -- TODO confirm). errors='ignore'
# keeps the script running when the column is not present.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

# Display the cleaned dataset info. DataFrame.info() prints its summary
# itself and returns None, so it is called directly; wrapping it in print()
# would emit an extra "None" line.
data.info()
# Step 4: Univariate Analysis
# Histogram of Salary (30 bins) to show the overall distribution
plt.figure(figsize=(10, 6))
plt.hist(data['Salary'], bins=30, alpha=0.7, color='blue')
plt.gca().set(
    title='Salary Distribution',
    xlabel='Salary',
    ylabel='Frequency',
)
plt.show()

# Boxplot of Salary for spotting outliers
plt.figure(figsize=(10, 6))
plt.boxplot(data['Salary'])
plt.gca().set(title='Boxplot of Salary', ylabel='Salary')
# The single box is drawn at x position 1; give that tick a readable label
plt.xticks([1], ['Salary'])
plt.show()
# Histograms for the other numerical variables.
# `numerical_columns` is also reused further down for the correlation heatmap.
numerical_columns = [
    '10percentage',
    '12percentage',
    'collegeGPA',
    'English',
    'Logical',
    'Quant',
]
for col in numerical_columns:
    # One separate figure per column
    plt.figure(figsize=(10, 6))
    plt.hist(data[col], bins=30, alpha=0.7, color='orange')
    plt.gca().set(
        title=f'Distribution of {col}',
        xlabel=col,
        ylabel='Frequency',
    )
    plt.show()

# Step 5: Bivariate Analysis
# Boxplot of Salary vs. Gender using matplotlib
# Gender is matched against the literal codes 'm' and 'f'; rows holding any
# other value end up in neither group.
plt.figure(figsize=(10, 6))
male_salaries = data.loc[data['Gender'] == 'm', 'Salary']
female_salaries = data.loc[data['Gender'] == 'f', 'Salary']
plt.boxplot([male_salaries, female_salaries])
# Label the two boxes (drawn at x positions 1 and 2) through xticks rather
# than boxplot's `labels=` keyword, which newer matplotlib releases deprecate
# in favour of `tick_labels`; xticks works on old and new versions alike.
plt.xticks([1, 2], ['Male', 'Female'])
plt.title('Salary vs. Gender')
plt.ylabel('Salary')
plt.show()
# Boxplot of Salary vs. Specialization using matplotlib
plt.figure(figsize=(12, 6))
# One list of salaries per specialization, indexed by specialization name
specialization_groups = data.groupby('Specialization')['Salary'].apply(list)
# Hand matplotlib a plain list of lists so each group is drawn as its own box
plt.boxplot(specialization_groups.tolist())
plt.title('Salary vs. Specialization')
plt.ylabel('Salary')
# Boxes sit at x positions 1..N in group order. Label each one with its
# specialization name (otherwise the axis only shows the numeric positions)
# and rotate the labels for readability.
plt.xticks(
    range(1, len(specialization_groups) + 1),
    list(specialization_groups.index),
    rotation=90,
)
plt.show()
# Boxplot of Salary vs. College Tier using matplotlib
plt.figure(figsize=(10, 6))
# One list of salaries per CollegeTier value, indexed by that value
college_tier_groups = data.groupby('CollegeTier')['Salary'].apply(list)
plt.boxplot(college_tier_groups.tolist())
plt.title('Salary vs. College Tier')
plt.ylabel('Salary')
# Build the tick labels from the tiers actually present in the data, so the
# number of labels always matches the number of boxes (drawn at 1..N in
# group order) instead of assuming exactly three tiers.
tier_labels = [f'Tier {tier}' for tier in college_tier_groups.index]
plt.xticks(range(1, len(tier_labels) + 1), tier_labels)
plt.show()
# Correlation heatmap for numerical variables
# Pairwise correlations between the numerical columns listed in
# `numerical_columns` plus Salary (DataFrame.corr() defaults to Pearson).
correlation_matrix = data[numerical_columns + ['Salary']].corr()
plt.figure(figsize=(10, 8))
# Pin the colour scale to [-1, 1], the full range a correlation can take, so
# the neutral midpoint of the diverging 'coolwarm' map corresponds to zero
# correlation instead of to the midpoint of whatever values occur in the data.
plt.imshow(
    correlation_matrix,
    cmap='coolwarm',
    interpolation='none',
    vmin=-1,
    vmax=1,
)
plt.colorbar()
# imshow places cell i at coordinate i, so ticks 0..N-1 line up with the
# matrix rows/columns; label them with the column names.
plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=45)
plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)
plt.title('Correlation Heatmap')
plt.show()
# Step 6: Research Questions
# Test the claim from the Times of India article about the salary range:
# keep the rows whose Salary lies between 250000 and 300000 (both bounds
# inclusive) and count how many of them fall into each Specialization.
salary_claim = data[data['Salary'].between(250000, 300000)]
specializations_in_range = salary_claim['Specialization'].value_counts()
print(specializations_in_range)
