In [1]:
# Standard library
import os

# Libraries for data manipulation and visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as stats
import seaborn as sns

In [2]:
# Load the dataset
# The CSV location defaults to the original author's local path. Set the
# MBA_CSV_PATH environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    "MBA_CSV_PATH",
    "C:\\Users\\Vivek\\OneDrive\\Desktop\\archive (3)\\MBA.csv",
)
data = pd.read_csv(DATA_PATH)


In [None]:
# Display basic data information
# Preview the top of the table (first five rows)
data.head(n=5)

In [None]:
 # Check the dimensions of the dataset
# The result is a (number_of_rows, number_of_columns) tuple.
data.shape

In [None]:
 # Check data types of all columns
# The result lists one dtype per column, indexed by column name.
data.dtypes

In [None]:
# Get summary statistics for numerical columns
# (count, mean, std, min, quartiles and max for each numeric column)
data.describe()

In [None]:
# Preview the bottom of the table (last five rows)
data.tail(n=5)

In [None]:
 # Get a concise summary of the dataset
# Prints the index range, each column's non-null count and dtype, and memory usage.
data.info()

In [None]:
 # Number of missing entries per column
data.isna().sum()

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows
data.isna().sum().mul(100).div(len(data))


In [None]:
# Split the column labels by dtype:
#   numeric_cols -> float64 / int64 columns
#   cat_cols     -> object columns
numeric_cols = data.select_dtypes(include=["float64", "int64"]).columns
cat_cols = data.select_dtypes(include=["object"]).columns
print(numeric_cols, cat_cols, sep="\n")

In [None]:
# Fill missing values in categorical columns with the mode
# NOTE(review): every column in cat_cols is imputed, which may include
# label/outcome columns. This presumably assumes a missing entry means
# "unknown" rather than a category of its own - TODO confirm against the
# dataset documentation.
for col in cat_cols:
    modes = data[col].mode()
    if modes.empty:
        # The column has no non-null values, so there is nothing to impute with.
        continue
    # Assign the result back to the frame. A chained
    # `data[col].fillna(..., inplace=True)` operates on a temporary object:
    # pandas 2.2 warns about it and under Copy-on-Write it silently does
    # nothing to `data`.
    data[col] = data[col].fillna(modes.iloc[0])

In [None]:
# Confirm the imputation: missing entries remaining in each column
data.isna().sum()


In [None]:
 # Visualization 1: Count plot for "gender"
# One bar per gender value; bar height is the number of rows with that value
ax = sns.countplot(data=data, x="gender")
ax.set_title("Gender Distribution")
plt.show()

In [None]:
# Visualization 2: Count plot for "international"
# One bar per value of the "international" column; bar height is the row count
ax = sns.countplot(data=data, x="international")
ax.set_title("International Students Distribution")
plt.show()

In [None]:
# Visualization 3: Histogram for "race" (using Plotly)
# Interactive frequency chart of the values in the "race" column
fig = px.histogram(
    data_frame=data,
    x="race",
    title="Race Distribution in Dataset",
)
fig.show()

In [None]:
# Visualization 4: Pie chart for "major" distribution
# Each slice is the share of rows belonging to one major
fig = px.pie(
    data_frame=data,
    names='major',
    title='Major Distribution in Dataset',
)
fig.show()

In [None]:
# Visualization 5: Pie chart for "work_industry" distribution
# Each slice is the share of rows belonging to one work industry
fig = px.pie(
    data_frame=data,
    names="work_industry",
    title="Work Industry Distribution in Dataset",
)
fig.show()

In [None]:
# Visualization 6: Histogram for "gpa"
# Binned counts of the values in the "gpa" column
ax = sns.histplot(data=data, x="gpa")
ax.set_title("GPA Distribution")
plt.show()

In [None]:
# Visualization 7: Histogram for "work_exp"
# Binned counts of the values in the "work_exp" column
ax = sns.histplot(data=data, x="work_exp")
ax.set_title("Work Experience Distribution")
plt.show()

In [None]:
# Visualization 8: Bar plot for "work_industry" vs "work_exp"
# Shows the mean work experience for each industry, with seaborn's default
# error bars. work_industry is treated as a category elsewhere in this
# notebook (pie chart), so separate bars are used instead of a connected line:
# a line between unordered categories suggests a trend that does not exist.
ax = sns.barplot(data=data, x="work_industry", y="work_exp")
ax.set_title("Work Experience vs Work Industry")
ax.set_ylabel("Mean work_exp")
# Right-align the rotated labels so each one ends under its own bar.
plt.xticks(rotation=45, ha="right")
plt.show()


In [None]:
# Visualization 9: Scatter plot for "gpa" vs "admission"
# One marker per row: GPA on the x-axis, admission value on the y-axis
fig = px.scatter(
    data_frame=data,
    x="gpa",
    y="admission",
    title="GPA vs Admission",
)
fig.show()

In [None]:
# Visualization 10: Scatter plot for "gpa" vs "gmat"
# One marker per row: GPA on the x-axis, GMAT score on the y-axis
fig = px.scatter(
    data_frame=data,
    x="gpa",
    y="gmat",
    title="GPA vs GMAT Score",
)
fig.show()

In [None]:
# Visualization 11: Bar plot for all categorical columns
# Draws one 10x5 figure per column in cat_cols, each showing value frequencies
for col in cat_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=data, x=col, ax=ax)
    ax.set_title(f'Bar Plot of {col}')
    # Vertical tick labels keep long category names from overlapping.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Visualization 12: KDE plot for "gpa"
# Shows the density estimation for GPA scores
# `fill=True` shades the area under the curve. It replaces the `shade=True`
# keyword, which seaborn has deprecated and scheduled for removal.
ax = sns.kdeplot(data["gpa"], fill=True)
ax.set_title('KDE Plot of GPA')
plt.show()

In [None]:
# Visualization 13: QQ Plot for "gpa"
# Assesses normality of GPA scores by comparing to a normal distribution
# Missing values are dropped first. Numeric columns are not imputed anywhere
# in this notebook, and a NaN in the sample would propagate into the
# least-squares fit that probplot draws.
stats.probplot(data["gpa"].dropna(), dist="norm", plot=plt)
plt.title('QQ Plot for GPA')
plt.show()