# Mini Project Python libraries - NumPy, Pandas, Data Visualization library

# Student Performance and Attendance Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# Suppress every warning for the rest of the session so cell output stays uncluttered.
# NOTE(review): this also hides library deprecation/FutureWarning messages — confirm that is intended.
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Data Preparation

In [None]:
# Read the "Marks" worksheet of the workbook into the Marks DataFrame.
Marks = pd.read_excel(
    "Student_performance_and_attendance_data.xlsx",
    sheet_name="Marks",
)

In [None]:
# Preview the first five rows of the marks sheet.
Marks.head(n=5)

In [None]:
# Read the "Attendance" worksheet of the same workbook into the Attendance DataFrame.
Attendance = pd.read_excel(
    "Student_performance_and_attendance_data.xlsx",
    sheet_name="Attendance",
)

In [None]:
# Preview the first five rows of the attendance sheet.
Attendance.head(n=5)

In [None]:
# Left-join the attendance sheet onto the marks sheet by student name,
# so every row of Marks is kept in the combined frame df.
df = Marks.merge(Attendance, how="left", on="Name")

In [None]:
# Display the (rows, columns) dimensions of the merged frame.
df.shape

In [None]:
# Display pandas' default descriptive statistics for df.
df.describe()

In [None]:
# Preview the first five rows of the merged frame.
df.head(n=5)

In [None]:
# Swap every literal space in the column labels for an underscore
# (regex=False: the space is matched as plain text, not as a pattern).
underscored_labels = df.columns.str.replace(" ", "_", regex=False)
df.columns = underscored_labels

In [None]:
# Normalise student names to title case (first letter of each word capitalised).
title_cased_names = df["Name"].str.title()
df["Name"] = title_cased_names

In [None]:
# Collect every column whose label mentions "Attendance" ...
Attendance_columns = []
for column_label in df.columns:
    if "Attendance" in column_label:
        Attendance_columns.append(column_label)

# ... and recode the presence flags in those columns: "Y" -> 1, "N" -> 0.
presence_codes = {"Y": 1, "N": 0}
df[Attendance_columns] = df[Attendance_columns].replace(presence_codes)
df.head()

In [None]:
#Changing column to numeric type

In [None]:
# Coerce Mini_Test_1 to a numeric dtype; entries that cannot be parsed become NaN.
mini_test_1_raw = df["Mini_Test_1"]
df["Mini_Test_1"] = pd.to_numeric(mini_test_1_raw, errors="coerce")

In [None]:
# Coerce Mini_Test_2 to a numeric dtype; entries that cannot be parsed become NaN.
mini_test_2_raw = df["Mini_Test_2"]
df["Mini_Test_2"] = pd.to_numeric(mini_test_2_raw, errors="coerce")

In [None]:
# Coerce Live_Test to a numeric dtype; entries that cannot be parsed become NaN.
live_test_raw = df["Live_Test"]
df["Live_Test"] = pd.to_numeric(live_test_raw, errors="coerce")

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
#filling nulls with 0

In [None]:
# Replace every missing value with 0, then list any rows that still hold a null
# (an empty result confirms nothing is left to fill).
df = df.fillna(0)
df.loc[df.isna().any(axis=1)]

In [None]:
# Columns that hold marks (two mini tests, the live test and the assignment).
marks_columns = [
    "Mini_Test_1",
    "Mini_Test_2",
    "Live_Test",
    "Assignment",
]

In [None]:
# One box plot per marks column, to spot outliers or invalid values visually.
for col in marks_columns:
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")
    plt.show()

In [None]:
# Lowest valid score for every marks column.
Min_marks = 0
# Highest valid score for each of the two mini tests.
Mini_Test_max_marks = 10
# Highest valid score for the live test.
Live_Test_max_marks = 15
# Highest valid score for the assignment.
Assignment_max_marks = 20

In [None]:
# Treating invalid marks for all Marks_cols

In [None]:
# Maximum attainable marks, keyed by marks column.
marks_mapping = {
    "Mini_Test_1": Mini_Test_max_marks,
    "Mini_Test_2": Mini_Test_max_marks,
    "Live_Test": Live_Test_max_marks,
    "Assignment": Assignment_max_marks,
}

In [None]:
# Replace out-of-range marks with the median of the column's VALID marks.
# The median is taken over in-range rows only, so the invalid values being
# replaced cannot bias the replacement value.
for col, max_marks in marks_mapping.items():
    # A mark is invalid when it is below the minimum or above the column's maximum.
    invalid_rows = (df[col] < Min_marks) | (df[col] > max_marks)
    if not invalid_rows.any():
        continue  # nothing to repair in this column
    valid_median = df.loc[~invalid_rows, col].median()
    df.loc[invalid_rows, col] = valid_median

In [None]:
# Re-count missing values per column after cleaning; every count is expected to be 0.
df.isna().sum()

# Data Transformation

In [None]:
# Row-wise sum of all marks columns gives each student's total marks.
per_student_total = df.loc[:, marks_columns].sum(axis="columns")
df["Total_Marks"] = per_student_total
df.head()

In [None]:
# Maximum attainable total: two mini tests + live test + assignment.
Total_Marks = 2 * Mini_Test_max_marks + Live_Test_max_marks + Assignment_max_marks
# Express each student's total as a percentage of that maximum, to two decimals.
marks_fraction = df["Total_Marks"] / Total_Marks
df["Percentage_Of_Marks"] = (marks_fraction * 100).round(2)
df.head()

In [None]:
# Attendance percentage = sessions attended (sum of the 1/0 flags) over total sessions.
sessions_attended = df[Attendance_columns].sum(axis=1)
session_count = len(Attendance_columns)
df["Attend_Percentage"] = (sessions_attended / session_count) * 100
df.head()

In [None]:
# Weighted score: attendance 40%, each mini test 10%, live test 20%, assignment 20%.
# Each component is first scaled to a 0-1 fraction of its own maximum.
attendance_part = (df["Attend_Percentage"] / 100) * 0.40
mini_test_1_part = (df["Mini_Test_1"] / Mini_Test_max_marks) * 0.10
mini_test_2_part = (df["Mini_Test_2"] / Mini_Test_max_marks) * 0.10
live_test_part = (df["Live_Test"] / Live_Test_max_marks) * 0.20
assignment_part = (df["Assignment"] / Assignment_max_marks) * 0.20

weighted_fraction = (
    attendance_part
    + mini_test_1_part
    + mini_test_2_part
    + live_test_part
    + assignment_part
)
df["Weighted_Percentage"] = (weighted_fraction * 100).round(2)
df.head()

In [None]:
# Performance categories, based on Weighted_Percentage:
#   Excellent          : >= 85
#   Good               : >= 71 and < 85
#   Average            : >= 50 and < 71
#   Needs Improvement  : < 50
# np.select takes the FIRST matching condition, so the thresholds are listed in
# descending order and written as open-ended lower bounds. This leaves no gaps:
# fractional scores such as 84.5 or 70.3 (Weighted_Percentage has two decimals)
# get a proper category instead of falling through to the default.
weighted = df["Weighted_Percentage"]
categories = [
    weighted >= 85,
    weighted >= 71,
    weighted >= 50,
    weighted < 50,
]

choice = ["Excellent", "Good", "Average", "Needs Improvement"]

# The default only applies to rows matching no condition (e.g. a NaN score).
df["Performance_Categories"] = np.select(categories, choice, default="Needs Improvement")
df.head()

In [None]:
# Number of students in each performance category.
category_counts = df["Performance_Categories"].value_counts()
category_counts

# Analysis

In [None]:
# Students whose attendance is below 75% yet whose weighted percentage exceeds 50%.
attendance_below_75 = df["Attend_Percentage"] < 75
weighted_above_50 = df["Weighted_Percentage"] > 50

df.loc[attendance_below_75 & weighted_above_50]

In [None]:
# The three students with the highest percentage of marks, re-indexed from 0.
top3 = df.nlargest(3, "Percentage_Of_Marks").reset_index(drop=True)

# Bar chart of their percentages.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(top3["Name"], top3["Percentage_Of_Marks"], color='skyblue')
ax.set_title("Top 3 Students by Percentage")
ax.set_xlabel("Student")
ax.set_ylabel("Percentage of marks")
plt.show()

# Table view of the same three students.
top3.loc[:, ["Name", "Percentage_Of_Marks"]]

In [None]:
# Pairwise correlation matrix between every marks column and attendance percentage.
impact_col = [
    "Mini_Test_1",
    "Mini_Test_2",
    "Live_Test",
    "Assignment",
    "Attend_Percentage",
]
df.loc[:, impact_col].corr()

### The correlations between attendance percentage and the test/assignment marks are around -0.07, -0.02, -0.01 and -0.04, i.e. all close to zero, so there is no meaningful linear correlation between attendance percentage and the marks.

In [None]:
# Annotated heatmap of the marks/attendance correlation matrix.
correlation_matrix = df[impact_col].corr()

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="magma", ax=ax)
ax.set_title("Impact of Attendance on Tests/Assignment Marks")
plt.show()

In [None]:
# One scatter plot per marks column: attendance percentage (x) against marks (y).
tests = ["Mini_Test_1", "Mini_Test_2", "Live_Test", "Assignment"]

for col in tests:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=df, x="Attend_Percentage", y=col, ax=ax)
    ax.set_title(f"{col} vs Attendance")
    ax.set_xlabel("Attendance (%)")
    ax.set_ylabel(col)
    plt.show()

# Visualization

In [None]:
# Bar chart of weighted percentages for the top 5 students.
top5 = df.nlargest(5, "Weighted_Percentage").reset_index(drop=True)

plt.figure(figsize=(6,4))
plt.bar(top5["Name"], top5["Weighted_Percentage"], color='skyblue')
# Title is derived from the number of rows actually plotted, so it cannot drift
# from the data (the previous hard-coded title said "Top 3" for a top-5 chart).
plt.title(f"Top {len(top5)} Students by Weighted_Percentage")
plt.xlabel("Student")
plt.ylabel("Weighted_Percentage")
plt.show()

In [None]:
# Pie chart: share of students in each performance category.
performance_categories = df["Performance_Categories"].value_counts()

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    performance_categories.values,
    labels=performance_categories.index,
    autopct="%1.1f%%",
    startangle=90,
    shadow=True,
)
ax.set_title("Student_Performance_Analysis")
ax.axis("equal")  # keep the pie circular
fig.tight_layout()
plt.show()

In [None]:
# Grid of box plots (two per row), one per marks column, for identifying outliers.
n_cols = 2
# Ceiling division gives exactly the rows needed; the previous `len // 2 + 1`
# reserved an extra empty row whenever the number of columns was even.
# max(1, ...) keeps plt.subplots valid even if marks_columns is empty.
n_rows = max(1, (len(marks_columns) + n_cols - 1) // n_cols)

fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12,12))
axes = axes.flatten()  # 1-D view makes pairing axes with columns easier

for ax, col in zip(axes, marks_columns):
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")

# Remove any unused subplot (only present when the column count is odd or zero);
# slicing avoids relying on a loop variable leaking out of the loop above.
for unused_ax in axes[len(marks_columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the students whose attendance is below 50%.

low_attendance = df[df["Attend_Percentage"] < 50]

plt.figure(figsize=(20,6))
plt.bar(low_attendance["Name"], low_attendance["Attend_Percentage"])

# The title describes the chart and each axis label describes its own axis
# (previously the title read "Students" and the y-label carried the chart's subject).
plt.title("Students with Attendance Below 50%")
plt.xlabel("Student")
plt.ylabel("Attendance Percentage")
plt.xticks(rotation=85)  # names are long; rotate so they do not overlap
plt.grid(axis="y", linestyle="--", alpha=0.5)
plt.ylim(0,50)  # every plotted value is below 50 by construction

plt.show()

In [None]:
# Compare mean attendance of the 10 best and 10 worst students by weighted percentage.

top_10 = df.nlargest(10, "Weighted_Percentage")
bottom_10 = df.nsmallest(10, "Weighted_Percentage")

# Single-row summary table: one column per group.
average_attendance = {
    "Top Performers": top_10["Attend_Percentage"].mean(),
    "Low Performers": bottom_10["Attend_Percentage"].mean(),
}
comparison = pd.DataFrame(average_attendance, index=["Average Attendance"])

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(comparison.columns, comparison.loc["Average Attendance"])
ax.set_ylabel("Average Attendance Percentage")
ax.set_title("Average Attendance: Top vs Low Performers")
ax.set_ylim(0, 100)
ax.grid(axis="y", alpha=0.4)
plt.show()