<a href="https://colab.research.google.com/github/Parveen-1713/Machinelearning/blob/main/Copy_of_StudentMarkAnalysis_parveen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Step 1: Create a 150-Row Dataset (CSV file)

We can create a synthetic student dataset with random numbers using Python.

Each student will have:

ID, Name, Maths, Science, English, Attendance (%)

We’ll save it as student_dataset.csv.

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the synthetic data is identical on every run
np.random.seed(42)

# Dataset size (number of students)
n = 150

# Identity columns first, then the random columns.
# The randint draws must happen in the order Maths, Science, English,
# Attendance, because they all consume the same seeded random stream.
subjects = ["Maths", "Science", "English"]
data = {
    "ID": range(1, n+1),
    "Name": [f"Student_{i}" for i in range(1, n+1)],
}
for subject in subjects:
    # randint's upper bound is exclusive: marks fall in 30..99
    data[subject] = np.random.randint(30, 100, size=n)
data["Attendance"] = np.random.randint(60, 100, size=n)   # percentage, 60..99

df = pd.DataFrame(data)

# Row-wise mean of the three subject marks
df["Average"] = df[subjects].mean(axis=1)

# A student passes when the average is 50 or more
passed = df["Average"] >= 50
df["Result"] = np.where(passed, "Pass", "Fail")

# Save to CSV (index=False keeps the row index out of the file)
df.to_csv("student_dataset.csv", index=False)

print("CSV File Created: student_dataset.csv")
df.head()

CSV File Created: student_dataset.csv


Unnamed: 0,ID,Name,Maths,Science,English,Attendance,Average,Result
0,1,Student_1,81,68,86,64,78.333333,Pass
1,2,Student_2,44,31,65,94,46.666667,Fail
2,3,Student_3,90,32,74,71,65.333333,Pass
3,4,Student_4,50,85,49,84,61.333333,Pass
4,5,Student_5,53,88,94,80,78.333333,Pass


Step 2: Load Dataset & Explore

In [None]:
# Read the CSV file back into a DataFrame
df = pd.read_csv(filepath_or_buffer="student_dataset.csv")

# Preview the first five rows (5 is head()'s default)
df.head(5)


Unnamed: 0,ID,Name,Maths,Science,English,Attendance,Average,Result
0,1,Student_1,81,68,86,64,78.333333,Pass
1,2,Student_2,44,31,65,94,46.666667,Fail
2,3,Student_3,90,32,74,71,65.333333,Pass
3,4,Student_4,50,85,49,84,61.333333,Pass
4,5,Student_5,53,88,94,80,78.333333,Pass


Basic Pandas operations

# Load dataset (read here as well so this cell can run on its own)
df = pd.read_csv("student_dataset.csv")

# Show first 5 rows.
# A bare `df.head()` is only rendered when it is the last expression of a
# cell; in the middle of a cell its result is silently discarded, so it is
# printed explicitly like the other summaries below.
print(df.head())

# Shape of dataset as (rows, columns)
print("Rows, Columns:", df.shape)

# Column names
print(df.columns)

# Summary statistics of the numeric columns
print(df.describe())

# Number of missing values per column
print(df.isnull().sum())

Numpy usage

In [None]:
import numpy as np

# Mean, standard deviation and variance of the Maths marks via NumPy.
# NumPy's std/var default to ddof=0 (population statistics).
maths_scores = df["Maths"]
print("Maths mean:", np.mean(maths_scores))
print("Maths std:", np.std(maths_scores))
print("Maths variance:", np.var(maths_scores))


NameError: name 'df' is not defined

Step 3: Visualizations with Matplotlib

In [None]:
import matplotlib.pyplot as plt

# Histogram - distribution of Maths marks in 10 bins
plt.hist(df["Maths"], bins=10, edgecolor="black")
plt.title("Maths Score Distribution")
plt.xlabel("Marks")
plt.ylabel("Count")
plt.show()

# Bar chart - Pass vs Fail
# value_counts() orders its result by frequency, so a positional colour list
# would paint "Fail" green whenever failures outnumber passes. Pin the bar
# order so green is always "Pass" and red is always "Fail"; a class that does
# not occur is shown with a count of 0.
result_counts = df["Result"].value_counts().reindex(["Pass", "Fail"], fill_value=0)
result_counts.plot(kind="bar", color=["green", "red"])
plt.title("Pass vs Fail")
plt.xlabel("Result")
plt.ylabel("Number of students")
plt.show()

# Scatter - Maths vs Science (alpha makes overlapping points visible)
plt.scatter(df["Maths"], df["Science"], alpha=0.6)
plt.xlabel("Maths")
plt.ylabel("Science")
plt.title("Maths vs Science")
plt.show()


Step 4: Machine Learning Basics

We’ll try 2 algorithms:

Logistic Regression (Binary classification Pass/Fail)

Decision Tree Classifier



Logistic Regression


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Features: the three subject marks plus attendance
feature_columns = ["Maths", "Science", "English", "Attendance"]
X = df[feature_columns]

# Target: 1 = Pass (average of 50 or more), 0 = Fail
y = np.where(df["Average"] >= 50, 1, 0)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and fit the classifier in one step (fit() returns the estimator)
log_reg = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = log_reg.predict(X_test)

# Overall accuracy followed by the per-class precision/recall report
logreg_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", logreg_accuracy)
print(classification_report(y_test, y_pred))


NameError: name 'df' is not defined

Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Depth-limited tree (max_depth=3) with a fixed random_state
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_dt = dt.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

# Plot tree
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter as a list and reject a pandas Index such as X.columns.
# class_names are given in ascending class order (assumes y uses 0 = Fail,
# 1 = Pass and that both classes occur in the training data).
plt.figure(figsize=(12,6))
tree.plot_tree(dt, feature_names=list(X.columns), class_names=["Fail","Pass"], filled=True)
plt.show()


In [None]:
# Feature Importance
import pandas as pd
# Pair each feature name with the importance assigned by the fitted tree
importance = pd.DataFrame(
    {"Feature": X.columns, "Importance": dt.feature_importances_}
)

# Most important feature first
importance = importance.sort_values("Importance", ascending=False)

print(importance)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Build the confusion matrix with an explicit label order taken from the
# fitted tree. Without `labels=`, confusion_matrix infers the classes from
# y_test / y_pred_dt alone, so a class missing from the test split would
# shrink the matrix and it would no longer match `display_labels` below.
cm = confusion_matrix(y_test, y_pred_dt, labels=dt.classes_)

# Display object; tick labels use the same class order as the matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=dt.classes_)

# Plot it
disp.plot(cmap=plt.cm.Blues)
plt.title("Decision Tree - Confusion Matrix")
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Bar chart of the decision tree's feature importances
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(importance["Feature"], importance["Importance"], color="skyblue")
ax.set_title("Feature Importance - Decision Tree")
ax.set_ylabel("Importance")
plt.show()
