In [1]:
# Mount Google Drive
# Attaches the user's Drive at /content/drive; the CSV loaded later in this
# notebook is read from a path underneath that mount point.
# NOTE(review): this cell only works inside Google Colab (google.colab is
# not available in a plain Jupyter kernel).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [3]:
# Read the student records from the mounted Drive folder into a DataFrame,
# then echo the untouched table so the missing entries (NaN) can be seen
# before any cleaning step runs.
csv_path = "/content/drive/My Drive/Machine_Learning/student_data.csv"
data = pd.read_csv(csv_path)
print("Original Data:\n", data)

Original Data:
      Name   Age Gender  Marks
0     Anu  20.0      F   85.0
1    Ravi  22.0      M   78.0
2   Priya   NaN      F   92.0
3     Anu  23.0    NaN    NaN
4    Ravi  21.0      M   88.0
5  sakshi   NaN      F   98.0


In [5]:
# Fill the gaps in the numeric columns with each column's mean.
# NOTE(review): the imputer is fitted on the whole table, i.e. before the
# train/test split performed later in this notebook, and 'Marks' is later
# used as the target y -- confirm that imputing it here is intended.
numeric_cols = ['Age', 'Marks']
num_imputer = SimpleImputer(strategy="mean")
data[numeric_cols] = num_imputer.fit_transform(data[numeric_cols])

In [6]:
# Replace missing Gender entries with the most frequent value of the column,
# then print the table so the effect of the imputation can be checked.
cat_imputer = SimpleImputer(strategy="most_frequent")
gender_frame = data[['Gender']]  # 2-D selection: the imputer expects a table, not a Series
data[['Gender']] = cat_imputer.fit_transform(gender_frame)

print("\nAfter Imputation:\n", data)


After Imputation:
      Name   Age Gender  Marks
0     Anu  20.0      F   85.0
1    Ravi  22.0      M   78.0
2   Priya  21.5      F   92.0
3     Anu  23.0      F   88.2
4    Ravi  21.0      M   88.0
5  sakshi  21.5      F   98.0


In [7]:
# Turn the Gender labels into integer codes with LabelEncoder.
# In the printed result F becomes 0 and M becomes 1.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# Gender is used as an input feature further down -- consider whether an
# explicit mapping or a feature encoder would be clearer.
le = LabelEncoder()
gender_codes = le.fit(data['Gender']).transform(data['Gender'])
data['Gender'] = gender_codes
print("\nAfter Encoding Gender:\n", data)


After Encoding Gender:
      Name   Age  Gender  Marks
0     Anu  20.0       0   85.0
1    Ravi  22.0       1   78.0
2   Priya  21.5       0   92.0
3     Anu  23.0       0   88.2
4    Ravi  21.0       1   88.0
5  sakshi  21.5       0   98.0


In [8]:
# Standardise the numeric columns with StandardScaler (each column is
# centred and scaled independently), then print the rescaled table.
# NOTE(review): the scaler is fitted on every row before the train/test
# split that follows, and 'Marks' (the later target y) is scaled together
# with 'Age' -- confirm both are intended.
scaled_cols = ['Age', 'Marks']
scaler = StandardScaler()
data[scaled_cols] = scaler.fit_transform(data[scaled_cols])
print("\nAfter Scaling:\n", data)


After Scaling:
      Name       Age  Gender     Marks
0     Anu -1.643168       0 -0.522790
1    Ravi  0.547723       1 -1.666394
2   Priya  0.000000       0  0.620813
3     Anu  1.643168       0  0.000000
4    Ravi -0.547723       1 -0.032674
5  sakshi  0.000000       0  1.601045


In [9]:
# Separate the table into the target (Marks) and the predictor columns
# (Age, Gender); 'Name' is left out of the features.
y = data['Marks']
feature_cols = ['Age', 'Gender']
X = data[feature_cols]

In [10]:
# Partition features and labels into train and test parts.
# test_size=0.2 asks for a 20% hold-out; random_state=0 fixes the shuffle
# so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Print every partition under its own heading, in the order
# train features, test features, train labels, test labels.
for heading, part in (
    ("\nTraining Features:\n", X_train),
    ("\nTest Features:\n", X_test),
    ("\nTraining Labels:\n", y_train),
    ("\nTest Labels:\n", y_test),
):
    print(heading, part)


Training Features:
         Age  Gender
1  0.547723       1
3  1.643168       0
0 -1.643168       0
4 -0.547723       1

Test Features:
    Age  Gender
5  0.0       0
2  0.0       0

Training Labels:
 1   -1.666394
3    0.000000
0   -0.522790
4   -0.032674
Name: Marks, dtype: float64

Test Labels:
 5    1.601045
2    0.620813
Name: Marks, dtype: float64
