# Importing Files and Libraries


In [9]:
import numpy as np
import pandas as pd
import sklearn

In [5]:
# Attach Google Drive to the Colab runtime at the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Switch the working directory to the project folder on the mounted Drive,
# so relative paths used later in the notebook resolve inside it.
%cd  /content/drive/MyDrive/Succes-O-Meter
# List the folder contents.
%ls

/content/drive/MyDrive/Succes-O-Meter
DataEncoding.ipynb  OriginalStudentPerformance.gsheet  StudentPertformance.gsheet
DataPlot.ipynb	    Preprocessing.ipynb


In [12]:
# Read one tab of a Google Sheet as CSV through the gviz export URL.
# The constants are upper-case and deliberately not called `id` / `name`:
# `id` would shadow the Python builtin for the rest of the session.
SHEET_ID = "1exWpJiEtliFT-5KHFaPtoi-oBT8z5zk92ZD01fc5i5o"
SHEET_NAME = "StudentPerformance"
url = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/gviz/tq?tqx=out:csv&sheet={SHEET_NAME}"

# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
table = pd.read_csv(url)
table

Unnamed: 0,Gender,Nationality,Placeofbirth,Stageid,Gradeid,Sectionid,Topic,Semester,Relation,Raisedhands,Visitedresources,Announcementsview,Discussion,Parentansweringsurvey,Parentschoolsatisfaction,Studentabsencedays,Marks
0,M,Kuwait,Kuwait,PrimarySchool,G-04,A,IT,1,Father,15,16,2,20,Yes,Good,Low,M
1,M,Kuwait,Kuwait,PrimarySchool,G-04,A,IT,1,Father,20,20,3,25,Yes,Good,Low,M
2,M,Kuwait,Kuwait,PrimarySchool,G-04,A,IT,1,Father,10,7,0,30,No,Bad,High,L
3,M,Kuwait,Kuwait,PrimarySchool,G-04,A,IT,1,Father,30,25,5,35,No,Bad,High,L
4,M,Kuwait,Kuwait,PrimarySchool,G-04,A,IT,1,Father,40,50,12,50,No,Bad,High,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,F,Jordan,Jordan,MiddleSchool,G-08,A,Chemistry,2,Father,5,4,5,8,No,Bad,High,L
476,F,Jordan,Jordan,MiddleSchool,G-08,A,Geology,1,Father,50,77,14,28,No,Bad,Low,M
477,F,Jordan,Jordan,MiddleSchool,G-08,A,Geology,2,Father,55,74,25,29,No,Bad,Low,M
478,F,Jordan,Jordan,MiddleSchool,G-08,A,History,1,Father,30,17,14,57,No,Bad,High,L


# Splitting Data


In [67]:
# Separate the target column ("Marks") from the remaining feature columns.
y = table["Marks"]
X = table.drop(columns='Marks')

# Report the dimensions of the feature frame first, then of the label Series.
for part in (X, y):
    print(part.shape)


(480, 16)
(480,)


In [40]:
# Splitting the data into training and test sets (80% train / 20% test)
from sklearn.model_selection import train_test_split

# Fixed seed: without it every run shuffles differently, so the train/test
# membership (and any metric computed from it) changes on Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

# Checking table shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((384, 16), (96, 16), (384,), (96,))

# Encoding


In [70]:
# Show the dtype of every feature column in X.
# NOTE(review): the stored output lists Gradeid / Parentansweringsurvey /
# Parentschoolsatisfaction as int64, i.e. it was captured after the conversion
# cell further down had already run; on a fresh top-to-bottom run this cell
# would presumably report them as object -- confirm and reorder if needed.
X.dtypes


Gender                      object
Nationality                 object
Placeofbirth                object
Stageid                     object
Gradeid                      int64
Sectionid                   object
Topic                       object
Semester                     int64
Relation                    object
Raisedhands                  int64
Visitedresources             int64
Announcementsview            int64
Discussion                   int64
Parentansweringsurvey        int64
Parentschoolsatisfaction     int64
Studentabsencedays          object
dtype: object

In [69]:
 # Extract the integer part from the "Gradeid" column
# The raw values are read from `table`, which no cell mutates, instead of from
# X itself. That keeps this cell re-runnable: applying `.str` to an X column
# that was already converted to integers would raise.
# The pattern is a raw string so `\d` is not treated as a string escape.
X['Gradeid'] = table['Gradeid'].str.extract(r'(\d+)', expand=False).astype(int)

# Convert "Parentansweringsurvey" column from Yes/No to 1/0.
# `map` + `astype(int)` yields a true integer column and raises on any label
# other than Yes/No, rather than leaving a mixed object column behind.
X['Parentansweringsurvey'] = table['Parentansweringsurvey'].map({'Yes': 1, 'No': 0}).astype(int)

# Convert "Parentschoolsatisfaction" column from Good/Bad to 1/0 (same approach).
X['Parentschoolsatisfaction'] = table['Parentschoolsatisfaction'].map({'Good': 1, 'Bad': 0}).astype(int)





In [87]:
# Features to encode (discrete/non-numerical).
# Only the non-numeric columns are selected; these are the dtypes that
# pd.get_dummies converts when no `columns` argument is given, so the encoded
# frame is the same as before while the list now matches its name.
categorical_features = X.select_dtypes(include=['object', 'string', 'category']).columns.tolist()

# One-hot encode the listed columns as 0/1 integers; every other column is
# carried over unchanged.
encodedX = pd.get_dummies(data=X, columns=categorical_features, dtype=int)
encodedX


Unnamed: 0,Gradeid,Semester,Raisedhands,Visitedresources,Announcementsview,Discussion,Parentansweringsurvey,Parentschoolsatisfaction,Gender_F,Gender_M,...,Topic_History,Topic_IT,Topic_Math,Topic_Quran,Topic_Science,Topic_Spanish,Relation_Father,Relation_Mum,Studentabsencedays_High,Studentabsencedays_Low
0,4,1,15,16,2,20,1,1,0,1,...,0,1,0,0,0,0,1,0,0,1
1,4,1,20,20,3,25,1,1,0,1,...,0,1,0,0,0,0,1,0,0,1
2,4,1,10,7,0,30,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
3,4,1,30,25,5,35,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
4,4,1,40,50,12,50,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,8,2,5,4,5,8,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
476,8,1,50,77,14,28,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
477,8,2,55,74,25,29,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
478,8,1,30,17,14,57,0,0,1,0,...,1,0,0,0,0,0,1,0,1,0


In [91]:
# Append the label Series as a trailing "Marks" column (aligned on the row index).
encodedX = encodedX.assign(Marks=y)
encodedX

Unnamed: 0,Gradeid,Semester,Raisedhands,Visitedresources,Announcementsview,Discussion,Parentansweringsurvey,Parentschoolsatisfaction,Gender_F,Gender_M,...,Topic_IT,Topic_Math,Topic_Quran,Topic_Science,Topic_Spanish,Relation_Father,Relation_Mum,Studentabsencedays_High,Studentabsencedays_Low,Marks
0,4,1,15,16,2,20,1,1,0,1,...,1,0,0,0,0,1,0,0,1,M
1,4,1,20,20,3,25,1,1,0,1,...,1,0,0,0,0,1,0,0,1,M
2,4,1,10,7,0,30,0,0,0,1,...,1,0,0,0,0,1,0,1,0,L
3,4,1,30,25,5,35,0,0,0,1,...,1,0,0,0,0,1,0,1,0,L
4,4,1,40,50,12,50,0,0,0,1,...,1,0,0,0,0,1,0,1,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,8,2,5,4,5,8,0,0,1,0,...,0,0,0,0,0,1,0,1,0,L
476,8,1,50,77,14,28,0,0,1,0,...,0,0,0,0,0,1,0,0,1,M
477,8,2,55,74,25,29,0,0,1,0,...,0,0,0,0,0,1,0,0,1,M
478,8,1,30,17,14,57,0,0,1,0,...,0,0,0,0,0,1,0,1,0,L


In [90]:
# Display the label Series (the "Marks" column taken from `table`).
y

0      M
1      M
2      L
3      L
4      M
      ..
475    L
476    M
477    M
478    L
479    L
Name: Marks, Length: 480, dtype: object

In [92]:
# Write the encoded frame to a CSV file in the current working directory,
# leaving out the row index.
encodedX.to_csv(path_or_buf='EncodedStudentPerformance.csv', index=False)

In [76]:
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer

# # Features to encode (discrete/non-numerical)
# categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# # Making encoder
# one_hot = OneHotEncoder(sparse=False)

# # Creating ColumnTransformer using encoder
# transformer = ColumnTransformer([("one_hot",
#                                   one_hot,
#                                   categorical_features)], # columns to transform
#                                   remainder="passthrough") # "passthrough" = leave rest columns unchanged

# # Turn the categorical features into numbers (with sparse=False this returns a dense NumPy array, not a DataFrame)
# transformed_X = transformer.fit_transform(X)
# encoded_column_names = transformer.named_transformers_['one_hot'].get_feature_names_out(input_features=categorical_features)
# encoded_column_names



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,407,408,409,410,411,412,413,414,415,416
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
476,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
477,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
478,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
