In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the student-marks workbook.
# NOTE(review): this is a hard-coded absolute path; adjust it when running
# outside the environment the notebook was written in.
data = '/content/data_science_student_marks.xlsx'
# Load the workbook into a DataFrame (read_excel reads the first sheet by default).
df = pd.read_excel(data)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(497, 8)

In [4]:
# Preview the first rows to sanity-check how the columns were parsed.
df.head()

Unnamed: 0,student_id,location,age,sql_marks,excel_marks,python_marks,power_bi_marks,english_marks
0,4,Sydney,24,95,99,87,82,75
1,5,Tokyo,24,99,95,89,86,82
2,6,Berlin,22,72,70,99,79,77
3,7,London,23,97,90,74,72,85
4,8,Tokyo,22,91,71,79,80,75


In [5]:
# List the column labels available for analysis.
df.columns

Index(['student_id', 'location', 'age', 'sql_marks', 'excel_marks',
       'python_marks', 'power_bi_marks', 'english_marks'],
      dtype='object')

In [6]:
# Per-column non-null counts and dtypes (info() prints its report directly).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   student_id      497 non-null    int64 
 1   location        497 non-null    object
 2   age             497 non-null    int64 
 3   sql_marks       497 non-null    int64 
 4   excel_marks     497 non-null    int64 
 5   python_marks    497 non-null    int64 
 6   power_bi_marks  497 non-null    int64 
 7   english_marks   497 non-null    int64 
dtypes: int64(7), object(1)
memory usage: 31.2+ KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,student_id,age,sql_marks,excel_marks,python_marks,power_bi_marks,english_marks
count,497.0,497.0,497.0,497.0,497.0,497.0,497.0
mean,252.0,21.380282,84.661972,85.384306,85.38833,84.545272,84.82495
std,143.615807,2.205714,8.745415,8.782497,8.878668,8.903066,9.060479
min,4.0,18.0,70.0,70.0,70.0,70.0,70.0
25%,128.0,20.0,78.0,78.0,77.0,77.0,77.0
50%,252.0,21.0,85.0,86.0,86.0,84.0,85.0
75%,376.0,23.0,92.0,93.0,94.0,92.0,93.0
max,500.0,25.0,100.0,100.0,100.0,100.0,100.0


In [8]:
# Pairwise correlations between age and the five mark columns; only the
# column relating each variable to english_marks is printed.
corr_columns = [
    'age',
    'sql_marks',
    'excel_marks',
    'python_marks',
    'power_bi_marks',
    'english_marks',
]
correlation_matrix = df.loc[:, corr_columns].corr()
english_correlations = correlation_matrix['english_marks']
print("Correlation with english_marks:\n", english_correlations)

Correlation with english_marks:
 age              -0.026927
sql_marks        -0.034080
excel_marks      -0.042605
python_marks     -0.121632
power_bi_marks    0.037626
english_marks     1.000000
Name: english_marks, dtype: float64


In [9]:
# Columns stored with the generic object dtype are treated as categorical.
categorical = [name for name, dtype in df.dtypes.items() if dtype == object]
print(f'There are {len(categorical)} categorical variables\n')
print('The categorical variables are :', categorical)

There are 1 categorical variables

The categorical variables are : ['location']


In [10]:
# Preview the first rows of the categorical columns only.
df[categorical].head()

Unnamed: 0,location
0,Sydney
1,Tokyo
2,Berlin
3,London
4,Tokyo


In [11]:
# Count missing values in each categorical column.
df[categorical].isnull().sum()

Unnamed: 0,0
location,0


In [12]:
# Row-level boolean mask: True wherever a categorical value is missing.
# NOTE(review): this displays the mask for every row of the frame; the
# per-column counts from .isnull().sum() are usually the more readable summary.
df[categorical].isnull()

Unnamed: 0,location
0,False
1,False
2,False
3,False
4,False
...,...
492,False
493,False
494,False
495,False


In [13]:
# Narrow the categorical columns down to those that contain at least one
# missing value, then report the missing-value count for each of them.
cat1 = [column for column in categorical if df[column].isnull().any()]
print(df[cat1].isnull().sum())

Series([], dtype: float64)


In [14]:
# Print the frequency of each distinct value for every categorical column.
for var in categorical:
    category_counts = df[var].value_counts()
    print(category_counts)

location
Tokyo          60
Los Angeles    60
Melbourne      58
New York       57
Toronto        56
Paris          55
Sydney         53
Berlin         52
London         46
Name: count, dtype: int64


In [15]:
# One-hot encode the categorical columns into a new frame; `df` itself is
# left unchanged and the non-categorical columns are carried over as they are.
df_encoded = pd.get_dummies(df, columns=categorical)

In [16]:
import pandas as pd
# --- Build the binary target (y) and the numeric feature matrix (X) ---

# Confirm the target column is numeric and complete before binning it.
print("english_marks data type:", df['english_marks'].dtype)
print("Any missing values in english_marks:", df['english_marks'].isnull().sum())

# Defensive cast: anything that cannot be parsed as a number becomes NaN
# (and would therefore end up as a NaN label in y below).
df['english_marks'] = pd.to_numeric(df['english_marks'], errors='coerce')

# Threshold: Low (<= 75), High (> 75).
# pd.cut builds right-closed intervals, so the bins are [0, 75] and (75, 100]:
# a mark of exactly 75 is labelled 'Low'. `right=True` is spelled out so the
# boundary handling is visible; `include_lowest=True` closes the first bin at 0.
# Marks outside 0-100 fall in no bin and become NaN.
y = pd.cut(
    df['english_marks'],
    bins=[0, 75, 100],
    labels=['Low', 'High'],
    right=True,
    include_lowest=True,
)

# Features: everything in the encoded frame except the identifier, the target,
# and the one-hot location columns. The location dummies are selected by their
# 'location_' prefix instead of a hard-coded list of cities, so the cell keeps
# working (and keeps excluding location) if the set of cities in the data changes.
location_dummies = [col for col in df_encoded.columns if str(col).startswith('location_')]
X = df_encoded.drop(columns=['student_id', 'english_marks'] + location_dummies)

print("Categories in y:", y.value_counts())
print("Sample of y:", y.head().tolist())
print("Features in X:", X.columns.tolist())

english_marks data type: int64
Any missing values in english_marks: 0
Categories in y: english_marks
High    396
Low     101
Name: count, dtype: int64
Sample of y: ['Low', 'High', 'High', 'High', 'Low']
Features in X: ['age', 'sql_marks', 'excel_marks', 'python_marks', 'power_bi_marks']


In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified although the classes in y are
# imbalanced; consider stratify=y if the class ratio should match in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [18]:
# Use Random Forest Classifier with SMOTE and tuned class weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import numpy as np

# --- Sanity checks on the split before any resampling ---
print("y_train categories before training:", y_train.value_counts())
print("y_test categories:", y_test.value_counts())
print("X_train dtypes:", X_train.dtypes)
print("y_train unique values:", y_train.unique())

# --- Oversample the minority class (training split only) ---
# sampling_strategy=1.0 requests a 1:1 minority-to-majority ratio.
smote = SMOTE(sampling_strategy=1.0, random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print("y_train categories after SMOTE:", y_train_smote.value_counts())

# --- Configure and fit the random forest on the resampled data ---
# NOTE(review): the training data is already balanced by SMOTE at this point,
# and class_weight additionally weights 'Low' four times as heavily as 'High'.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight={'Low': 4, 'High': 1},
    random_state=0,
)
print("Model parameters:", rf.get_params())
rf.fit(X_train_smote, y_train_smote)

# --- Evaluate on the untouched test split ---
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# --- Rank the input features by the forest's importance scores ---
feature_names = X.columns
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print("Which features matter most:\n", feature_importance_df.sort_values(by='Importance', ascending=False))

y_train categories before training: english_marks
High    318
Low      79
Name: count, dtype: int64
y_test categories: english_marks
High    78
Low     22
Name: count, dtype: int64
X_train dtypes: age               int64
sql_marks         int64
excel_marks       int64
python_marks      int64
power_bi_marks    int64
dtype: object
y_train unique values: ['High', 'Low']
Categories (2, object): ['Low' < 'High']
y_train categories after SMOTE: english_marks
Low     318
High    318
Name: count, dtype: int64
Model parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': {'Low': 4, 'High': 1}, 'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}
Accuracy: 0.63
              precision    recal

In [19]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Baseline linear model trained on the original (non-resampled) training split;
# class_weight='balanced' compensates for the class imbalance via sample weights.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=0,
).fit(X_train, y_train)

# Score the held-out rows (this reuses the name y_pred from the forest cell).
y_pred = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.66
              precision    recall  f1-score   support

        High       0.88      0.65      0.75        78
         Low       0.36      0.68      0.47        22

    accuracy                           0.66       100
   macro avg       0.62      0.67      0.61       100
weighted avg       0.76      0.66      0.69       100

