In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

url = "student-por.csv"
df = pd.read_csv(url, sep=";")

label_encoder = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].apply(label_encoder.fit_transform)

threshold = 5

# Binary target variable
df['at_risk'] = df['G3'] < threshold

# Drop the original target variables G1, G2, G3
df = df.drop(['G1', 'G2', 'G3'], axis=1)

# Specify the columns to include in the feature set X
selected_columns = ['Medu', 'Fedu', 'schoolsup', 'famsup']
X = df[selected_columns]
y = df['at_risk']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

print("\nClassification Report:")
print(classification_report(y_test, predictions))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))


Accuracy: 0.9769230769230769

Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.99      0.99       128
        True       0.00      0.00      0.00         2

    accuracy                           0.98       130
   macro avg       0.49      0.50      0.49       130
weighted avg       0.97      0.98      0.97       130


Confusion Matrix:
[[127   1]
 [  2   0]]


In [30]:
# Collect input from the user/student
new_data = {'Medu': int(input('Enter mother\'s education (numeric): ')),
            'Fedu': int(input('Enter father\'s education (numeric): ')),
            'schoolsup': input('Does the student receive extra educational support? (yes/no): ').lower(),
            'famsup': input('Does the student receive family support? (yes/no): ').lower()}

# Create a DataFrame from the user input
new_df = pd.DataFrame([new_data])

# Map categorical input to numerical values
for col in categorical_cols:
    if col in new_data:
        new_df[col] = label_encoder.transform([new_data[col]])

# Make predictions on the new data
new_predictions = model.predict(new_df[selected_columns])

# Display the prediction
if new_predictions[0]:
    print('The student is predicted to be at risk.')
else:
    print('The student is predicted not to be at risk.')


Enter mother's education (numeric): 1
Enter father's education (numeric): 5
Does the student receive extra educational support? (yes/no): yes
Does the student receive family support? (yes/no): yes
The student is predicted not to be at risk.
