In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load datasets
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('updated_test_data.csv')

# View the first few rows of each dataset
print("Train Data:")
print(train_data.head())
print("Test Data:")
print(test_data.head())


Train Data:
   Applicant ID  Gender O-Level Maths O-Level English O-Level Science  \
0             1  Female             A               C               C   
1             2  Female             D               D               D   
2             3    Male             B               A               D   
3             4  Female             C               E               B   
4             5  Female             C               A               E   

  A-Level Subj 1 Grade 1    A-Level Subj 2 Grade 2 A-Level Subj 3 Grade 3  \
0      Chemistry       B         Geography       D        Physics       B   
1          Maths       E  Computer Science       A        Biology       B   
2      Economics       A           Biology       D       Business       A   
3       Business       C             Maths       C      Chemistry       B   
4          Maths       B           Biology       E      Economics       B   

          Program 1 Choice               Program 2 Choice  \
0          BCom Accountin

In [2]:
# Convert O-Level grades to Pass/Fail
def o_level_pass_fail(grade):
    return 1 if grade in ['A', 'B', 'C'] else 0

for col in ['O-Level Maths', 'O-Level English', 'O-Level Science']:
    train_data[col] = train_data[col].apply(o_level_pass_fail)
    test_data[col] = test_data[col].apply(o_level_pass_fail)


In [3]:
# Convert A-Level grades to points
grade_to_points = {'A': 5, 'B': 4, 'C': 3, 'D': 2, 'E': 1}

for col in ['Grade 1', 'Grade 2', 'Grade 3']:
    train_data[col] = train_data[col].map(grade_to_points)
    test_data[col] = test_data[col].map(grade_to_points)


In [4]:
# One-hot encode all categorical variables
categorical_columns = ['Gender', 'A-Level Subj 1', 'A-Level Subj 2', 'A-Level Subj 3',
                       'Program 1 Choice', 'Program 2 Choice', 'Program 3 Choice']

train_data_encoded = pd.get_dummies(train_data, columns=categorical_columns)
test_data_encoded = pd.get_dummies(test_data, columns=categorical_columns)

# Ensure both datasets have the same columns
test_data_encoded = test_data_encoded.reindex(columns=train_data_encoded.columns, fill_value=0)


In [5]:
scaler = StandardScaler()
train_data_encoded[['Grade 1', 'Grade 2', 'Grade 3']] = scaler.fit_transform(train_data_encoded[['Grade 1', 'Grade 2', 'Grade 3']])
test_data_encoded[['Grade 1', 'Grade 2', 'Grade 3']] = scaler.transform(test_data_encoded[['Grade 1', 'Grade 2', 'Grade 3']])


In [6]:
# Prepare data for Admission Status prediction
X_admission = train_data_encoded.drop(columns=['Admission Status', 'Admitted Program'])
y_admission = train_data_encoded['Admission Status']

# Split training data for validation
X_train_adm, X_val_adm, y_train_adm, y_val_adm = train_test_split(X_admission, y_admission, test_size=0.3, random_state=42)

# Train Model 1
admission_model = RandomForestClassifier(random_state=42)
admission_model.fit(X_train_adm, y_train_adm)

# Validate Model 1
val_predictions_adm = admission_model.predict(X_val_adm)
print("Model 1 - Admission Status Prediction Accuracy:", accuracy_score(y_val_adm, val_predictions_adm))
print(classification_report(y_val_adm, val_predictions_adm))


Model 1 - Admission Status Prediction Accuracy: 0.8444444444444444
              precision    recall  f1-score   support

          No       0.93      0.63      0.75       337
         Yes       0.82      0.97      0.89       563

    accuracy                           0.84       900
   macro avg       0.87      0.80      0.82       900
weighted avg       0.86      0.84      0.84       900



In [7]:
print(train_data_encoded['Admission Status'].value_counts())


Admission Status
Yes    1907
No     1093
Name: count, dtype: int64


In [8]:
# Ensure Admission Status is correctly encoded
train_data['Admission Status'] = train_data['Admission Status'].map({'Yes': 1, 'No': 0})


In [9]:
train_data_encoded = pd.get_dummies(train_data, columns=['Gender', 'A-Level Subj 1', 'A-Level Subj 2', 'A-Level Subj 3'])


In [10]:
admitted_students = train_data_encoded[train_data_encoded['Admission Status'] == 1]
print(f"Number of admitted students: {len(admitted_students)}")


Number of admitted students: 1907


In [11]:
print(train_data_encoded[['Admission Status', 'Admitted Program']].head(10))


   Admission Status           Admitted Program
0                 1              BSc Chemistry
1                 1       BSc Computer Science
2                 1    BSc Biological Sciences
3                 1    BSc Applied Mathematics
4                 1               BCom Finance
5                 1  BSc Environmental Science
6                 0                        NaN
7                 0                        NaN
8                 1            BSc Mathematics
9                 0                        NaN


In [12]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Ensure all categorical columns are encoded
categorical_columns = ['Gender', 'A-Level Subj 1', 'A-Level Subj 2', 'A-Level Subj 3',
                       'Program 1 Choice', 'Program 2 Choice', 'Program 3 Choice']
train_data_encoded = pd.get_dummies(train_data, columns=categorical_columns)

# Filter admitted students
admitted_students = train_data_encoded[train_data_encoded['Admission Status'] == 1]

# Verify number of admitted students
print(f"Number of admitted students: {len(admitted_students)}")
if len(admitted_students) == 0:
    raise ValueError("No admitted students found in the training dataset!")

# Separate features and target
X_program = admitted_students.drop(columns=['Admission Status', 'Admitted Program'])
y_program = admitted_students['Admitted Program']

# Encode the target variable
label_encoder = LabelEncoder()
y_program_encoded = label_encoder.fit_transform(y_program)

# Split the data into training and validation sets
X_train_prog, X_val_prog, y_train_prog, y_val_prog = train_test_split(X_program, y_program_encoded, test_size=0.3, random_state=42)

# Train Model 2
program_model = RandomForestClassifier(random_state=42)
program_model.fit(X_train_prog, y_train_prog)

# Validate Model 2
val_predictions_prog = program_model.predict(X_val_prog)
print("Model 2 - Program Prediction Accuracy:", accuracy_score(y_val_prog, val_predictions_prog))
print(classification_report(y_val_prog, val_predictions_prog))

# Decode predictions back into program names
decoded_predictions = label_encoder.inverse_transform(val_predictions_prog)
print("Decoded Program Predictions:", decoded_predictions[:10])


Number of admitted students: 1907
Model 2 - Program Prediction Accuracy: 0.7050610820244329
              precision    recall  f1-score   support

           0       0.50      0.64      0.56        22
           1       0.70      0.65      0.67        43
           2       0.72      0.78      0.75        40
           3       0.64      0.72      0.68        25
           4       0.83      0.71      0.77        21
           5       0.76      0.73      0.75        30
           6       0.68      0.67      0.67        42
           7       0.83      0.54      0.66        35
           8       0.66      0.78      0.71        32
           9       0.42      0.47      0.44        17
          10       0.68      0.68      0.68        38
          11       0.58      0.56      0.57        25
          12       0.86      0.59      0.70        41
          13       0.62      0.68      0.65        22
          14       0.80      0.71      0.75        28
          15       0.64      0.80      0.71

In [13]:
print("Training dataset Admission Status distribution:")
print(train_data['Admission Status'].value_counts())


Training dataset Admission Status distribution:
Admission Status
1    1907
0    1093
Name: count, dtype: int64


In [14]:
# Prepare test features (drop Admission Status and Admitted Program from features)
X_test = test_data_encoded.drop(columns=['Admission Status', 'Admitted Program'])
#X_test = test_data_encoded

# Predict Admission Status
admission_predictions = admission_model.predict(X_test)

# Predict Admitted Program for those predicted as admitted
program_predictions = [
    program_model.predict([x])[0] if admission == 1 else "Not Admitted"
    for x, admission in zip(X_test.to_numpy(), admission_predictions)
]

In [15]:
print("Admission Status Predictions (first 10):", admission_predictions[:10])


Admission Status Predictions (first 10): ['Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes']


In [16]:
print(set(admission_predictions))  # Print unique values in admission_predictions


{'Yes', 'No'}


In [17]:
# If predictions are already 'Yes' and 'No', skip mapping
if isinstance(admission_predictions[0], str):
    admission_predictions_mapped = admission_predictions
else:
    # Map numerical predictions to 'Yes' and 'No'
    admission_status_map = {1: 'Yes', 0: 'No'}
    admission_predictions_mapped = [admission_status_map[pred] for pred in admission_predictions]

# Predict Admitted Program for those predicted as admitted
program_predictions = []
for x, admission in zip(X_test.to_numpy(), admission_predictions):
    if admission in [1, 'Yes']:  # Adjust for both numeric and string labels
        program_encoded = program_model.predict([x])[0]
        program_name = label_encoder.inverse_transform([program_encoded])[0]
        program_predictions.append(program_name)
    else:
        program_predictions.append("Not Admitted")

# Add predictions to the test dataset
test_data['Admission Status Prediction'] = admission_predictions_mapped
test_data['Admitted Program Prediction'] = program_predictions

# Select relevant columns for the final output
output_columns = ['Applicant ID', 'Admission Status Prediction', 'Admitted Program Prediction']
output_data = test_data[output_columns]

# Display the formatted results
print("Prediction Results:")
print(output_data.head(10))




Prediction Results:
   Applicant ID Admission Status Prediction    Admitted Program Prediction
0          1001                         Yes                  BSc Geography
1          1002                         Yes                   BCom Finance
2          1003                          No                   Not Admitted
3          1004                         Yes        BSc Disaster Management
4          1005                         Yes                BCom Accounting
5          1006                         Yes           BSc Computer Science
6          1007                          No                   Not Admitted
7          1008                         Yes                  BSc Geography
8          1009                         Yes           BSc Computer Science
9          1010                         Yes  BCom Human Capital Management


In [18]:
# Save the dataset with predictions to a CSV file
output_filename = "test_data_with_predictions.csv"
test_data.to_csv(output_filename, index=False)

print(f"Dataset with predictions saved to '{output_filename}'")


Dataset with predictions saved to 'test_data_with_predictions.csv'
