#### Please refer to `Feature_Importance.ipynb` to understand how we derived the total weighted score. That notebook explains how we found the formula used to calculate it.

In [1]:
import os

import pandas as pd

# Location of TOEFL_IELTS_Combined.csv. The original absolute path is kept as
# the default so existing runs behave exactly as before; set the
# ADMITS_DATA_PATH environment variable to load the file from another location.
data_path = os.environ.get(
    'ADMITS_DATA_PATH',
    '/Users/nathanyap/Desktop/DataMining_Project/project/Nathan Findings/TOEFL_IELTS_Combined.csv',
)

# Read the CSV into the DataFrame that the cells below work on.
df_admitsFYI = pd.read_csv(data_path)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Target column and the five predictor columns, both taken from the loaded frame.
y = df_admitsFYI['Status']
feature_frame = df_admitsFYI[['GPA', 'GRE Total', 'TOEFL/IELTS', 'Work Exp', 'Papers']]

# Light cleaning: missing predictor values are replaced by that column's mean.
X = feature_frame.fillna(feature_frame.mean())

# Standardise the predictors.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split below, so the test rows influence the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Hold out 30% of the rows for testing; the remaining 70% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
)

# Build and train a 100-tree random forest with a fixed seed.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and measure accuracy.
# (accuracy_rf is computed here but not displayed by this cell.)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# One row per predictor, ordered from the largest importance to the smallest.
feature_importances = pd.DataFrame(
    {'Importance': rf_model.feature_importances_},
    index=X.columns,
).sort_values(by='Importance', ascending=False)

feature_importances

Unnamed: 0,Importance
GPA,0.490204
GRE Total,0.20645
TOEFL/IELTS,0.186049
Work Exp,0.093192
Papers,0.024105


In [7]:
# With the importances as weights we can collapse the five features into a
# single total score per row, then rank the schools by the average score of
# their rows to get the list of schools and their scores.
#
# NOTE(review): the weights are multiplied with the raw column values, while the
# model that produced them was fitted on standardised features. Columns with
# large magnitudes therefore contribute most of the total - confirm against
# Feature_Importance.ipynb that this is intended.

score_columns = ['GPA', 'GRE Total', 'TOEFL/IELTS', 'Work Exp', 'Papers']
importance = feature_importances['Importance']

# Multiply each feature by its weight and add the products in column order.
weighted_total = df_admitsFYI[score_columns[0]] * importance[score_columns[0]]
for column_name in score_columns[1:]:
    weighted_total = weighted_total + df_admitsFYI[column_name] * importance[column_name]
df_admitsFYI['Weighted_Score'] = weighted_total

# Average weighted score per university, highest average first.
scores_grouped = df_admitsFYI.groupby('University')['Weighted_Score']
average_scores_by_university = scores_grouped.mean().sort_values(ascending=False)

# The first five entries are the top 5 schools.
average_scores_by_university.iloc[:5]

University
ETH Zurich                         70.971380
Harvard University                 70.623165
Columbia University                70.053918
Georgia Institute of Technology    69.960539
Carnegie Mellon University         69.895005
Name: Weighted_Score, dtype: float64

In [8]:
# The series is sorted from highest to lowest average, so its last five entries
# are the bottom 5 schools.
average_scores_by_university.iloc[-5:]

University
Hult International Business School, Boston    62.842727
Lewis University                              62.723028
Gannon University                             62.431742
Indiana Tech                                  62.071799
Governors State University                    61.807253
Name: Weighted_Score, dtype: float64

In [19]:
# Building the ranking system. Say I am a student with these stats:
#   1) GPA: 3.1
#   2) GRE Total: 300
#   3) TOEFL/IELTS: 8.5
#   4) Work Exp: 2
#   5) Papers: 1
# The total score for this profile comes out at about 65.247.
# The system should then recommend 5 schools to apply to whose average weighted
# score is above mine, and 5 schools whose average weighted score is below mine.

# The student's stats, keyed by the same feature names as the weights.
student_details = {
    'GPA': 3.1,
    'GRE Total': 300,
    'TOEFL/IELTS': 8.5,
    'Work Exp': 2,
    'Papers': 1,
}

# Student's weighted score: each stat times its feature importance, summed in
# the order the stats are listed above.
student_weighted_score = sum(
    stat_value * feature_importances.loc[feature_name, 'Importance']
    for feature_name, stat_value in student_details.items()
)

# The 5 universities closest above the student's score (shown highest first)...
is_above = average_scores_by_university > student_weighted_score
schools_above = average_scores_by_university[is_above].nsmallest(5).sort_values(ascending=False)

# ...and the 5 universities closest below it.
is_below = average_scores_by_university < student_weighted_score
schools_below = average_scores_by_university[is_below].nlargest(5)

student_weighted_score

65.24666346339869

In [21]:
# Names of the universities just above the student's score, as a plain list.
list(schools_above.index)

['California State University, Long Beach',
 'New Jersey Institute of Technology',
 'Johnson and Wales University, Providence',
 'Indiana State University',
 'Illinois State University']

In [28]:
# Print the two shortlists as numbered lists.
# The loops walk the index labels directly rather than indexing positions 0-4,
# so a shortlist holding fewer than five schools (e.g. a student whose score is
# above almost every university average) is printed as-is instead of raising an
# IndexError.
print("We believe you should apply to these 5 schools as your Targets")
for rank, school in enumerate(schools_above.index, start=1):
    print(f"{rank}) {school}")
print()
print("On the other hand, we believe you should apply to these schoools as your Safeties")
for rank, school in enumerate(schools_below.index, start=1):
    print(f"{rank}) {school}")

We believe you should apply to these 5 schools as your Targets
1) California State University, Long Beach
2) New Jersey Institute of Technology
3) Johnson and Wales University, Providence
4) Indiana State University
5) Illinois State University

On the other hand, we believe you should apply to these schoools as your Safeties
1) James Cook University, Townsville
2) Kettering University, Flint
3) Florida International University, Miami
4) Florida Atlantic University
5) Grand Valley State University
