In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import streamlit as st

In [19]:
# Load the cleaned QS ranking CSV into `df`.
# NOTE(review): this is an absolute, machine-specific path — consider resolving it
# from a configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(
    filepath_or_buffer=r'/workspaces/PYTHON_ML/projects/Datasets/qs_worldranking_cleaned.csv',
)

In [20]:
# List the column labels of `df` to see which fields are available for modelling.
df.columns

Index(['RANK_2025', 'RANK_2024', 'Institution_Name', 'Location', 'Region',
       'SIZE', 'FOCUS', 'RES.', 'STATUS', 'Academic_Reputation_Score',
       'Academic_Reputation_Rank', 'Employer_Reputation_Score',
       'Employer_Reputation_Rank', 'Faculty_Student_Score',
       'Faculty_Student_Rank', 'Citations_per_Faculty_Score',
       'Citations_per_Faculty_Rank', 'International_Faculty_Score',
       'International_Faculty_Rank', 'International_Students_Score',
       'International_Students_Rank', 'International_Research_Network_Score',
       'International_Research_Network_Rank', 'Employment_Outcomes_Score',
       'Employment_Outcomes_Rank', 'Sustainability_Score',
       'Sustainability_Rank', 'Overall_Score'],
      dtype='object')

In [21]:
# Training data selection: keep only the rows that have an Overall_Score value,
# since Overall_Score is the prediction target.
df_train = df.loc[df['Overall_Score'].notna()]
df_train.shape

(593, 28)

In [22]:
# Model inputs and target.
# Order matters: a later cell takes `features[:-3]` as the numeric columns, so the
# nine *_Score columns must come first and the three non-score columns last.
# NOTE(review): 'Is_International_Faculty_Missing' is not among the columns printed
# by `df.columns` above — confirm it is created before `features` is used to index.
features = [
    # numeric score columns
    'Academic_Reputation_Score',
    'Employer_Reputation_Score',
    'Faculty_Student_Score',
    'Citations_per_Faculty_Score',
    'International_Faculty_Score',
    'International_Students_Score',
    'International_Research_Network_Score',
    'Employment_Outcomes_Score',
    'Sustainability_Score',
] + [
    # non-score columns
    'Region',
    'SIZE',
    'Is_International_Faculty_Missing',
]

# Column the model is trained to predict.
target = 'Overall_Score'



In [25]:
# Log-transform the numeric score columns.
# log1p(x) = log(1 + x); it is commonly used to reduce right skew and stabilise
# variance so the data is closer to normally distributed.
df_training = df_train.copy()  # work on a copy so df_train keeps the raw scores

# Select the numeric columns by name rather than by position (features[:-3]) so the
# selection stays correct if `features` is reordered or extended.
num_features = [col for col in features if col.endswith('_Score')]

# One vectorised call transforms every selected column at once.
df_training[num_features] = np.log1p(df_training[num_features])

In [24]:
# Shape check on the log-transformed copy: (rows, columns).
df_training.shape

(593, 28)

In [None]:
# Preprocessing step for the modelling pipeline:
#   * 'num'    - standardise the log-transformed score columns
#   * 'region' - one-hot encode Region; a category unseen at fit time encodes as all zeros
#   * 'size'   - ordinal-encode SIZE using the explicit order S < M < L < XL
# drop='first' was removed from the OneHotEncoder: combined with
# handle_unknown='ignore' it makes an unknown region indistinguishable from the
# dropped baseline category (both encode as all zeros), and older scikit-learn
# releases reject that combination outright.
# NOTE(review): columns not listed here are dropped (ColumnTransformer's default is
# remainder='drop'), which includes 'Is_International_Faculty_Missing' from
# `features` — confirm that is intended.
pre_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('region', OneHotEncoder(handle_unknown='ignore'), ['Region']),
        ('size', OrdinalEncoder(categories=[['S', 'M', 'L', 'XL']]), ['SIZE']),
    ]
)