In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Read the cleaned QS world-ranking table.
df = pd.read_csv(r'/workspaces/PYTHON_ML/EDA/datasets/Analysis/qs_worldranking_cleaned.csv')

# Column the model is trained to predict.
target = 'Overall_Score'

# Numeric indicator scores; these are the columns log-transformed below.
num_features = [
    'Academic_Reputation_Score',
    'Employer_Reputation_Score',
    'Faculty_Student_Score',
    'Citations_per_Faculty_Score',
    'International_Faculty_Score',
    'International_Students_Score',
    'International_Research_Network_Score',
    'Employment_Outcomes_Score',
    'Sustainability_Score',
]

# Full model input: the numeric scores followed by the three non-score
# columns. Institution_Name and RANK_2025 are left out to avoid leakage.
features = num_features + ['Region', 'SIZE', 'Is_International_Faculty_Missing']

# Only rows with a known target can be used for supervised training.
df_train = df.loc[df[target].notna()]
print(f"Training data shape: {df_train.shape}")

# Feature engineering: log1p each numeric score to handle skewness.
# assign() returns a new frame, so the raw values in `df` are untouched.
df_train = df_train.assign(
    **{name: np.log1p(df_train[name]) for name in num_features}
)

# Preprocessing: standardise the numeric scores, one-hot encode Region
# (dropping the first category) and map SIZE onto the ordered scale
# S < M < L < XL.
# NOTE(review): 'Is_International_Faculty_Missing' is part of `features` but
# no transformer below selects it, so ColumnTransformer's default
# remainder='drop' discards it before the model sees it -- confirm intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('region', OneHotEncoder(drop='first', handle_unknown='ignore'), ['Region']),
        ('size', OrdinalEncoder(categories=[['S', 'M', 'L', 'XL']]), ['SIZE']),
    ]
)

# Full model: preprocessing followed by a 100-tree random forest with a
# fixed seed.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

# 80/20 hold-out split, stratified on Region.
X, y = df_train[features], df_train[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=X['Region'],
)

# Fit on the training part and report hold-out error and fit quality.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse:.2f}")
print(f"R²: {r2:.2f}")

# Function to predict Overall_Score by university name
def predict_university_score(university_name, dataset, model):
    """Predict Overall_Score for the institution called *university_name*.

    Args:
        university_name: Institution name; matched case-insensitively
            against the ``Institution_Name`` column.
        dataset: DataFrame holding ``Institution_Name`` plus every column in
            the module-level ``features`` list, with the numeric scores on
            their RAW (untransformed) scale.
        model: Fitted estimator/pipeline exposing ``predict``.

    Returns:
        A human-readable message: the predicted score formatted to two
        decimals, or a "not found" notice when no row matches. If several
        rows match, the first one is used.
    """
    name_matches = dataset['Institution_Name'].str.lower() == university_name.lower()
    uni_row = dataset[name_matches]
    if uni_row.empty:
        return f"University '{university_name}' not found in dataset."

    # Explicit copy so the transform below never writes into `dataset`.
    X_uni = uni_row[features].copy()
    # The model is fitted on log1p-transformed scores, so the same transform
    # must be applied here; feeding raw scores gives meaningless predictions.
    X_uni[num_features] = np.log1p(X_uni[num_features])

    score = model.predict(X_uni)[0]
    return f"Predicted Overall_Score for {university_name}: {score:.2f}"

# Example usage
print(predict_university_score('Massachusetts Institute of Technology (MIT)', df, pipeline))
print(predict_university_score('University of Oxford', df, pipeline))

# Predict missing Overall_Score
missing_mask = df['Overall_Score'].isnull()
df_missing = df[missing_mask].copy()
if not df_missing.empty:
    # Build the model input on an explicit copy so the log transform is
    # applied to the prediction input only, not to the rows that get saved.
    X_missing = df_missing[features].copy()
    X_missing[num_features] = np.log1p(X_missing[num_features])
    df_missing['Overall_Score'] = pipeline.predict(X_missing)

    # Combine with the RAW known-score rows from `df`. `df_train` holds
    # log-transformed scores, so concatenating it with `df_missing` would mix
    # two different feature scales in the exported file.
    df_full = pd.concat([df[~missing_mask], df_missing])
    df_full.to_csv('qs_rankings_predicted.csv', index=False)
    print("Predicted dataset saved as 'qs_rankings_predicted.csv'")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
import streamlit as st

# Read the cleaned QS ranking table from the working directory.
df = pd.read_csv('qs_rankings_cleaned.csv')

# Column the model is trained to predict.
target = 'Overall_Score'

# Numeric indicator scores; these are the columns log-transformed below.
num_features = [
    'Academic_Reputation_Score',
    'Employer_Reputation_Score',
    'Faculty_Student_Score',
    'Citations_per_Faculty_Score',
    'International_Faculty_Score',
    'International_Students_Score',
    'International_Research_Network_Score',
    'Employment_Outcomes_Score',
    'Sustainability_Score',
]

# Full model input: the numeric scores followed by the three non-score columns.
features = num_features + ['Region', 'SIZE', 'Is_International_Faculty_Missing']

# Keep only rows with a known target (original note: roughly 40% of the
# data -- not verified here).
df_train = df.loc[df[target].notna()]
print(f"Training data shape: {df_train.shape}")

# Feature engineering: log1p each numeric score. assign() returns a new
# frame, so the raw values in `df` are untouched.
df_train = df_train.assign(
    **{name: np.log1p(df_train[name]) for name in num_features}
)

# Preprocessing: standardise the numeric scores, one-hot encode Region
# (dropping the first category) and map SIZE onto the ordered scale
# S < M < L < XL.
# NOTE(review): 'Is_International_Faculty_Missing' is part of `features` but
# no transformer below selects it, so ColumnTransformer's default
# remainder='drop' discards it before the model sees it -- confirm intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('region', OneHotEncoder(drop='first', handle_unknown='ignore'), ['Region']),
        ('size', OrdinalEncoder(categories=[['S', 'M', 'L', 'XL']]), ['SIZE']),
    ]
)

# Full model: preprocessing followed by a 100-tree random forest with a
# fixed seed.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

# 80/20 hold-out split, stratified on Region.
X, y = df_train[features], df_train[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=X['Region'],
)

# Fit on the training part and report hold-out metrics.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse:.2f}")
print(f"R²: {r2:.2f}")

# 5-fold cross-validated R² over all labelled rows. cross_val_score fits
# clones of the pipeline, so the model fitted above stays as it is.
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
cv_mean, cv_spread = cv_scores.mean(), cv_scores.std()
print(f"Cross-Validation R²: {cv_mean:.2f} ± {cv_spread:.2f}")

# Feature importance
# Labels must follow the ColumnTransformer output order: scaled numeric
# scores, then the one-hot Region columns, then the ordinal SIZE column.
region_encoder = pipeline.named_steps['preprocessor'].named_transformers_['region']
feature_names = (
    list(num_features)
    # get_feature_names_out(['Region']) already yields 'Region_<category>';
    # adding another 'Region_' prefix would produce 'Region_Region_<category>'.
    + list(region_encoder.get_feature_names_out(['Region']))
    + ['SIZE']
)
importances = pipeline.named_steps['model'].feature_importances_
plt.figure(figsize=(10, 6))
sns.barplot(x=importances, y=feature_names)
plt.title('Feature Importance for Overall_Score Prediction')
plt.show()

# Predict missing Overall_Score
missing_mask = df['Overall_Score'].isnull()
df_missing = df[missing_mask].copy()
if not df_missing.empty:
    # Build the model input on an explicit copy so the log transform is
    # applied to the prediction input only, not to the rows that get saved.
    X_missing = df_missing[features].copy()
    X_missing[num_features] = np.log1p(X_missing[num_features])
    df_missing['Overall_Score'] = pipeline.predict(X_missing)

    # Combine with the RAW known-score rows from `df`. `df_train` holds
    # log-transformed scores, so concatenating it with `df_missing` would mix
    # two different feature scales in the exported file.
    df_full = pd.concat([df[~missing_mask], df_missing])
    df_full.to_csv('qs_rankings_predicted.csv', index=False)
    print("Predicted dataset saved as 'qs_rankings_predicted.csv'")

    # Validate distribution: overlay known and predicted target values.
    # The target column is never log-transformed, so df_train is fine here.
    plt.figure(figsize=(8, 5))
    sns.histplot(df_train['Overall_Score'], label='Known Scores', kde=True, color='blue')
    sns.histplot(df_missing['Overall_Score'], label='Predicted Scores', kde=True, color='orange')
    plt.title('Known vs. Predicted Overall_Score Distribution')
    plt.legend()
    plt.show()

# Function for Streamlit app
def predict_university_score(university_name, dataset, model):
    """Predict Overall_Score for the institution called *university_name*.

    Args:
        university_name: Institution name; matched case-insensitively
            against the ``Institution_Name`` column.
        dataset: DataFrame holding ``Institution_Name`` plus every column in
            the module-level ``features`` list, with the numeric scores on
            their RAW (untransformed) scale.
        model: Fitted estimator/pipeline exposing ``predict``.

    Returns:
        A human-readable message: the predicted score formatted to two
        decimals, or a "not found" notice when no row matches. If several
        rows match, the first one is used.
    """
    name_matches = dataset['Institution_Name'].str.lower() == university_name.lower()
    uni_row = dataset[name_matches]
    if uni_row.empty:
        return f"University '{university_name}' not found."

    # Explicit copy: assigning into a frame derived from `dataset` without
    # one triggers SettingWithCopyWarning and relies on implicit copy
    # semantics to keep `dataset` unchanged.
    X_uni = uni_row[features].copy()
    # Same log1p transform that was applied to the training data.
    X_uni[num_features] = np.log1p(X_uni[num_features])

    score = model.predict(X_uni)[0]
    return f"Predicted Overall_Score for {university_name}: {score:.2f}"

# Streamlit app code (save as app.py)
# The generated script is standalone: it loads 'model.pkl' and
# 'qs_rankings_cleaned.csv' from its working directory. It calls np.log1p,
# so it must import numpy itself -- without that import every prediction
# fails with NameError.
with open('app.py', 'w', encoding='utf-8') as f:
    f.write("""
import numpy as np
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline

# Load model and data
with open('model.pkl', 'rb') as file:
    pipeline = pickle.load(file)
df = pd.read_csv('qs_rankings_cleaned.csv')

def predict_university_score(university_name, dataset, model):
    uni_row = dataset[dataset['Institution_Name'].str.lower() == university_name.lower()]
    if uni_row.empty:
        return f"University '{university_name}' not found."
    X_uni = uni_row[['Academic_Reputation_Score', 'Employer_Reputation_Score', 'Faculty_Student_Score',
                     'Citations_per_Faculty_Score', 'International_Faculty_Score', 'International_Students_Score',
                     'International_Research_Network_Score', 'Employment_Outcomes_Score', 'Sustainability_Score',
                     'Region', 'SIZE', 'Is_International_Faculty_Missing']].copy()
    for col in ['Academic_Reputation_Score', 'Employer_Reputation_Score', 'Faculty_Student_Score',
                'Citations_per_Faculty_Score', 'International_Faculty_Score', 'International_Students_Score',
                'International_Research_Network_Score', 'Employment_Outcomes_Score', 'Sustainability_Score']:
        X_uni[col] = np.log1p(X_uni[col])
    score = model.predict(X_uni)[0]
    return f"Predicted Overall_Score for {university_name}: {score:.2f}"

import streamlit as st
st.title('University Popularity Score Predictor')
uni_name = st.text_input('Enter University Name (e.g., Harvard University):')
if st.button('Predict'):
    result = predict_university_score(uni_name, df, pipeline)
    st.write(result)
""")

# Save model
# Persist the fitted pipeline for app.py to load.
# NOTE: pickle files execute code on load; only load a model.pkl that was
# produced by this notebook, never one from an untrusted source.
import pickle
with open('model.pkl', 'wb') as file:
    pickle.dump(pipeline, file)
print("Model saved as 'model.pkl'")