In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# 1. Load & Prepare Data
# Reads the integrated paper dataset, drops rows without a quartile label and
# derives the binary training target (`target`: 1 = Q1, 0 = anything else).
print("üìÇ Loading Integrated Data...")
try:
    df = pd.read_csv('chula_papers_with_quality.csv')
except FileNotFoundError:
    print("‚ùå Error: ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå chula_papers_with_quality.csv (‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏£‡∏±‡∏ô Data Integration ‡∏Å‡πà‡∏≠‡∏ô)")
    # Raise SystemExit explicitly instead of calling `exit()`: that helper is
    # meant for interactive shells, and inside an IPython/Jupyter kernel it
    # only requests a shutdown without halting the cell, so the statements
    # below would go on to fail on the undefined `df`. A non-zero status also
    # marks the run as failed when this is executed as a script.
    raise SystemExit(1)

# Rows with no quartile information cannot be labelled, so they are removed.
df = df.dropna(subset=['SJR Best Quartile'])

# Binary target: 1 when the best quartile is exactly 'Q1', otherwise 0.
df['target'] = (df['SJR Best Quartile'] == 'Q1').astype('int64')

def check_inter(country_str):
    """Classify a paper's country string as international ('Yes') or not ('No').

    The value is converted with ``str()`` first. The result is 'No' only when
    the text contains no ';' separator and does contain 'Thailand'; every
    other input yields 'Yes'.

    NOTE(review): a missing value stringifies to 'nan' (or 'None'), which has
    no 'Thailand' in it and is therefore reported as 'Yes' -- confirm that
    this is the intended handling of missing country data.
    """
    countries = str(country_str)
    thai_only = ';' not in countries and 'Thailand' in countries
    return 'No' if thai_only else 'Yes'

# Derive the two categorical features used by the model.
# `is_inter`: 'Yes'/'No' flag computed from the country list of each paper.
df['is_inter'] = df['countries_str'].map(check_inter)

# `primary_subject`: the first ';'-separated entry of the subject-area string,
# with surrounding whitespace removed (values are stringified first).
df['primary_subject'] = [
    str(areas).split(';')[0].strip() for areas in df['subject_areas_str']
]

# Feature matrix (title text + two categoricals) and binary label.
feature_columns = ['title', 'is_inter', 'primary_subject']
X = df[feature_columns]
y = df['target']

# Summary of the class balance before splitting.
q1_count = y.sum()
label_count = len(y)
print(f"‚úÖ Data Prepared: {len(df)} samples")
print(f"   - Q1 Papers: {q1_count} ({q1_count/label_count*100:.1f}%)")
print(f"   - Other Papers: {label_count - q1_count}")

# 80/20 split, stratified on the label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 2. Build AI Pipeline
print("\nüöÄ Building & Training Model...")

# Title text -> TF-IDF over unigrams and bigrams, capped at 3000 features,
# with English stop words removed.
title_tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=3000,
    ngram_range=(1, 2),
)

# Categorical columns -> one-hot; categories unseen at fit time are ignored
# at prediction time instead of raising.
category_encoder = OneHotEncoder(handle_unknown='ignore')

# The 'text' transformer is listed first, so its columns come first in the
# combined feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', title_tfidf, 'title'),
        ('cat', category_encoder, ['is_inter', 'primary_subject']),
    ]
)

# Random forest with balanced class weights and a fixed seed.
forest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

model.fit(X_train, y_train)

# 3. Evaluation
# Score the held-out split and print per-class precision/recall/F1.
print("\nüìä Evaluation Results:")
y_pred = model.predict(X_test)
class_labels = ['Q2-Q4', 'Q1']  # label 0 -> 'Q2-Q4', label 1 -> 'Q1'
report = classification_report(y_test, y_pred, target_names=class_labels)
print(report)

# 4. Save Model
# Persist the whole fitted pipeline (preprocessing + classifier) in one file.
joblib.dump(model, 'q1_predictor_model.joblib')
print("üíæ Model saved to 'q1_predictor_model.joblib'")

# Also persist the sorted list of distinct primary subjects seen in the data.
unique_subjects = sorted(set(df['primary_subject']))
joblib.dump(unique_subjects, 'subject_list.joblib')

üìÇ Loading Integrated Data...
‚úÖ Data Prepared: 18537 samples
   - Q1 Papers: 9680 (52.2%)
   - Other Papers: 8857

üöÄ Building & Training Model...

üìä Evaluation Results:
              precision    recall  f1-score   support

       Q2-Q4       0.68      0.69      0.69      1772
          Q1       0.71      0.70      0.71      1936

    accuracy                           0.70      3708
   macro avg       0.70      0.70      0.70      3708
weighted avg       0.70      0.70      0.70      3708

üíæ Model saved to 'q1_predictor_model.joblib'


['subject_list.joblib']

In [None]:
import pandas as pd
import joblib
import numpy as np

print("üöÄ ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÄ‡∏à‡∏≤‡∏∞‡∏•‡∏∂‡∏Å‡∏™‡∏°‡∏≠‡∏á AI ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ï‡∏≠‡∏ö...")


def check_inter(country_str):
    """Return 'Yes' for an international paper and 'No' otherwise.

    The value is stringified first. 'No' is returned only when the text has
    no ';' separator and contains 'Thailand'; anything else gives 'Yes'.

    NOTE(review): missing values stringify to 'nan' and are therefore
    counted as 'Yes' -- confirm this is intended.
    """
    country_str = str(country_str)
    if ';' in country_str:
        return 'Yes'
    if 'Thailand' not in country_str:
        return 'Yes'
    return 'No'


# Load the fitted pipeline and rebuild the columns this analysis relies on
# (`is_Q1` and `is_inter`).
try:
    # NOTE: joblib files are pickle-based; only load a model file that was
    # produced by a trusted source (here: the training step of this notebook).
    model_pipeline = joblib.load('q1_predictor_model.joblib')
    df = pd.read_csv('chula_papers_with_quality.csv')
    # Papers without a quartile cannot be counted as Q1 or non-Q1.
    df = df.dropna(subset=['SJR Best Quartile'])
    # 1 when the best quartile is exactly 'Q1', otherwise 0.
    df['is_Q1'] = (df['SJR Best Quartile'] == 'Q1').astype('int64')
    df['is_inter'] = df['countries_str'].apply(check_inter)
except Exception as e:
    print(f"‚ùå Error: {e}")
    # Raise SystemExit explicitly instead of calling `exit()`: that helper is
    # meant for interactive shells, and inside an IPython/Jupyter kernel it
    # only requests a shutdown without halting the cell, so the analysis
    # below would still run against missing variables. The non-zero status
    # marks the failure when run as a script; the cause is kept via `from e`.
    raise SystemExit(1) from e


# Section 1: compare the share of Q1 papers between international and
# local-only papers (Collaboration Impact).
print("\n" + "=" * 50)
print("1Ô∏è‚É£ ‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ô‡∏ï‡πà‡∏≤‡∏á‡∏ä‡∏≤‡∏ï‡∏¥‡∏ä‡πà‡∏ß‡∏¢‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ô‡∏î‡πâ‡∏ß‡∏¢‡πÑ‡∏´‡∏°? (Collaboration Impact)")
print("=" * 50)

# Mean of the 0/1 `is_Q1` flag within each group, expressed as a percentage.
inter_q1_rate = df.loc[df['is_inter'] == 'Yes', 'is_Q1'].mean() * 100
local_q1_rate = df.loc[df['is_inter'] == 'No', 'is_Q1'].mean() * 100

print(f"üåç ‡∏á‡∏≤‡∏ô‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏ï‡πà‡∏≤‡∏á‡∏ä‡∏≤‡∏ï‡∏¥‡∏£‡πà‡∏ß‡∏° (International):  ‡πÇ‡∏≠‡∏Å‡∏≤‡∏™‡πÑ‡∏î‡πâ Q1 ‡∏Ñ‡∏∑‡∏≠ {inter_q1_rate:.2f}%")
print(f"üè† ‡∏á‡∏≤‡∏ô‡∏ó‡∏µ‡πà‡∏°‡∏µ‡πÅ‡∏Ñ‡πà‡∏Ñ‡∏ô‡πÑ‡∏ó‡∏¢ (Local):           ‡πÇ‡∏≠‡∏Å‡∏≤‡∏™‡πÑ‡∏î‡πâ Q1 ‡∏Ñ‡∏∑‡∏≠ {local_q1_rate:.2f}%")

# Written as `not (a > b)` rather than `a <= b` so that a NaN rate (an empty
# group) still falls into the fallback message.
if not (inter_q1_rate > local_q1_rate):
    print("üëâ ‡∏™‡∏£‡∏∏‡∏õ: ‡πÑ‡∏°‡πà‡∏ï‡πà‡∏≤‡∏á‡∏Å‡∏±‡∏ô‡∏°‡∏≤‡∏Å")
else:
    # Gap between the two groups, in percentage points.
    diff = inter_q1_rate - local_q1_rate
    print(f"üëâ ‡∏™‡∏£‡∏∏‡∏õ: ‡∏°‡∏µ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ô‡∏ï‡πà‡∏≤‡∏á‡∏ä‡∏≤‡∏ï‡∏¥‡∏ä‡πà‡∏ß‡∏¢ ‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÇ‡∏≠‡∏Å‡∏≤‡∏™‡πÑ‡∏î‡πâ Q1 ‡∏ñ‡∏∂‡∏á +{diff:.2f}%")

# Section 2: list the title terms the random forest relies on most
# (Top Keywords for Q1).
print("\n" + "="*50)
print("2Ô∏è‚É£ ‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏±‡πâ‡∏á‡∏ä‡∏∑‡πà‡∏≠‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡πÅ‡∏ö‡∏ö‡πÑ‡∏´‡∏ô? (Top Keywords for Q1)")
print("="*50)

# Look the fitted TF-IDF vectorizer up by its transformer name ('text')
# rather than by position, so the lookup cannot silently pick another step.
fitted_preprocessor = model_pipeline.named_steps['preprocessor']
vectorizer = fitted_preprocessor.named_transformers_['text']
feature_names = vectorizer.get_feature_names_out()

rf_model = model_pipeline.named_steps['classifier']
importances = rf_model.feature_importances_

# The leading len(feature_names) importances are taken as the text features.
# NOTE(review): this relies on the 'text' transformer being listed first in
# the ColumnTransformer -- confirm against the training code.
text_importances = importances[:len(feature_names)]

# Feature indices ordered from most to least important.
indices = np.argsort(text_importances)[::-1]
top_n = 20

# NOTE(review): forest importances say how much a term is used for splitting,
# not whether it pushes a prediction towards or away from Q1.
print("üî• ‡∏Ñ‡∏≥‡∏®‡∏±‡∏û‡∏ó‡πå 20 ‡∏Ñ‡∏≥‡πÅ‡∏£‡∏Å ‡∏ó‡∏µ‡πà‡∏°‡∏±‡∏Å‡∏à‡∏∞‡∏õ‡∏£‡∏≤‡∏Å‡∏è‡πÉ‡∏ô‡∏á‡∏≤‡∏ô‡∏ß‡∏¥‡∏à‡∏±‡∏¢ Q1 (‡∏ï‡∏≤‡∏°‡∏ô‡πâ‡∏≥‡∏´‡∏ô‡∏±‡∏Å‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç):")
# Slicing keeps this safe when the vocabulary holds fewer than `top_n` terms
# (indexing 0..top_n-1 directly would raise IndexError in that case).
for rank, feature_idx in enumerate(indices[:top_n], start=1):
    print(f"   {rank}. {feature_names[feature_idx]} (Score: {text_importances[feature_idx]:.4f})")

print("\nüí° ‡∏Ñ‡∏≥‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥: ‡∏•‡∏≠‡∏á‡∏ï‡∏±‡πâ‡∏á‡∏ä‡∏∑‡πà‡∏≠‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡πÇ‡∏î‡∏¢‡πÉ‡∏ä‡πâ‡∏Ñ‡∏≥‡πÄ‡∏´‡∏•‡πà‡∏≤‡∏ô‡∏µ‡πâ ‡∏´‡∏£‡∏∑‡∏≠‡πÅ‡∏ô‡∏ß‡∏ó‡∏≤‡∏á‡∏ô‡∏µ‡πâ")

# Section 3: rank subject areas by their share of Q1 papers
# (Top Subjects for Q1).
print("\n" + "="*50)
print("3Ô∏è‚É£ ‡∏≠‡∏¢‡∏π‡πà‡∏ó‡∏µ‡πà‡∏Ñ‡∏ì‡∏∞/‡∏™‡∏≤‡∏Ç‡∏≤‡∏≠‡∏∞‡πÑ‡∏£? (Top Subjects for Q1)")
print("="*50)

# Main subject = first ';'-separated entry of the subject-area string,
# stripped of surrounding whitespace (values are stringified first).
df['main_subject'] = df['subject_areas_str'].apply(lambda x: str(x).split(';')[0].strip())

# Per subject: Q1 share (mean of the 0/1 flag) and number of papers.
subject_stats = df.groupby('main_subject')['is_Q1'].agg(['mean', 'count'])
subject_stats = subject_stats[subject_stats['count'] > 50]  # keep only subjects with more than 50 papers
subject_stats = subject_stats.sort_values(by='mean', ascending=False).head(10)

print("üèÜ 10 ‡∏™‡∏≤‡∏Ç‡∏≤‡∏ß‡∏¥‡∏ä‡∏≤‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô‡∏á‡∏≤‡∏ô Q1 ‡∏™‡∏π‡∏á‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î:")
# Iterate the columns directly instead of using iterrows(): iterrows() packs
# each row into one Series, which upcasts the integer count to float and
# makes it print as e.g. "123.0".
for subject, q1_share, paper_count in zip(
    subject_stats.index, subject_stats['mean'], subject_stats['count']
):
    print(f"   - {subject}: {q1_share*100:.2f}% (‡∏à‡∏≤‡∏Å {int(paper_count)} ‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á)")

üöÄ ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÄ‡∏à‡∏≤‡∏∞‡∏•‡∏∂‡∏Å‡∏™‡∏°‡∏≠‡∏á AI ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ï‡∏≠‡∏ö...

1Ô∏è‚É£ ‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ô‡∏ï‡πà‡∏≤‡∏á‡∏ä‡∏≤‡∏ï‡∏¥‡∏ä‡πà‡∏ß‡∏¢‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ô‡∏î‡πâ‡∏ß‡∏¢‡πÑ‡∏´‡∏°? (Collaboration Impact)
üåç ‡∏á‡∏≤‡∏ô‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏ï‡πà‡∏≤‡∏á‡∏ä‡∏≤‡∏ï‡∏¥‡∏£‡πà‡∏ß‡∏° (International):  ‡πÇ‡∏≠‡∏Å‡∏≤‡∏™‡πÑ‡∏î‡πâ Q1 ‡∏Ñ‡∏∑‡∏≠ 62.76%
üè† ‡∏á‡∏≤‡∏ô‡∏ó‡∏µ‡πà‡∏°‡∏µ‡πÅ‡∏Ñ‡πà‡∏Ñ‡∏ô‡πÑ‡∏ó‡∏¢ (Local):           ‡πÇ‡∏≠‡∏Å‡∏≤‡∏™‡πÑ‡∏î‡πâ Q1 ‡∏Ñ‡∏∑‡∏≠ 42.16%
üëâ ‡∏™‡∏£‡∏∏‡∏õ: ‡∏°‡∏µ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ô‡∏ï‡πà‡∏≤‡∏á‡∏ä‡∏≤‡∏ï‡∏¥‡∏ä‡πà‡∏ß‡∏¢ ‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÇ‡∏≠‡∏Å‡∏≤‡∏™‡πÑ‡∏î‡πâ Q1 ‡∏ñ‡∏∂‡∏á +20.60%

2Ô∏è‚É£ ‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏±‡πâ‡∏á‡∏ä‡∏∑‡πà‡∏≠‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡πÅ‡∏ö‡∏ö‡πÑ‡∏´‡∏ô? (Top Keywords for Q1)
üî• ‡∏Ñ‡∏≥‡∏®‡∏±‡∏û‡∏ó‡πå 20 ‡∏Ñ‡∏≥‡πÅ‡∏£‡∏Å ‡∏ó‡∏µ‡πà‡∏°‡∏±‡∏Å‡∏à‡∏∞‡∏õ‡∏£‡∏≤‡∏Å‡∏è‡πÉ‡∏ô‡∏á‡∏≤‡∏ô‡∏ß‡∏¥‡∏à‡∏±‡∏¢ Q1 (‡∏ï‡∏≤‡∏°‡∏ô‡πâ‡∏≥‡∏´‡∏ô‡∏±‡∏Å‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç):
   1. thailand (Score: 0.0063)
   2. using (Score: 0.0041)
   3. thai (Score: 0.0037)
  